List of usage examples for org.apache.lucene.index IndexReader document
public final Document document(int docID) throws IOException
Returns the stored fields of the nth Document in this index.
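Before the project examples below, a minimal, self-contained sketch of the call in isolation (not taken from any of the listed projects). It assumes a Lucene 4.x index at a hypothetical path and a hypothetical stored field named "title"; deleted documents are screened out via liveDocs first, since document(int) returns stored fields only and makes no guarantees for deleted documents.

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

public class DocumentUsageSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location; replace with a real index directory.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
        Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index has no deletions
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            if (liveDocs != null && !liveDocs.get(docID)) {
                continue; // skip deleted documents
            }
            Document doc = reader.document(docID); // stored fields only
            System.out.println(doc.get("title")); // "title" is a hypothetical stored field
        }
        reader.close();
    }
}

From source file:com.mathworks.xzheng.advsearching.CategorizerTest.java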
License:Apache License
private void buildCategoryVectors() throws IOException {
    IndexReader reader = IndexReader.open(TestUtil.getBookIndexDirectory());
    int maxDoc = reader.maxDoc();
    Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index has no deletions
    for (int i = 0; i < maxDoc; i++) {
        if (liveDocs == null || liveDocs.get(i)) { // skip deleted documents
            Document doc = reader.document(i);
            String category = doc.get("category");
            Map vectorMap = (Map) categoryMap.get(category);
            if (vectorMap == null) {
                vectorMap = new TreeMap();
                categoryMap.put(category, vectorMap);
            }
            Terms terms = reader.getTermVector(i, "subject");
            addTermFreqToMap(vectorMap, terms);
        }
    }
}
From source file:com.mathworks.xzheng.advsearching.FunctionQueryTest.java
License:Apache License
public void testRecency() throws Throwable {
    Directory dir = TestUtil.getBookIndexDirectory();
    IndexReader r = IndexReader.open(dir);
    IndexSearcher s = new IndexSearcher(r);
    s.setDefaultFieldSortScoring(true, true);

    QueryParser parser = new QueryParser(Version.LUCENE_46, "contents",
            new StandardAnalyzer(Version.LUCENE_46));
    Query q = parser.parse("java in action");      // #A
    Query q2 = new RecencyBoostingQuery(q,         // #B
            2.0, 2 * 365, "pubmonthAsDay");
    Sort sort = new Sort(new SortField[] {
            SortField.FIELD_SCORE,
            new SortField("title2", SortField.STRING) });
    TopDocs hits = s.search(q2, null, 5, sort);

    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Document doc = r.document(hits.scoreDocs[i].doc);
        System.out.println((1 + i) + ": " + doc.get("title")
                + ": pubmonth=" + doc.get("pubmonth")
                + " score=" + hits.scoreDocs[i].score);
    }

    s.close();
    r.close();
    dir.close();
}
From source file:com.mathworks.xzheng.tools.BooksMoreLikeThis.java
License:Apache License
public static void main(String[] args) throws Throwable {
    String indexDir = System.getProperty("index.dir");
    FSDirectory directory = FSDirectory.open(new File(indexDir));
    IndexReader reader = IndexReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);

    int numDocs = reader.maxDoc();

    MoreLikeThis mlt = new MoreLikeThis(reader);             // #A
    mlt.setFieldNames(new String[] { "title", "author" });
    mlt.setMinTermFreq(1);                                   // #B
    mlt.setMinDocFreq(1);

    for (int docID = 0; docID < numDocs; docID++) {          // #C
        System.out.println();
        Document doc = reader.document(docID);
        System.out.println(doc.get("title"));

        Query query = mlt.like(docID);                       // #D
        System.out.println("  query=" + query);

        TopDocs similarDocs = searcher.search(query, 10);
        if (similarDocs.totalHits == 0)
            System.out.println("  None like this");

        for (int i = 0; i < similarDocs.scoreDocs.length; i++) {
            if (similarDocs.scoreDocs[i].doc != docID) {     // #E
                doc = reader.document(similarDocs.scoreDocs[i].doc);
                System.out.println("  -> " + doc.getField("title").stringValue());
            }
        }
    }

    reader.close();
    directory.close();
}
From source file:com.redhat.satellite.search.index.IndexManager.java
License:Open Source License
/**
 * Removes any documents which are not related to the passed-in Set of good values.
 * @param ids Set of ids of all known/good values
 * @param indexName index name to operate on
 * @param uniqField the name of the field in the Document that uniquely identifies
 * this record
 * @return the number of documents deleted
 */
public int deleteRecordsNotInList(Set<String> ids, String indexName, String uniqField) {
    int count = 0;
    IndexReader reader = null;
    try {
        reader = getIndexReader(indexName, IndexHandler.DEFAULT_LANG);
        // Use maxDoc() to iterate over all docs; numDocs() returns the
        // number of currently alive docs, leaving out the deleted ones.
        int maxDoc = reader.maxDoc();
        for (int i = 0; i < maxDoc; i++) {
            if (!reader.isDeleted(i)) {
                Document doc = reader.document(i);
                String uniqId = doc.getField(uniqField).stringValue();
                if (!ids.contains(uniqId)) {
                    log.info(indexName + ":" + uniqField + ": <" + uniqId
                            + "> not found in list of current/good values; "
                            + "assuming this has been deleted from the database and we "
                            + "should remove it.");
                    removeFromIndex(indexName, uniqField, uniqId);
                    count++;
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
        log.info("deleteRecordsNotInList() caught exception : " + e);
    } catch (IndexingException e) {
        e.printStackTrace();
        log.info("deleteRecordsNotInList() caught exception : " + e);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                //
            }
        }
    }
    return count;
}
From source file:com.revorg.goat.Document.java
License:Open Source License
/**
 * Returns the list of fields for a particular Document.
 * @param indexPath Directory that contains the Lucene Collection
 * @throws Exception
 * @return ActionResult
 */
public static List getDocumentFields(String indexPath) {
    // Assign Document to Lucene Document
    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
    try {
        IndexReader reader = IndexReader.open(indexPath);
        doc = reader.document(0);
        reader.close();
        List AllTheFields = doc.getFields();
        return AllTheFields;
    } catch (Exception e) {
        ActionResultError = " caught a " + e.getClass() + " with message: " + e.getMessage();
        System.out.println("Failure on getDocumentFields ");
    }
    ActionResult = "Failure";
    return new LinkedList();
}
From source file:com.searchbox.SuggeterDataStructureBuilder.java
License:Apache License
private void iterateThroughDocuments(SolrIndexSearcher searcher, String[] fields, int maxNumDocs) {
    IndexReader reader = searcher.getIndexReader();

    // WARNING: returns null if there are no deletions
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    maxNumDocs = Math.min(maxNumDocs, reader.maxDoc());
    if (maxNumDocs == -1) {
        maxNumDocs = reader.maxDoc();
    }
    LOGGER.info("Analyzing docs:\t" + numdocs);

    for (int docID = 0; docID < reader.maxDoc(); docID++) {
        if (numdocs > maxNumDocs) {
            break;
        }
        if (liveDocs != null && !liveDocs.get(docID)) {
            continue; // deleted
        }
        if ((docID % 1000) == 0) {
            LOGGER.debug("Doing " + docID + " of " + maxNumDocs);
        }

        StringBuilder text = new StringBuilder();
        for (String field : fields) {
            /*
             * Not sure if this is the best way; it might make sense to
             * process the text of each field individually, but then
             * bookkeeping the doc freq for terms becomes a pain.
             */
            try {
                IndexableField[] multifield = reader.document(docID).getFields(field);
                for (IndexableField singlefield : multifield) {
                    // create one big string from all of the text in the
                    // documents for processing later on
                    text.append(". " + singlefield.stringValue());
                }
            } catch (IOException ex) {
                LOGGER.warn("Document " + docID + " missing requested field (" + field + ")...ignoring");
            }
        }

        // might as well see if it's empty
        if (text.length() > 0) {
            // actually processes the massive string which was created from
            // all of the above fields
            processText(text.toString().toLowerCase());
            numdocs++;
        }
    }

    LOGGER.info("Number of documents analyzed: \t" + numdocs);
    for (int zz = 0; zz < counts.length; zz++) {
        LOGGER.info("Number of " + zz + "-grams: \t" + counts[zz]);
    }
}
From source file:com.searchbox.Tagger.java
License:Apache License
private void DfCountBuilder(SolrIndexSearcher searcher, String[] fields, int maxNumDocs) {
    IndexReader reader = searcher.getIndexReader();

    // WARNING: returns null if there are no deletions
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    maxNumDocs = Math.min(maxNumDocs, reader.maxDoc());
    if (maxNumDocs == -1) {
        maxNumDocs = reader.maxDoc();
    }
    LOGGER.info("Analyzing docs:\t" + numdocs);

    for (int docID = 0; docID < reader.maxDoc(); docID++) {
        if (numdocs > maxNumDocs) {
            break;
        }
        if (liveDocs != null && !liveDocs.get(docID)) {
            continue; // deleted
        }
        if ((docID % 1000) == 0) {
            LOGGER.debug("Doing " + docID + " of " + maxNumDocs);
        }

        StringBuilder text = new StringBuilder();
        for (String field : fields) {
            // Not sure if this is the best way; it might make sense to
            // process the text of each field individually, but then
            // bookkeeping the doc freq for terms becomes a pain.
            try {
                text.append(". " + reader.document(docID).get(field));
            } catch (IOException ex) {
                LOGGER.warn("Document " + docID + " missing requested field (" + field + ")...ignoring");
            }
        }

        if (text.length() > 0) { // might as well see if it's empty
            processDocText(text.toString());
            numdocs++;
        }
    }

    LOGGER.info("Number of documents analyzed: \t" + numdocs);
    dfcounts.put(DOC_COUNTS_STRING, numdocs);
    tfcounts.put(DOC_COUNTS_STRING, numdocs);
}
From source file:com.searchcode.app.service.TimeCodeSearcher.java
/**
 * Only used as a fallback if getByRepoFileName fails for some reason due to what appears to be a Lucene index bug.
 * Using this is problematic because if the index is updated while this method is called it may
 * return an incorrect result. We could add a shared lock between them both, but that's hardly ideal,
 * especially since by the time this is called the index could already be updated.
 */
public CodeResult getById(int documentId) {
    CodeResult codeResult = null;

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(this.INDEXPATH)));
        Document doc = reader.document(documentId);

        String filepath = doc.get(Values.PATH);

        List<String> code = new ArrayList<>();
        try {
            code = Files.readAllLines(Paths.get(filepath), StandardCharsets.UTF_8);
        } catch (Exception ex) {
            LOGGER.warning("Indexed file appears to be binary: " + filepath);
        }

        codeResult = new CodeResult(code, null);
        codeResult.setCodePath(doc.get(Values.FILELOCATIONFILENAME));
        codeResult.setFileName(doc.get(Values.FILENAME));
        codeResult.setLanguageName(doc.get(Values.LANGUAGENAME));
        codeResult.setMd5hash(doc.get(Values.MD5HASH));
        codeResult.setCodeLines(doc.get(Values.CODELINES));
        codeResult.setDocumentId(documentId);
        codeResult.setRepoName(doc.get(Values.REPONAME));
        codeResult.setRepoLocation(doc.get(Values.REPOLOCATION));
        codeResult.setCodeOwner(doc.get(Values.CODEOWNER));

        reader.close();
    } catch (Exception ex) {
        LOGGER.warning(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage());
    }

    return codeResult;
}
From source file:com.sun.socialsite.business.impl.LuceneSearchManagerImpl.java
License:Open Source License
/**
 * @return false if the index entry was not updated because it
 * was already current; true otherwise.
 */
public boolean addToIndex(final App app) throws IOException {

    boolean needNewEntry = true;

    String key = getKey(app);
    String url = app.getURL().toExternalForm();
    String title = app.getTitle();
    String description = app.getDescription();

    IndexReader reader = IndexReader.open(indexDir);
    TermDocs termDocs = reader.termDocs(new Term("key", key));
    while (termDocs.next()) {
        Document existingDoc = reader.document(termDocs.doc());
        if (areEqual("app", existingDoc.get("class"))
                && areEqual(url, existingDoc.get("url"))
                && areEqual(title, existingDoc.get("title"))
                && areEqual(description, existingDoc.get("description"))) {
            needNewEntry = false;
        }
    }
    termDocs.close();
    reader.close();

    if (needNewEntry) {
        Document newDoc = new Document();
        newDoc.add(new Field("key", key, Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("class", "app", Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("url", url, Field.Store.YES, Field.Index.TOKENIZED));
        if (title != null)
            newDoc.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
        if (description != null)
            newDoc.add(new Field("description", description, Field.Store.YES, Field.Index.TOKENIZED));
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(indexDir, analyzer, false);
            writer.deleteDocuments(new Term("key", key)); // Delete old entry, if present
            writer.addDocument(newDoc);
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception e) {
                }
            }
        }
        log.trace(String.format("Indexed app[url=%s,title=%s,description=%s]", url, title, description));
    }

    return needNewEntry;
}
From source file:com.sun.socialsite.business.impl.LuceneSearchManagerImpl.java
License:Open Source License
/**
 * @return false if the index entry was not updated because it
 * was already current; true otherwise.
 */
public boolean addToIndex(final Group group) throws IOException {

    boolean needNewEntry = true;

    String key = getKey(group);
    String handle = group.getHandle();
    String name = group.getName();
    String description = group.getDescription();

    IndexReader reader = IndexReader.open(indexDir);
    TermDocs termDocs = reader.termDocs(new Term("key", key));
    while (termDocs.next()) {
        Document existingDoc = reader.document(termDocs.doc());
        if (areEqual("group", existingDoc.get("class"))
                && areEqual(handle, existingDoc.get("handle"))
                && areEqual(name, existingDoc.get("name"))
                && areEqual(description, existingDoc.get("description"))) {
            needNewEntry = false;
        }
    }
    termDocs.close();
    reader.close();

    if (needNewEntry) {
        Document newDoc = new Document();
        newDoc.add(new Field("key", key, Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("class", "group", Field.Store.YES, Field.Index.UN_TOKENIZED));
        newDoc.add(new Field("handle", handle, Field.Store.YES, Field.Index.TOKENIZED));
        newDoc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
        if (description != null)
            newDoc.add(new Field("description", description, Field.Store.YES, Field.Index.TOKENIZED));
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(indexDir, analyzer, false);
            writer.deleteDocuments(new Term("key", key)); // Delete old entry, if present
            writer.addDocument(newDoc);
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception e) {
                }
            }
        }
        // Argument order fixed to match the format string (handle, then name).
        log.trace(String.format("Indexed group[handle=%s,name=%s,description=%s]", handle, name, description));
    }

    return needNewEntry;
}