List of usage examples for org.apache.lucene.index IndexReader document
public final Document document(int docID) throws IOException
Returns the stored fields of the nth Document in this index.
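Before the source-file examples below, a minimal, self-contained sketch of the basic call pattern. The index path "indexPath" and the stored field name "title" are illustrative assumptions, not taken from any example on this page.

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class DocumentExample {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index directory (assumed path).
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("indexPath")));
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            // document(docID) returns the stored fields of the docID-th document.
            Document doc = reader.document(docID);
            System.out.println(doc.get("title")); // null if "title" was not stored
        }
        reader.close();
    }
}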
From source file: io.anserini.integration.IndexerTest.java
License: Apache License
@Test
public void testIterateThroughDocumentVector() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    int numDocs = reader.numDocs();
    // Iterate through the document vectors.
    for (int i = 0; i < numDocs; i++) {
        System.out.println(reader.document(i));
        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();
        // For this document, iterate through the terms.
        Term term;
        while (te.next() != null) {
            term = new Term("text", te.term());
            long tf = te.totalTermFreq();
            // Print out the term and its term frequency.
            System.out.println(term.bytes().utf8ToString() + " " + tf);
        }
    }
}
From source file: io.anserini.integration.IndexerTest.java
License: Apache License
@Test
public void testIterateThroughDocumentVectorComputeBM25() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    int numDocs = reader.numDocs();
    // Iterate through the document vectors.
    for (int i = 0; i < numDocs; i++) {
        String docid = reader.document(i).getField("docid").stringValue();
        System.out.println(reader.document(i));
        System.out.println(i + ": " + docid);

        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();
        // For this document, iterate through the terms.
        while (te.next() != null) {
            String term = new Term("text", te.term()).bytes().utf8ToString();
            long tf = te.totalTermFreq();
            // The way to compute the BM25 score is to issue a query with the exact docid and the
            // term in question, and look at the retrieval score.
            Query filterQuery = new TermQuery(new Term("docid", docid)); // the docid
            Query termQuery = new TermQuery(new Term("text", term));     // the term
            BooleanQuery.Builder builder = new BooleanQuery.Builder();   // must have both
            builder.add(filterQuery, BooleanClause.Occur.MUST);
            builder.add(termQuery, BooleanClause.Occur.MUST);
            Query finalQuery = builder.build();
            TopDocs rs = searcher.search(finalQuery, 1); // issue the query
            // The BM25 weight is the maxScore.
            System.out.println(term + " " + tf + " " + rs.getMaxScore());
        }
    }
}
From source file: io.datalayer.lucene.delete.LuceneDeleteTest.java
License: Apache License
@Test
public void testDelete() throws IOException {
    IndexWriter writer = AosIndexUtil.newIndexWithDocuments();
    Term term = new Term(ID, "1");
    Query query = new TermQuery(term);

    IndexReader reader = DirectoryReader.open(writer, true);
    IndexSearcher indexSearcher = new IndexSearcher(DirectoryReader.open(writer, true));
    TopDocs topDocs = indexSearcher.search(query, 1);
    LOGGER.info("" + topDocs.scoreDocs[0].doc);
    assertNotNull(reader.document(topDocs.scoreDocs[0].doc));

    LOGGER.info("Deleting documents containing " + term);
    writer.deleteDocuments(term);
    // writer.deleteDocuments(query);
    writer.commit();

    indexSearcher = new IndexSearcher(DirectoryReader.open(writer, true));
    topDocs = indexSearcher.search(query, 1);
    assertEquals(0, topDocs.scoreDocs.length);

    reader.close();
    writer.close();
}
From source file: ir.project.TFIDFMatrix.java
private void createMatrix() {
    try {
        this.matrix = new TFIDFBookVector[numDocs];
        IndexReader reader = DirectoryReader.open(this.index);

        for (int i = 0; i < numDocs; i++) {
            Terms vector = reader.getTermVector(i, "text");

            // get title
            IndexableField titleField = reader.document(i).getField("title");
            String title = titleField.stringValue();

            // get isbn
            IndexableField isbnField = reader.document(i).getField("isbn");
            String isbn = isbnField.stringValue();

            // get author
            IndexableField authorField = reader.document(i).getField("author");
            String author = authorField.stringValue();

            this.matrix[i] = new TFIDFBookVector(numTerms, title, isbn, author);

            if (vector == null) {
                System.err.println("Vector is null");
                continue;
            }

            TermsEnum it = vector.iterator();
            while (it.next() != null) {
                Term t = new Term("text", it.term().utf8ToString());

                // totalTermFreq on a term-vector TermsEnum returns the frequency
                // of the term within this document.
                Long tf = it.totalTermFreq();
                double idf = (double) 1 / (double) reader.totalTermFreq(t);
                double tfIdfWeight = tf * idf;

                // put the TF-IDF weight in the matrix
                int termIndex = this.termMap.get(it.term().utf8ToString());
                this.matrix[i].editValue(termIndex, tfIdfWeight);
            }
        }
        reader.close();
    } catch (IOException ex) {
        Logger.getLogger(TFIDFMatrix.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file: it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java
License: Apache License
/**
 * Loads the map containing the conversion from the Wikipedia ids to the
 * Lucene ids.
 */
protected void parseWikiIdToLuceneId() {
    logger.warn("no index wikiID -> lucene found - I'll generate");
    IndexReader reader = getReader();
    wikiIdToLuceneId = new HashMap<Integer, Integer>(reader.numDocs());
    ProgressLogger pl = new ProgressLogger("creating wiki2lucene, read {} docs", 100000);
    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        pl.up();
        try {
            Document doc = reader.document(i);
            IndexableField f = doc.getField(LUCENE_ARTICLE_ID);
            Integer wikiId = new Integer(f.stringValue());
            wikiIdToLuceneId.put(wikiId, i);
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
From source file: it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java
License: Apache License
private Document getDoc(int wikiId) {
    IndexReader reader = getReader();
    if (wikiId <= 0)
        return null;
    int docId = getLuceneId(wikiId);
    if (docId < 0) {
        logger.warn("no id for wikiId {}", wikiId);
        return null;
    }
    logger.debug("get wikiId {} -> docId {}", wikiId, docId);
    Document doc = null;
    try {
        doc = reader.document(docId);
    } catch (Exception e) {
        logger.error("retrieving doc in position {} {}", docId, e.toString());
        System.exit(-1);
    }
    return doc;
}
From source file: it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java
License: Apache License
public int getWikiId(int luceneId) {
    IndexReader reader = getReader();
    Document doc = null;
    try {
        doc = reader.document(luceneId);
    } catch (Exception e) {
        logger.error("retrieving doc in position {} {}", luceneId, e.toString());
        System.exit(-1);
    }
    return Integer.parseInt(doc.get(LUCENE_ARTICLE_ID));
}
From source file: it.doqui.index.ecmengine.business.personalization.multirepository.index.lucene.RepositoryAwareAbstractLuceneIndexerImpl.java
License: Open Source License
protected Set<String> deleteContainerAndBelow(String nodeRef, IndexReader reader, boolean delete,
        boolean cascade) throws LuceneIndexException {
    Set<String> refs = new LinkedHashSet<String>();
    List<Integer> deletions = new ArrayList<Integer>();
    try {
        if (delete) {
            service.delete(new Term("ID", nodeRef), deltaId, getRepoStorePath());
        }
        refs.add(nodeRef);
        if (cascade) {
            TermDocs td = reader.termDocs(new Term("ANCESTOR", nodeRef));
            while (td.next()) {
                int doc = td.doc();
                Document document = reader.document(doc);
                String[] ids = document.getValues("ID");
                refs.add(ids[ids.length - 1]);
                if (delete) {
                    deletions.add(new Integer(doc));
                    // service.delete(doc, deltaId);
                }
            }
            if (delete) {
                service.delete(deletions, deltaId, getRepoStorePath());
            }
        }
    } catch (IOException e) {
        throw new LuceneIndexException("Failed to delete container and below for " + nodeRef, e);
    }
    return refs;
}
From source file: it.doqui.index.ecmengine.business.personalization.multirepository.index.lucene.RepositoryAwareAbstractLuceneIndexerImpl.java
License: Open Source License
protected Set<String> deletePrimary(Collection<String> nodeRefs, IndexReader reader, boolean delete)
        throws LuceneIndexException {
    Set<String> refs = new LinkedHashSet<String>();
    List<Integer> deletions = new ArrayList<Integer>();
    for (String nodeRef : nodeRefs) {
        try {
            TermDocs td = reader.termDocs(new Term("PRIMARYPARENT", nodeRef));
            while (td.next()) {
                int doc = td.doc();
                Document document = reader.document(doc);
                String[] ids = document.getValues("ID");
                refs.add(ids[ids.length - 1]);
                if (delete) {
                    deletions.add(new Integer(doc));
                }
            }
            if (delete) {
                service.delete(deletions, deltaId, getRepoStorePath());
            }
        } catch (IOException e) {
            throw new LuceneIndexException("Failed to delete node by primary parent for " + nodeRef, e);
        }
    }
    return refs;
}
From source file: it.doqui.index.ecmengine.business.personalization.multirepository.index.lucene.RepositoryAwareAbstractLuceneIndexerImpl.java
License: Open Source License
protected Set<String> deleteReference(Collection<String> nodeRefs, IndexReader reader, boolean delete)
        throws LuceneIndexException {
    Set<String> refs = new LinkedHashSet<String>();
    List<Integer> deletions = new ArrayList<Integer>();
    for (String nodeRef : nodeRefs) {
        try {
            TermDocs td = reader.termDocs(new Term("PARENT", nodeRef));
            while (td.next()) {
                int doc = td.doc();
                Document document = reader.document(doc);
                String[] ids = document.getValues("ID");
                refs.add(ids[ids.length - 1]);
                if (delete) {
                    deletions.add(new Integer(doc));
                    // service.delete(doc, deltaId);
                }
            }
            if (delete) {
                service.delete(deletions, deltaId, getRepoStorePath());
            }
        } catch (IOException e) {
            throw new LuceneIndexException("Failed to delete node by parent for " + nodeRef, e);
        }
    }
    return refs;
}