List of usage examples for org.apache.lucene.index IndexReader numDocs
public abstract int numDocs();
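Before the project examples below, here is a minimal sketch of calling numDocs() on a freshly opened reader, contrasted with maxDoc() and numDeletedDocs(). It follows the Lucene 4.x style used by most of the snippets on this page; the index path and class name are placeholders:

import java.io.File;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // Open a read-only reader over an existing index (the path is a placeholder).
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")))) {
            System.out.println("live docs:    " + reader.numDocs());        // excludes deleted documents
            System.out.println("max doc:      " + reader.maxDoc());         // includes not-yet-merged deletes
            System.out.println("deleted docs: " + reader.numDeletedDocs()); // maxDoc() - numDocs()
        }
    }
}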
From source file:edu.illinois.cs.cogcomp.bigdata.lucene.LuceneDocIterator.java
License:Open Source License
public LuceneDocIterator(IndexReader reader, Set<String> fieldsToLoad) {
    this.reader = reader;
    this.fieldsToLoad = fieldsToLoad;
    pointer = 0;
    max = reader.numDocs();
}
From source file:edu.mit.ll.vizlinc.highlight.QueryTermExtractor.java
License:Apache License
/**
 * Extracts all terms texts of a given Query into an array of WeightedTerms
 *
 * @param query     Query to extract term texts from
 * @param reader    used to compute IDF which can be used to a) score selected fragments better
 *                  b) use graded highlights eg changing intensity of font color
 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
 * @return an array of the terms used in a query, plus their weights.
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) {
    WeightedTerm[] terms = getTerms(query, false, fieldName);
    int totalNumDocs = reader.numDocs();
    for (int i = 0; i < terms.length; i++) {
        try {
            int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
            // docFreq counts deletes
            if (totalNumDocs < docFreq) {
                docFreq = totalNumDocs;
            }
            // IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            terms[i].weight *= idf;
        } catch (IOException e) {
            // ignore
        }
    }
    return terms;
}
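A short usage sketch for the helper above, assuming a query parsed elsewhere and an open reader; the field name "body" is only illustrative:

// Sketch: reader is an open IndexReader, query was built by a QueryParser; "body" is a placeholder field.
WeightedTerm[] weighted = QueryTermExtractor.getIdfWeightedTerms(query, reader, "body");
for (WeightedTerm wt : weighted) {
    // Each term carries a quasi-IDF weight, e.g. usable to grade highlight intensity.
    System.out.println(wt.getTerm() + " -> " + wt.getWeight());
}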
From source file:edu.mit.ll.vizlinc.highlight.WeightedSpanTermExtractor.java
License:Apache License
/**
 * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
 * Uses a supplied <code>IndexReader</code> to properly weight terms (for gradient highlighting).
 *
 * @param query       that caused hit
 * @param tokenStream of text to be highlighted
 * @param fieldName   restricts Term's used based on field name
 * @param reader      to use for scoring
 * @return Map of WeightedSpanTerms with quasi tf/idf scores
 * @throws IOException
 */
public Map<String, WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream,
        String fieldName, IndexReader reader) throws IOException {
    if (fieldName != null) {
        this.fieldName = StringHelper.intern(fieldName);
    } else {
        this.fieldName = null;
    }
    this.tokenStream = tokenStream;

    Map<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
    extract(query, terms);

    int totalNumDocs = reader.numDocs();
    Set<String> weightedTerms = terms.keySet();
    Iterator<String> it = weightedTerms.iterator();

    try {
        while (it.hasNext()) {
            WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
            int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
            // docFreq counts deletes
            if (totalNumDocs < docFreq) {
                docFreq = totalNumDocs;
            }
            // IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            weightedSpanTerm.weight *= idf;
        }
    } finally {
        closeReaders();
    }
    return terms;
}
From source file:edu.rpi.tw.linkipedia.search.main.helper.ReadIndex.java
License:Open Source License
public static void main(String[] args) {
    try {
        if (args.length < 1) {
            System.out.println("index directory");
            return;
        }
        INDEX_DIR = args[0];
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(INDEX_DIR)));
        IndexSearcher searcher = new IndexSearcher(reader);
        System.out.println(reader.numDocs());
        while (true) {
            BufferedReader in = null;
            String text = "";
            try {
                in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
                text = in.readLine();
            } catch (IOException e1) {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            }
            String[] mytext = text.split("\\|");
            if (mytext.length > 1) {
                text = mytext[0];
            }
            if (text.contains(":")) {
                String[] fiedValue = text.split(":", 2);
                readIndexByTerm(searcher, fiedValue[0], fiedValue[1], mytext[1]); // readIndexByTerm(reader, text);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:edu.stanford.muse.index.Indexer.java
License:Apache License
/**
 * Sets up the indexer just for reading... if needed for writing only, call
 * setupForWrite. If both read & write are needed, call both.
 */
synchronized void setupForRead() {
    log.info("setting up index for read only access");
    long startTime = System.currentTimeMillis();

    //closeHandles();
    try {
        setupDirectory();

        String[] defaultSearchFields, defaultSearchFieldsOriginal;
        String[] defaultSearchFieldSubject = new String[] { "title" }; // for subject only search
        String[] defaultSearchFieldCorrespondents;
        // body field should be there, as the content of the attachment lies in this field; should also include meta field?
        // why the search over en-names and en-names-original when body/body_original is included in the search fields?
        defaultSearchFields = new String[] { "body", "title", "to_names", "from_names", "cc_names", "bcc_names",
                "to_emails", "from_emails", "cc_emails", "bcc_emails" };
        defaultSearchFieldsOriginal = new String[] { "body_original", "title" }; // we want to leave title there because we want to always hit the title -- discussed with Peter June 27 2015
        defaultSearchFieldCorrespondents = new String[] { "to_names", "from_names", "cc_names", "bcc_names",
                "to_emails", "from_emails", "cc_emails", "bcc_emails" };
        // names field added above after email discussion with Sit 6/11/2013. problem is that we're not using the
        // Lucene EnglishPossessiveFilter, so NER will extract the name Stanford University in a sentence like:
        // "This is Stanford University's website."
        // but when the user clicks on the name "Stanford University" in say monthly cards, we
        // will not match the message with this sentence because of the apostrophe.

        // for searching an attachment with fileName
        String[] metaSearchFields = new String[] { "fileName" };

        // Parse a simple query that searches for "text":
        if (parser == null) {
            //parser = new QueryParser(MUSE_LUCENE_VERSION, defaultSearchField, analyzer);
            parser = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFields, analyzer);
            parserOriginal = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFieldsOriginal, analyzer);
            parserSubject = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFieldSubject, analyzer);
            parserCorrespondents = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFieldCorrespondents, analyzer);
            parserMeta = new MultiFieldQueryParser(LUCENE_VERSION, metaSearchFields, new KeywordAnalyzer());
        }

        /**
         * Bunch of gotchas here.
         * It's a bad idea to store Lucene internal docIds, as no assumptions about the internal docIds should be made,
         * not even that they are serial. When searching, Lucene may ignore logically deleted docs.
         * Lucene does not handle deleted docs, and having these docs in search may bring down the search performance by 50%.
         * Deleted docs are cleaned only during merging of indices.
         */
        int numContentDocs = 0, numContentDeletedDocs = 0, numAttachmentDocs = 0, numAttachmentDeletedDocs = 0;
        if (DirectoryReader.indexExists(directory)) {
            DirectoryReader ireader = DirectoryReader.open(directory);
            if (ireader.numDeletedDocs() > 0)
                log.warn("!!!!!!!\nIndex reader has " + ireader.numDocs() + " doc(s) of which "
                        + ireader.numDeletedDocs() + " are deleted)\n!!!!!!!!!!");
            isearcher = new IndexSearcher(ireader);
            contentDocIds = new LinkedHashMap<>();
            numContentDocs = ireader.numDocs();
            numContentDeletedDocs = ireader.numDeletedDocs();
            Bits liveDocs = MultiFields.getLiveDocs(ireader);
            Set<String> fieldsToLoad = new HashSet<>();
            fieldsToLoad.add("docId");
            for (int i = 0; i < ireader.maxDoc(); i++) {
                org.apache.lucene.document.Document doc = ireader.document(i, fieldsToLoad);
                if (liveDocs != null && !liveDocs.get(i))
                    continue;
                if (doc == null || doc.get("docId") == null)
                    continue;
                contentDocIds.put(i, doc.get("docId"));
            }
            log.info("Loaded: " + contentDocIds.size() + " content docs");
        }

        if (DirectoryReader.indexExists(directory_blob)) {
            IndexReader ireader_blob = DirectoryReader.open(directory_blob);
            isearcher_blob = new IndexSearcher(ireader_blob); // read-only=true
            blobDocIds = new LinkedHashMap<Integer, String>();
            numAttachmentDocs = ireader_blob.numDocs();
            numAttachmentDeletedDocs = ireader_blob.numDeletedDocs();
            Bits liveDocs = MultiFields.getLiveDocs(ireader_blob);
            Set<String> fieldsToLoad = new HashSet<String>();
            fieldsToLoad.add("docId");
            for (int i = 0; i < ireader_blob.maxDoc(); i++) {
                org.apache.lucene.document.Document doc = ireader_blob.document(i, fieldsToLoad);
                if (liveDocs != null && !liveDocs.get(i))
                    continue;
                if (doc == null || doc.get("docId") == null)
                    continue;
                blobDocIds.put(i, doc.get("docId"));
            }
            log.info("Loaded: " + blobDocIds.size() + " attachment docs");
        }

        log.warn("Number of content docs: " + numContentDocs + ", number deleted: " + numContentDeletedDocs);
        log.warn("Number of attachment docs: " + numAttachmentDocs + ", number deleted: " + numAttachmentDeletedDocs);

        if (dirNameToDocIdMap == null)
            dirNameToDocIdMap = new LinkedHashMap<String, Map<Integer, String>>();
    } catch (Exception e) {
        Util.print_exception(e, log);
    }
    log.info("Setting up index for read took " + (System.currentTimeMillis() - startTime) + " ms");
}
From source file:edu.stanford.muse.index.Indexer.java
License:Apache License
private synchronized Directory copyDirectoryExcludeFields(Directory dir, String out_basedir, String out_name,
        String... fields_to_be_removed) throws IOException {
    IndexReader reader = DirectoryReader.open(dir); // IndexReader.open(dir, true); // read-only=true
    Directory newDir = createDirectory(out_basedir, out_name);
    IndexWriter writer = openIndexWriter(newDir);
    //log.info("Removing field(s) " + Util.join(fields_to_be_removed, ", ") + " from index.");
    for (int i = 0; i < reader.numDocs(); i++) {
        org.apache.lucene.document.Document doc = reader.document(i);
        for (String field : fields_to_be_removed)
            doc.removeFields(field);
        writer.addDocument(doc);
    }
    writer.close();
    reader.close();
    return newDir;
}
From source file:edu.stanford.muse.index.Indexer.java
License:Apache License
private synchronized Directory copyDirectoryWithDocFilter(Directory dir, String out_basedir, String out_name,
        FilterFunctor filter_func) throws IOException {
    long startTime = System.currentTimeMillis();
    IndexReader reader = DirectoryReader.open(dir); // IndexReader.open(dir, true); // read-only=true
    Directory newDir = createDirectory(out_basedir, out_name);
    IndexWriter writer = openIndexWriter(newDir);
    //log.info("Removing field(s) " + Util.join(fields_to_be_removed, ", ") + " from index.");
    int count = 0;
    for (int i = 0; i < reader.numDocs(); i++) {
        org.apache.lucene.document.Document doc = reader.document(i);
        if (filter_func == null || filter_func.filter(doc)) {
            writer.addDocument(doc);
            count++;
        }
    }
    writer.close();
    reader.close();
    log.info("CopyDirectoryWithtDocFilter to dir:" + out_basedir + " name: " + baseDir + " time: "
            + (System.currentTimeMillis() - startTime) + " ms docs: " + count);
    return newDir;
}
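Note that both copy helpers above use reader.numDocs() as the loop bound and treat the loop index as a document id, which is only safe while the source index contains no deletions (numDocs() then equals maxDoc()). If that assumption does not hold, a deletion-aware loop mirroring the maxDoc()/liveDocs pattern from the setupForRead example earlier would look roughly like this sketch:

// Sketch: copy only live documents, even if the source index has unmerged deletes.
// Uses the Lucene 4.x MultiFields.getLiveDocs API, as in the setupForRead example above.
Bits liveDocs = MultiFields.getLiveDocs(reader);
for (int i = 0; i < reader.maxDoc(); i++) {
    if (liveDocs != null && !liveDocs.get(i))
        continue; // skip documents that were deleted but not yet merged away
    writer.addDocument(reader.document(i));
}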
From source file:edu.umd.umiacs.clip.tools.scor.BM25Scorer.java
License:Apache License
public BM25Scorer(IndexReader ir, String field) {
    super(ir, field);
    k1 = 1.2f;
    b = 0.75f;
    try {
        avgdl = ir.getSumTotalTermFreq(field) / (float) ir.numDocs();
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    cache = new float[(int) (avgdl * 10)];
    for (int i = 0; i < cache.length; i++) {
        cache[i] = k1 * (1 - b + b * (i / avgdl));
    }
}
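The cache precomputes the BM25 length normalization k1 * (1 - b + b * dl / avgdl) for integer document lengths dl, with avgdl derived from getSumTotalTermFreq(field) / numDocs(). The class's own score method is not shown on this page; the sketch below is a generic BM25 per-term contribution that such a cached normalization typically plugs into, not this project's actual code:

// Generic BM25 per-term contribution (sketch, not BM25Scorer's actual score method).
// tf: term frequency in the doc, df: document frequency, numDocs: ir.numDocs(),
// dl: document length, avgdl/k1/b: as computed in the constructor above.
static float bm25Term(float tf, int df, int numDocs, float dl, float avgdl, float k1, float b) {
    float idf = (float) Math.log(1 + (numDocs - df + 0.5) / (df + 0.5));
    float norm = k1 * (1 - b + b * (dl / avgdl)); // what cache[dl] precomputes
    return idf * tf * (k1 + 1) / (tf + norm);
}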
From source file:edu.umd.umiacs.clip.tools.scor.TFIDF.java
License:Apache License
public TFIDF(IndexReader ir, String field) {
    this.ir = ir;
    this.field = field;
    N = ir.numDocs();
}
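Here N = ir.numDocs() is the corpus size used for IDF. A sketch of a classic IDF computation built on it, matching the DefaultSimilarity-style formula from the highlighter examples above (the method is illustrative and assumes org.apache.lucene.index.Term and java.io.IOException are imported):

// Sketch: classic "log(N / (df + 1)) + 1" idf, clamped the same way as the highlighter examples.
float idf(IndexReader ir, String field, String term, int N) throws IOException {
    int df = ir.docFreq(new Term(field, term)); // docFreq may still count deleted docs
    if (N < df)
        df = N;
    return (float) (Math.log((double) N / (df + 1)) + 1.0);
}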
From source file:edu.unika.aifb.graphindex.index.KeywordIndexBuilder.java
License:Open Source License
public void indexKeywords() throws StorageException, IOException {
    File indexDir = idxDirectory.getDirectory(IndexDirectory.KEYWORD_DIR, !resume);
    File valueDir = idxDirectory.getDirectory(IndexDirectory.VALUE_DIR, !resume);

    this.objectProperties = Util.readEdgeSet(idxDirectory.getFile(IndexDirectory.OBJECT_PROPERTIES_FILE));
    this.relations = Util.readEdgeSet(idxDirectory.getTempFile("relations", false));
    this.attributes = Util.readEdgeSet(idxDirectory.getTempFile("attributes", false));

    properties = new HashSet<String>();
    properties.addAll(relations);
    properties.addAll(attributes);

    log.debug("attributes: " + attributes.size() + ", relations: " + relations.size());

    try {
        // HyphenationCompoundWordAnalyzer analyzer = new HyphenationCompoundWordAnalyzer("./res/en_hyph_US.xml", "./res/en_US.dic");
        // DictionaryCompoundWordAnalyzer analyzer = new DictionaryCompoundWordAnalyzer("./res/en_US.dic");
        CapitalizationSplitterAnalyzer analyzer = new CapitalizationSplitterAnalyzer();
        StandardAnalyzer valueAnalyzer = new StandardAnalyzer();

        IndexWriter indexWriter = new IndexWriter(indexDir, analyzer, !resume, new MaxFieldLength(MAXFIELDLENGTH));
        log.debug("max terms per field: " + indexWriter.getMaxFieldLength());

        valueWriter = new IndexWriter(valueDir, valueAnalyzer, !resume, new MaxFieldLength(MAXFIELDLENGTH));

        org.apache.lucene.index.IndexReader reader = null;
        if (resume) {
            reader = org.apache.lucene.index.IndexReader.open(FSDirectory.getDirectory(indexDir), true);
            log.debug("docs: " + reader.numDocs());
        }

        if (!resume) {
            log.info("Indexing concepts");
            indexSchema(indexWriter, idxDirectory.getTempFile("concepts", false), TypeUtil.CONCEPT, CONCEPT_BOOST);
            log.info("Indexing attributes");
            indexSchema(indexWriter, idxDirectory.getTempFile("attributes", false), TypeUtil.ATTRIBUTE, ATTRIBUTE_BOOST);
            log.info("Indexing relations");
            indexSchema(indexWriter, idxDirectory.getTempFile("relations", false), TypeUtil.RELATION, RELATION_BOOST);
        }

        log.info("Indexing entities");
        indexEntity(indexWriter, idxDirectory.getTempFile("entities", false), reader);

        indexWriter.commit();
        valueWriter.commit();

        log.debug("optimizing...");
        indexWriter.optimize();
        valueWriter.optimize();

        indexWriter.close();
        valueWriter.close();

        if (blockSearcher != null)
            blockSearcher.close();

        ns.optimize();
        ns.close();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (DatabaseException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}