Usage examples for the org.apache.lucene.search.TotalHitCountCollector constructor (new TotalHitCountCollector()).
From source file:SimpleNaiveBayesClassifier.java
License:Apache License
/** * count the number of documents in the index having at least a value for the 'class' field * * @return the no. of documents having a value for the 'class' field * @throws IOException if accessing to term vectors or search fails *//*w ww . j av a2s. co m*/ protected int countDocsWithClass() throws IOException { int docCount = MultiFields.getTerms(this.leafReader, this.classFieldName).getDocCount(); if (docCount == -1) { // in case codec doesn't support getDocCount TotalHitCountCollector classQueryCountCollector = new TotalHitCountCollector(); BooleanQuery.Builder q = new BooleanQuery.Builder(); q.add(new BooleanClause( new WildcardQuery(new Term(classFieldName, String.valueOf(WildcardQuery.WILDCARD_STRING))), BooleanClause.Occur.MUST)); if (query != null) { q.add(query, BooleanClause.Occur.MUST); } indexSearcher.search(q.build(), classQueryCountCollector); docCount = classQueryCountCollector.getTotalHits(); } return docCount; }
From source file:SimpleNaiveBayesClassifier.java
License:Apache License
/**
 * Counts the documents of the given class (optionally restricted by the configured
 * sub-query) that contain {@code word} in at least one of the configured text fields.
 *
 * @param word the token produced by the analyzer
 * @param term the term representing the class
 * @return the number of matching documents of the input class
 * @throws IOException if a low-level I/O problem happens
 */
private int getWordFreqForClass(String word, Term term) throws IOException {
    // "word appears in any text field" — a disjunction over all configured fields
    BooleanQuery.Builder anyFieldContainsWord = new BooleanQuery.Builder();
    for (String field : textFieldNames) {
        anyFieldContainsWord
                .add(new BooleanClause(new TermQuery(new Term(field, word)), BooleanClause.Occur.SHOULD));
    }
    // Conjunction: the word clause AND the class term AND (optionally) the sub-set query
    BooleanQuery.Builder countQuery = new BooleanQuery.Builder();
    countQuery.add(new BooleanClause(anyFieldContainsWord.build(), BooleanClause.Occur.MUST));
    countQuery.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.MUST));
    if (query != null) {
        countQuery.add(query, BooleanClause.Occur.MUST);
    }
    TotalHitCountCollector hitCounter = new TotalHitCountCollector();
    indexSearcher.search(countQuery.build(), hitCounter);
    return hitCounter.getTotalHits();
}
From source file:IndexAndSearchOpenStreetMaps1D.java
License:Apache License
/**
 * Benchmark: runs a grid of 1-D latitude range queries over an OpenStreetMap index
 * (London bounding box) and prints per-iteration wall time plus total hit counts.
 * Counting is done with TotalHitCountCollector so no documents are actually loaded.
 */
private static void queryIndex() throws IOException {
    Directory dir = FSDirectory.open(Paths.get("/l/tmp/1dkd" + (USE_NF ? "_nf" : "")));
    System.out.println("DIR: " + dir);
    IndexReader r = DirectoryReader.open(dir);
    System.out.println("maxDoc=" + r.maxDoc());
    IndexSearcher s = new IndexSearcher(r);
    //System.out.println("reader MB heap=" + (reader.ramBytesUsed()/1024/1024.));

    // London, UK bounding box; STEPS controls the query-grid density.
    int STEPS = 5;
    double MIN_LAT = 51.0919106;
    double MAX_LAT = 51.6542719;
    double MIN_LON = -0.3867282;
    double MAX_LON = 0.8492337;
    // NOTE(review): scratch1/scratch2 are unused in this method.
    byte[] scratch1 = new byte[4];
    byte[] scratch2 = new byte[4];
    for (int iter = 0; iter < 100; iter++) {
        long tStart = System.nanoTime();
        long totHits = 0;
        int queryCount = 0;
        // Enumerate every (start, end) pair of grid steps in both dimensions.
        for (int latStep = 0; latStep < STEPS; latStep++) {
            double lat = MIN_LAT + latStep * (MAX_LAT - MIN_LAT) / STEPS;
            for (int lonStep = 0; lonStep < STEPS; lonStep++) {
                double lon = MIN_LON + lonStep * (MAX_LON - MIN_LON) / STEPS;
                for (int latStepEnd = latStep + 1; latStepEnd <= STEPS; latStepEnd++) {
                    double latEnd = MIN_LAT + latStepEnd * (MAX_LAT - MIN_LAT) / STEPS;
                    for (int lonStepEnd = lonStep + 1; lonStepEnd <= STEPS; lonStepEnd++) {
                        double lonEnd = MIN_LON + lonStepEnd * (MAX_LON - MIN_LON) / STEPS;
                        Query q;
                        // 1-D test: only latitude is queried; lon/lonEnd just drive the
                        // loop count. Degrees are scaled to micro-degree ints.
                        if (USE_NF) {
                            q = LegacyNumericRangeQuery.newIntRange("latnum", (int) (1000000. * lat),
                                    (int) (1000000. * latEnd), true, true);
                        } else {
                            q = IntPoint.newRangeQuery("lat", (int) (1000000. * lat),
                                    (int) (1000000. * latEnd));
                        }
                        TotalHitCountCollector c = new TotalHitCountCollector();
                        //long t0 = System.nanoTime();
                        s.search(q, c);
                        //System.out.println("\nITER: now query lat=" + lat + " latEnd=" + latEnd + " lon=" + lon + " lonEnd=" + lonEnd);
                        //Bits hits = reader.intersect(lat, latEnd, lon, lonEnd);
                        //System.out.println("  total hits: " + hitCount);
                        //totHits += ((FixedBitSet) hits).cardinality();
                        //System.out.println("  add tot " + c.getTotalHits());
                        totHits += c.getTotalHits();
                        queryCount++;
                    }
                }
            }
        }
        long tEnd = System.nanoTime();
        System.out.println("ITER: " + iter + " " + ((tEnd - tStart) / 1000000000.0) + " sec; totHits="
                + totHits + "; " + queryCount + " queries");
        // First iteration only: report the heap footprint of each segment reader.
        if (iter == 0) {
            long bytes = 0;
            for (LeafReaderContext ctx : r.leaves()) {
                CodecReader cr = (CodecReader) ctx.reader();
                System.out.println(Accountables.toString(cr));
                bytes += cr.ramBytesUsed();
            }
            System.out.println("READER MB: " + (bytes / 1024. / 1024.));
            System.out.println("RAM: " + Accountables.toString((Accountable) r.leaves().get(0).reader()));
        }
    }
    IOUtils.close(r, dir);
}
From source file:SimpleNaiveBayesDocumentClassifier.java
License:Apache License
/** * Returns the number of documents of the input class ( from the whole index or from a subset) * that contains the word ( in a specific field or in all the fields if no one selected) * * @param word the token produced by the analyzer * @param fieldName the field the word is coming from * @param term the class term/*from ww w. j a v a 2 s. co m*/ * @return number of documents of the input class * @throws java.io.IOException If there is a low-level I/O error */ private int getWordFreqForClass(String word, String fieldName, Term term) throws IOException { BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder(); BooleanQuery.Builder subQuery = new BooleanQuery.Builder(); subQuery.add(new BooleanClause(new TermQuery(new Term(fieldName, word)), BooleanClause.Occur.SHOULD)); booleanQuery.add(new BooleanClause(subQuery.build(), BooleanClause.Occur.MUST)); booleanQuery.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.MUST)); if (query != null) { booleanQuery.add(query, BooleanClause.Occur.MUST); } TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector(); indexSearcher.search(booleanQuery.build(), totalHitCountCollector); return totalHitCountCollector.getTotalHits(); }
From source file:com.b2international.index.lucene.SearchWarmerFactory.java
License:Apache License
@Override public IndexSearcher newSearcher(IndexReader reader, IndexReader previousReader) throws IOException { IndexSearcher searcher = super.newSearcher(reader, previousReader); // TODO: experiment with different queries (MatchAllDocs, a set of "typical" queries, etc.) final BooleanQuery.Builder query = new BooleanQuery.Builder(); query.add(new TermQuery(new Term(EMPTY_STRING, EMPTY_STRING)), Occur.MUST); searcher.search(query.build(), new TotalHitCountCollector()); return searcher; }
From source file:com.b2international.snowowl.snomed.api.impl.ClassificationRunIndex.java
License:Apache License
/**
 * Marks every non-finished classification run (completed, running, saving, or scheduled)
 * as STALE, persisting the change in a single commit.
 *
 * @throws IOException if searching or updating the index fails
 */
public void invalidateClassificationRuns() throws IOException {
    // Any of these statuses means the run is a candidate for invalidation.
    final Query candidateStatuses = Fields.newQuery().field(FIELD_STATUS, ClassificationStatus.COMPLETED.name())
            .field(FIELD_STATUS, ClassificationStatus.RUNNING.name())
            .field(FIELD_STATUS, ClassificationStatus.SAVING_IN_PROGRESS.name())
            .field(FIELD_STATUS, ClassificationStatus.SCHEDULED.name()).matchAny();
    final Query runQuery = Fields.newQuery().field(FIELD_CLASS, ClassificationRun.class.getSimpleName())
            .and(candidateStatuses).matchAll();
    IndexSearcher acquiredSearcher = null;
    try {
        acquiredSearcher = manager.acquire();
        // Count first so the follow-up search asks for exactly the right number of docs.
        final TotalHitCountCollector hitCounter = new TotalHitCountCollector();
        acquiredSearcher.search(runQuery, hitCounter);
        final int totalHits = hitCounter.getTotalHits();
        final int docsToRetrieve = Ints.min(acquiredSearcher.getIndexReader().maxDoc(), totalHits);
        if (docsToRetrieve < 1) {
            return; // nothing to invalidate
        }
        final TopDocs docs = acquiredSearcher.search(runQuery, docsToRetrieve, Sort.INDEXORDER, false, false);
        final ObjectReader runReader = objectMapper.reader(ClassificationRun.class);
        for (final ScoreDoc scoreDoc : docs.scoreDocs) {
            final Document sourceDocument = acquiredSearcher.doc(scoreDoc.doc,
                    ImmutableSet.of(FIELD_BRANCH_PATH, FIELD_SOURCE));
            final String branchPath = sourceDocument.get(FIELD_BRANCH_PATH);
            final String source = sourceDocument.get(FIELD_SOURCE);
            final ClassificationRun run = runReader.readValue(source);
            run.setStatus(ClassificationStatus.STALE);
            upsertClassificationRunNoCommit(branchPath, run);
        }
        commit();
    } finally {
        if (null != acquiredSearcher) {
            manager.release(acquiredSearcher);
        }
    }
}
From source file:com.b2international.snowowl.snomed.api.impl.ClassificationRunIndex.java
License:Apache License
private <T> List<T> search(final Query query, final Class<? extends T> sourceClass, Sort sort, final int offset, final int limit) throws IOException { IndexSearcher searcher = null;//from w w w . ja v a2 s. c o m try { searcher = manager.acquire(); final TotalHitCountCollector collector = new TotalHitCountCollector(); searcher.search(query, collector); final int totalHits = collector.getTotalHits(); final int saturatedSum = Ints.saturatedCast((long) offset + limit); final int docsToRetrieve = Ints.min(saturatedSum, searcher.getIndexReader().maxDoc(), totalHits); final ImmutableList.Builder<T> resultBuilder = ImmutableList.builder(); if (docsToRetrieve < 1) { return resultBuilder.build(); } final TopDocs docs = searcher.search(query, docsToRetrieve, sort, false, false); final ScoreDoc[] scoreDocs = docs.scoreDocs; final ObjectReader reader = objectMapper.reader(sourceClass); for (int i = offset; i < docsToRetrieve && i < scoreDocs.length; i++) { final Document sourceDocument = searcher.doc(scoreDocs[i].doc, ImmutableSet.of(FIELD_SOURCE)); final String source = sourceDocument.get(FIELD_SOURCE); final T deserializedSource = reader.readValue(source); resultBuilder.add(deserializedSource); } return resultBuilder.build(); } finally { if (null != searcher) { manager.release(searcher); } } }
From source file:com.b2international.snowowl.snomed.api.impl.ClassificationRunIndex.java
License:Apache License
private int getHitCount(final Query query) throws IOException { IndexSearcher searcher = null;//from ww w . j a va2s. co m try { searcher = manager.acquire(); final TotalHitCountCollector collector = new TotalHitCountCollector(); searcher.search(query, collector); return collector.getTotalHits(); } finally { if (null != searcher) { manager.release(searcher); } } }
From source file:com.hrstc.lucene.queryexpansion.PatentClassCodeBasedQueryExpansion.java
License:Apache License
/** * Performs Rocchio's query expansion with pseudo feedback for each fields * separatlly qm = alpha * query + ( beta / relevanDocsCount ) * Sum ( rel * docs vector )//from w ww. ja v a2 s . c o m * * @param query * * @return expandedQuery * * @throws IOException * @throws ParseException */ @Override public Query expandQuery(PatentQuery query) throws ParseException, IOException { BooleanQuery bQuery = new BooleanQuery(); BooleanQuery bQueryFieldsExpanded = new BooleanQuery(); BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE); //******************************************************************** //**************** Get the sec of definition codes ******************* //******************************************************************** TotalHitCountCollector collector = new TotalHitCountCollector(); Query codesQuery = GenerateClassCodesQuery.generateQuery(query.getFullClassCodes()); // System.err.println(codesQuery); classCodesSearcher.search(codesQuery, collector); IndexReader ir = classCodesSearcher.getIndexReader(); TopDocs hits = classCodesSearcher.search(codesQuery, Math.max(1, collector.getTotalHits())); // Compute PRF set // System.err.println("Found " + hits.totalHits // + " document(s) that matched query '" // + codesQuery + "':"); // for (ScoreDoc scoreDoc : hits.scoreDocs) { // System.out.println("----------"); // Document doc = classCodesSearcher.doc(scoreDoc.doc); // System.out.println(scoreDoc.score + "\t" + doc.get(PatentDocument.Classification) + "\t" + doc.get(PatentDocument.Title));// + "\t" + doc.get("type") + "\t" + doc.get("num") + "\t" + doc.get("lang")); //// System.out.println(explanation.toString()); // } // System.out.println("*************************************"); Query expandedQuery = null; ClassCodeBasedQueryExpansion queryExpansion = new ClassCodeBasedQueryExpansion(hits, ir, parameters, Nbr_Terms); for (int i = 1; i < PatentQuery.getFields().length; i++) { if (query.getQueries()[i] != null && !query.getQueries()[i].equals("") && (i 
!= 4 || i != 6) && query.getBoosts().get(PatentQuery.getFields()[i]) != 0) { QueryParser qp = new QueryParser(Version.LUCENE_48, PatentQuery.getFields()[i], new StandardAnalyzer(Version.LUCENE_48)); BooleanQuery bQueryFields = new BooleanQuery();// Contain a field to make the PRF field by field Query q = qp.parse(query.getQueries()[i]); if (query.isFilter()) { Query filter = new QueryParser(Version.LUCENE_48, PatentQuery.getFields()[0], new StandardAnalyzer(Version.LUCENE_48)).parse(query.getQueries()[0]); bQueryFields.add(filter, BooleanClause.Occur.MUST); } if (!(q instanceof BooleanQuery) || ((BooleanQuery) q).getClauses().length > 0) { bQueryFields.add(q, BooleanClause.Occur.MUST); } // System.err.println(hits.totalHits + " total matching documents for field " + query.getFields()[i] + "."); if (expandedQuery == null) { expandedQuery = queryExpansion.expandQuery(q, PatentQuery.getFields()[i]); } else { BooleanQuery bq = ((BooleanQuery) expandedQuery).clone(); BooleanQuery bq2 = new BooleanQuery(); for (BooleanClause bc : bq.clauses()) { TermQuery tq = (TermQuery) bc.getQuery(); Term term = new Term(PatentQuery.getFields()[i], tq.getTerm().text()); TermQuery tq2 = new TermQuery(term); tq2.setBoost(tq.getBoost()); bq2.add(tq2, BooleanClause.Occur.SHOULD); } expandedQuery = bq2; } bQueryFieldsExpanded.add(expandedQuery, BooleanClause.Occur.SHOULD);// Compute the new expanded query based on PRF set // System.err.println("Expanded Query: " + expandedQuery); // hits = searcher.search(expandedQuery, 100); // System.err.println(hits.totalHits + " total matching documents"+ query.getFields()[i] + "."); } } if (query.isFilter()) { Query q = new QueryParser(Version.LUCENE_48, PatentQuery.getFields()[0], new StandardAnalyzer(Version.LUCENE_48)).parse(query.getQueries()[0]); q.setBoost(query.getBoosts().get(PatentQuery.getFields()[0])); bQuery.add(q, BooleanClause.Occur.MUST); } bQuery.add(bQueryFieldsExpanded, BooleanClause.Occur.MUST); // hits = searcher.search(bQuery, 
100); // System.err.println(hits.totalHits + " total matching documents."); return bQuery; }
From source file:com.hrstc.lucene.queryexpansion.PatentRocchioQueryExpansion.java
License:Apache License
/** * Performs Rocchio's query expansion with pseudo feedback for each fields * separatlly qm = alpha * query + ( beta / relevanDocsCount ) * Sum ( rel * docs vector )/*from w ww . jav a 2 s. c o m*/ * * @param query * * @return expandedQuery * * @throws IOException * @throws ParseException */ @Override public Query expandQuery(PatentQuery query) throws ParseException, IOException { BooleanQuery bQuery = new BooleanQuery(); BooleanQuery bQueryFieldsExpanded = new BooleanQuery(); BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE); //***************************************************************** //**************** Compute the PRF for field (i)******************* //***************************************************************** TotalHitCountCollector collector = new TotalHitCountCollector(); searcher.search(query.parse(), collector); IndexReader ir = searcher.getIndexReader(); TopDocs hits = searcher.search(query.parse(), Math.max(1, collector.getTotalHits())); // Compute PRF set // System.err.println(hits.totalHits + " total matching documents for field " + query.getFields()[i] + "."); Query expandedQuery = null; RocchioQueryExpansion queryExpansion = new RocchioQueryExpansion(hits, ir, parameters, source, Nbr_Docs, Nbr_Terms); for (int i = 1; i < PatentQuery.getFields().length; i++) { if (query.getQueries()[i] != null && !query.getQueries()[i].equals("") && (i != 4 || i != 6) && query.getBoosts().get(PatentQuery.getFields()[i]) != 0) { QueryParser qp = new QueryParser(Version.LUCENE_48, PatentQuery.getFields()[i], new StandardAnalyzer(Version.LUCENE_48)); // BooleanQuery bQueryFields = new BooleanQuery();// Contain a field to make the PRF field by field Query q = qp.parse(query.getQueries()[i]); // if (query.isFilter()) { // Query filter = new QueryParser(Version.LUCENE_48, PatentQuery.getFields()[0], // new StandardAnalyzer(Version.LUCENE_48)).parse(query.getQueries()[0]); // bQueryFields.add(filter, BooleanClause.Occur.MUST); // } // if (!(q instanceof 
BooleanQuery) || ((BooleanQuery) q).getClauses().length > 0) { // bQueryFields.add(q, BooleanClause.Occur.MUST); // } if (expandedQuery == null) { expandedQuery = queryExpansion.expandQuery(q, PatentQuery.getFields()[i]); } else { BooleanQuery bq = ((BooleanQuery) expandedQuery).clone(); BooleanQuery bq2 = new BooleanQuery(); for (BooleanClause bc : bq.clauses()) { TermQuery tq = (TermQuery) bc.getQuery(); Term term = new Term(PatentQuery.getFields()[i], tq.getTerm().text()); TermQuery tq2 = new TermQuery(term); tq2.setBoost(tq.getBoost()); bq2.add(tq2, BooleanClause.Occur.SHOULD); } expandedQuery = bq2; } bQueryFieldsExpanded.add(expandedQuery, BooleanClause.Occur.SHOULD);// Compute the new expanded query based on PRF set // System.err.println("Expanded Query: " + expandedQuery); // hits = searcher.search(expandedQuery, 100); // System.err.println(hits.totalHits + " total matching documents"+ query.getFields()[i] + "."); } } if (query.isFilter()) { Query q = new QueryParser(Version.LUCENE_48, PatentQuery.getFields()[0], new StandardAnalyzer(Version.LUCENE_48)).parse(query.getQueries()[0]); q.setBoost(query.getBoosts().get(PatentQuery.getFields()[0])); bQuery.add(q, BooleanClause.Occur.MUST); } bQuery.add(bQueryFieldsExpanded, BooleanClause.Occur.MUST); // TopDocs hits = searcher.search(bQuery, 100); // System.err.println(hits.totalHits + " total matching documents."); return bQuery; }