List of usage examples for org.apache.lucene.index.IndexReader.getDocCount
public abstract int getDocCount(String field) throws IOException;
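getDocCount returns the number of documents that have at least one term for the given field. A minimal, self-contained sketch of the call in isolation (the index path and the "contents" field name are placeholders, not taken from the examples below):

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class GetDocCountExample {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index (the path is a placeholder).
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // Number of documents with at least one term in "contents";
            // this can be smaller than reader.maxDoc() if the field is sparse.
            int docCount = reader.getDocCount("contents");
            System.out.println("docCount(contents) = " + docCount);
        }
    }
}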
From source file:indexer.Retriever.java
private String getIDF(IndexReader reader, String word) throws IOException {
    ClassicSimilarity similarity = new ClassicSimilarity();
    int documentsFreq = 0;
    float idf = 0;
    Term term = new Term(documentField, word);
    int _documentsFreq = reader.docFreq(term);
    int documentsCount = reader.getDocCount(documentField);
    idf += similarity.idf(_documentsFreq, documentsCount);
    documentsFreq += _documentsFreq;
    String printString = word + ": " + idf + " (in " + documentsFreq + " documents)";
    return printString;
}
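ClassicSimilarity.idf(docFreq, docCount) evaluates to log((docCount + 1) / (docFreq + 1)) + 1, so getDocCount(documentField) supplies the docCount argument; for a sparse field this is smaller than maxDoc(), keeping the IDF consistent with the documents that actually carry the field.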
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * Select {@code R*N} docs from the ranking results and the index as the reranking pool.
 * The process is:
 * 1. Keep the top R documents in the original ranking list
 * 2. Randomly pick {@code (N-1)*R} documents from the rest of the index so in total we have R*N documents
 *
 * @param docs The initial ranking results
 * @param context An instance of RerankerContext
 * @return a Set of {@code R*N} document Ids
 */
private Set<Integer> selectDocs(ScoredDocuments docs, RerankerContext<T> context) throws IOException {
    Set<Integer> docidSet = new HashSet<>(Arrays.asList(
            ArrayUtils.toObject(Arrays.copyOfRange(docs.ids, 0, Math.min(this.R, docs.ids.length)))));
    long targetSize = this.R * this.N;
    if (docidSet.size() < targetSize) {
        IndexReader reader;
        IndexSearcher searcher;
        if (this.externalIndexPath != null) {
            Path indexPath = Paths.get(this.externalIndexPath);
            if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
                throw new IllegalArgumentException(
                        this.externalIndexPath + " does not exist or is not a directory.");
            }
            reader = DirectoryReader.open(FSDirectory.open(indexPath));
            searcher = new IndexSearcher(reader);
        } else {
            searcher = context.getIndexSearcher();
            reader = searcher.getIndexReader();
        }
        int availableDocsCnt = reader.getDocCount(this.field);
        if (this.deterministic) {
            // internal docids cannot be relied on due to multi-threaded indexing,
            // so we have to rely on external docids here
            Random random = new Random(this.seed);
            while (docidSet.size() < targetSize) {
                if (this.externalDocidsCache != null) {
                    String docid = this.externalDocidsCache
                            .get(random.nextInt(this.externalDocidsCache.size()));
                    Query q = new TermQuery(new Term(LuceneDocumentGenerator.FIELD_ID, docid));
                    TopDocs rs = searcher.search(q, 1);
                    docidSet.add(rs.scoreDocs[0].doc);
                } else {
                    docidSet.add(this.internalDocidsCache[random.nextInt(this.internalDocidsCache.length)].doc);
                }
            }
        } else {
            Random random = new Random();
            while (docidSet.size() < targetSize) {
                docidSet.add(random.nextInt(availableDocsCnt));
            }
        }
    }
    return docidSet;
}
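In the non-deterministic branch, getDocCount(this.field) doubles as the exclusive upper bound for sampling internal doc IDs, which implicitly assumes every document in the index carries the field (so that the doc count equals maxDoc()); for a sparse field some sampled IDs would belong to documents without the field.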
From source file:nicta.com.au.patent.pac.terms.impact.FeaturesSelection.java
public void iterateOverQueryTerms() throws ParseException, Exception {
    long start = System.currentTimeMillis();
    int l = 0;
    // System.out.println("queryid\tterm\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\t"
    //         + "nbrUniqTerms\tqSize\tscq\tisInTitle\tisInAbstract\tisInDescription\tisInClaims");
    System.out.println(
            "queryid\tremovedBooleanClause\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\tnbrUniqTerms\tqSize\tscq\tSCS\tictf\tQC\tclarity\tfreqInTitle\tratioInTitle\tfreqDescription\tratioInDescription\tfreqClaims\tratioInClaims");
    for (Map.Entry<String, PatentDocument> e : topics.getTopics().entrySet()) {
        l++;
        String queryid = e.getKey();
        PatentDocument pt = e.getValue();
        // System.err.print(l + "- " + queryid + " -> " + pt.getUcid() + ": ");
        long start2 = System.currentTimeMillis();
        PatentQuery query = new PatentQuery(pt, boosts, filter, stopWords);
        BooleanQuery bQuery = (BooleanQuery) query.parse();
        if (bQuery.getClauses().length != 2
                || !(bQuery.getClauses()[1].getQuery() instanceof BooleanQuery)
                || ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses().length == 0
                || !(((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                        .getQuery() instanceof BooleanQuery)) {
            continue;
        }
        BooleanQuery bQuery2 = (BooleanQuery) ((BooleanQuery) bQuery.getClauses()[1].getQuery())
                .getClauses()[0].getQuery();
        for (int i = 0; i < bQuery2.clauses().size(); i++) {
            BooleanQuery bQueryFinal = new BooleanQuery();
            BooleanQuery bQuery3 = bQuery2.clone();
            BooleanClause removedBooleanClause = bQuery3.clauses().remove(i);
            bQueryFinal.add((Query) bQuery.getClauses()[0].getQuery(), BooleanClause.Occur.MUST);
            bQueryFinal.add(bQuery3, BooleanClause.Occur.MUST);
            //***************************
            // Get features
            //***************************
            IndexReader ir = searcher.getIndexSearch().getIndexReader();
            TermQuery term = (TermQuery) removedBooleanClause.getQuery();
            double tf = removedBooleanClause.getQuery().getBoost(); // Term frequency
            double ln_tf = Math.log(1 + tf); // Log of the term frequency
            int totalTF = ir.docFreq(term.getTerm());
            int docs = ir.getDocCount(term.getTerm().field());
            double idf = 0;
            if (totalTF != 0) {
                idf = Math.log10((double) docs / (totalTF)); // Inverse document frequency
            }
            double tfidf = ln_tf * idf; // TF-IDF
            int tLength = term.getTerm().text().length(); // Term length
            int qSize = 0;
            if (term.getTerm().field().endsWith(PatentDocument.Title)) {
                qSize = query.getTitleSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Abstract)) {
                qSize = query.getAbstractSize();
            } else if (term.getTerm().field().endsWith(PatentDocument.Description)) {
                qSize = query.getDescriptionSize();
            } else if (term.getTerm().field().endsWith(PatentDocument.Claims)) {
                qSize = query.getClaimsSize();
            }
            double ratioTerm = (double) tf / qSize;
            int nbrUniqTerms = bQuery2.getClauses().length;
            long totalTermFreq = ir.totalTermFreq(term.getTerm());
            double ln_totalTermFreq = Math.log(1 + totalTermFreq);
            double scq = ln_totalTermFreq * idf;
            double freqInTitle = query.getFreqInTitle(term.getTerm().text());
            double ratioInTitle = (double) freqInTitle / query.getTitleSize();
            double freqAbstract = query.getFreqInAbstract(term.getTerm().text());
            double ratioInAbstract = (double) freqAbstract / query.getAbstractSize();
            double freqDescription = query.getFreqInDescription(term.getTerm().text());
            double ratioInDescription = (double) freqDescription / query.getDescriptionSize();
            double freqClaims = query.getFreqInClaims(term.getTerm().text());
            double ratioInClaims = (double) freqClaims / query.getClaimsSize();
            double Pcoll = (double) totalTermFreq / ir.getSumTotalTermFreq(term.getTerm().field());
            double SCS = 0;
            double ictf = 0;
            List<TermFreqVector> docsTermVector = getDocsTerms(searcher.search(term), term.getTerm().field());
            double a1 = 0;
            for (TermFreqVector vec : docsTermVector) {
                a1 += Math.sqrt((double) vec.getFreq(term.getTerm().text()) / vec.numberOfTerms());
            }
            double clarity = 0;
            if (totalTermFreq != 0) {
                SCS = ratioTerm * Log2(ratioTerm / Pcoll); // Simplified Clarity Score
                ictf = Math.log10((double) docs / (totalTermFreq)); // Inverse collection term frequency
                clarity = a1 * Log2(a1 / Pcoll);
            }
            double QC = totalTF / (double) docs; // Query scope
            //***************************
            System.out.println(queryid + "\t" + removedBooleanClause + "\t" + tf + "\t" + ln_tf + "\t" + idf
                    + "\t" + tfidf + "\t" + tLength + "\t" + ratioTerm + "\t" + nbrUniqTerms + "\t" + qSize
                    + "\t" + scq + "\t" + SCS + "\t" + ictf + "\t" + QC + "\t" + clarity + "\t" + freqInTitle
                    + "\t" + ratioInTitle + "\t" + freqDescription + "\t" + ratioInDescription + "\t"
                    + freqClaims + "\t" + ratioInClaims);
        }
        long end2 = System.currentTimeMillis();
        // System.err.println(bQuery2.clauses().size() + " terms processed in " + Functions.getTimer(end2 - start2) + ".");
    }
    long end = System.currentTimeMillis();
    long millis = (end - start);
    System.err.println("#Global Execution time: " + Functions.getTimer(millis) + ".");
}
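Here getDocCount supplies the collection size, docs, that feeds the idf, ictf, and query-scope (QC) features, counting only the documents that actually contain the term's field rather than the whole index.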
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper.java
License:Apache License
public static void updateSuggester(Directory directory, Analyzer analyzer, IndexReader reader) throws IOException {
    File tempDir = null;
    try {
        // AnalyzingInfixSuggester takes a file parameter. It uses its path to getDirectory()
        // for actual storage of suggester data. BUT, while building, it also does getDirectory() to
        // a temporary location (original path + ".tmp"). So, instead, we create a temp dir plus a
        // placeholder non-existing sub-child that marks the location where we want to return
        // our internal suggestion OakDirectory. After the build is done, we delete the temp directory,
        // thereby removing any temp stuff the suggester created in the interim.
        tempDir = Files.createTempDir();
        File tempSubChild = new File(tempDir, "non-existing-sub-child");
        if (reader.getDocCount(FieldNames.SUGGEST) > 0) {
            Dictionary dictionary = new LuceneDictionary(reader, FieldNames.SUGGEST);
            getLookup(directory, analyzer, tempSubChild).build(dictionary);
        }
    } catch (RuntimeException e) {
        log.debug("could not update the suggester", e);
    } finally {
        // cleanup temp dir
        if (tempDir != null && !FileUtils.deleteQuietly(tempDir)) {
            log.error("Cleanup failed for temp dir {}", tempDir.getAbsolutePath());
        }
    }
}
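The getDocCount(FieldNames.SUGGEST) > 0 guard skips the comparatively expensive dictionary build entirely when no document in the index carries the suggest field.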
From source file:org.esa.beam.occci.LuceneQueryIndexMain.java
License:Open Source License
public static void main(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
    }
    File indexfile = new File(args[0]);
    File insituCSVtFile = new File(args[1]);
    if (!insituCSVtFile.exists()) {
        System.err.printf("insituList file '%s' does not exist%n", args[1]);
        printUsage();
    }
    int hours = 0;
    try {
        hours = Integer.parseInt(args[2]);
    } catch (NumberFormatException e) {
        e.printStackTrace();
        System.err.printf("cannot parse hours '%s'%n", args[2]);
        printUsage();
    }
    long maxTimeDifference = HOURS_IN_MILLIS * hours;
    final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS", ENGLISH);
    dateFormat.setCalendar(GregorianCalendar.getInstance(UTC, Locale.ENGLISH));
    List<SimpleRecord> insituRecords = ProductDBCheckerMain.readInsituRecords(insituCSVtFile);
    System.out.println("num insituRecords = " + insituRecords.size());
    Directory indexDirectory = FSDirectory.open(indexfile.toPath());
    IndexReader indexReader = DirectoryReader.open(indexDirectory);
    int numProductsInIndex = indexReader.getDocCount("name");
    System.out.println("numProductsInIndex = " + numProductsInIndex);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    DateRangePrefixTree dateRangePrefixTree = DateRangePrefixTree.INSTANCE;
    PrefixTreeStrategy strategy = new NumberRangePrefixTreeStrategy(dateRangePrefixTree, "productDateRange");
    SpatialOperation operation = SpatialOperation.Intersects;
    int hits = 0;
    long t1 = System.currentTimeMillis();
    Set<Integer> matches = new HashSet<>();
    Calendar calendar = dateRangePrefixTree.newCal();
    for (SimpleRecord insituRecord : insituRecords) {
        final long referenceTime = insituRecord.getTime();
        final long windowStartTime = referenceTime - maxTimeDifference;
        final long windowEndTime = referenceTime + maxTimeDifference;
        calendar.setTimeInMillis(windowStartTime);
        NumberRangePrefixTree.UnitNRShape leftShape = dateRangePrefixTree.toShape(calendar);
        calendar.setTimeInMillis(windowEndTime);
        NumberRangePrefixTree.UnitNRShape rightShape = dateRangePrefixTree.toShape(calendar);
        NumberRangePrefixTree.NRShape nrShape = dateRangePrefixTree.toRangeShape(leftShape, rightShape);
        SpatialArgs sargs = new SpatialArgs(operation, nrShape);
        Query query = strategy.makeQuery(sargs);
        TopDocs topDocs = indexSearcher.search(query, 1000);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (ScoreDoc scoreDoc : scoreDocs) {
            matches.add(scoreDoc.doc);
        }
        // Document doc = indexSearcher.doc(docID);
        // String productName = doc.get("name");
        // matches.add(productName);
        // }
        // System.out.println("topDocs.totalHits = " + topDocs.totalHits);
        // hits += topDocs.totalHits;
    }
    long t2 = System.currentTimeMillis();
    System.out.println("delta time test insitu = " + ((t2 - t1) / 1000f));
    System.out.println("hits = " + hits);
    System.out.println("matches = " + matches.size());
}
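Here getDocCount("name") serves as a quick sanity check of index size before the temporal queries run, reporting how many indexed products carry a "name" field.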