List of usage examples for org.apache.lucene.index.IndexReader.getDocCount
public abstract int getDocCount(String field) throws IOException;
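getDocCount returns the number of documents that have at least one term for the given field. A minimal, self-contained sketch of the call in isolation (the index path and the "contents" field name are placeholders, not taken from the examples below):

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class GetDocCountExample {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index (the path is a placeholder).
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // Number of documents with at least one term in "contents";
            // this can be smaller than reader.maxDoc() if the field is sparse.
            int docCount = reader.getDocCount("contents");
            System.out.println("docCount(contents) = " + docCount);
        }
    }
}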
From source file:indexer.Retriever.java
private String getIDF(IndexReader reader, String word) throws IOException {
    ClassicSimilarity similarity = new ClassicSimilarity();
    int documentsFreq = 0;
    float idf = 0;
    Term term = new Term(documentField, word);
    int _documentsFreq = reader.docFreq(term);
    int documentsCount = reader.getDocCount(documentField);
    idf += similarity.idf(_documentsFreq, documentsCount);
    documentsFreq += _documentsFreq;
    String printString = word + ": " + idf + " (in " + documentsFreq + " documents)";
    return printString;
}
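ClassicSimilarity.idf(docFreq, docCount) evaluates to log((docCount + 1) / (docFreq + 1)) + 1, so getDocCount(documentField) supplies the docCount argument; for a sparse field this is smaller than maxDoc(), keeping the IDF consistent with the documents that actually carry the field.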
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * Select {@code R*N} docs from the ranking results and the index as the reranking pool.
 * The process is:
 * 1. Keep the top R documents in the original ranking list
 * 2. Randomly pick {@code (N-1)*R} documents from the rest of the index so in total we have R*N documents
 *
 * @param docs The initial ranking results
 * @param context An instance of RerankerContext
 * @return a Set of {@code R*N} document Ids
 */
private Set<Integer> selectDocs(ScoredDocuments docs, RerankerContext<T> context) throws IOException {
    Set<Integer> docidSet = new HashSet<>(Arrays.asList(
            ArrayUtils.toObject(Arrays.copyOfRange(docs.ids, 0, Math.min(this.R, docs.ids.length)))));
    long targetSize = this.R * this.N;
    if (docidSet.size() < targetSize) {
        IndexReader reader;
        IndexSearcher searcher;
        if (this.externalIndexPath != null) {
            Path indexPath = Paths.get(this.externalIndexPath);
            if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
                throw new IllegalArgumentException(
                        this.externalIndexPath + " does not exist or is not a directory.");
            }
            reader = DirectoryReader.open(FSDirectory.open(indexPath));
            searcher = new IndexSearcher(reader);
        } else {
            searcher = context.getIndexSearcher();
            reader = searcher.getIndexReader();
        }
        int availableDocsCnt = reader.getDocCount(this.field);
        if (this.deterministic) {
            // internal docids cannot be relied on due to multi-threaded indexing,
            // so we have to rely on external docids here
            Random random = new Random(this.seed);
            while (docidSet.size() < targetSize) {
                if (this.externalDocidsCache != null) {
                    String docid = this.externalDocidsCache
                            .get(random.nextInt(this.externalDocidsCache.size()));
                    Query q = new TermQuery(new Term(LuceneDocumentGenerator.FIELD_ID, docid));
                    TopDocs rs = searcher.search(q, 1);
                    docidSet.add(rs.scoreDocs[0].doc);
                } else {
                    docidSet.add(this.internalDocidsCache[random.nextInt(this.internalDocidsCache.length)].doc);
                }
            }
        } else {
            Random random = new Random();
            while (docidSet.size() < targetSize) {
                docidSet.add(random.nextInt(availableDocsCnt));
            }
        }
    }
    return docidSet;
}
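In the non-deterministic branch, getDocCount(this.field) doubles as the exclusive upper bound for sampling internal doc IDs, which implicitly assumes every document in the index carries the field (so that the doc count equals maxDoc()); for a sparse field some sampled IDs would belong to documents without the field.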
From source file:nicta.com.au.patent.pac.terms.impact.FeaturesSelection.java
public void iterateOverQueryTerms() throws ParseException, Exception {
    long start = System.currentTimeMillis();
    int l = 0;
    // System.out.println("queryid\tterm\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\t"
    //         + "nbrUniqTerms\tqSize\tscq\tisInTitle\tisInAbstract\tisInDescription\tisInClaims");
    System.out.println(
            "queryid\tremovedBooleanClause\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\tnbrUniqTerms\tqSize\tscq\tSCS\tictf\tQC\tclarity\tfreqInTitle\tratioInTitle\tfreqDescription\tratioInDescription\tfreqClaims\tratioInClaims");
    for (Map.Entry<String, PatentDocument> e : topics.getTopics().entrySet()) {
        l++;
        String queryid = e.getKey();
        PatentDocument pt = e.getValue();
        // System.err.print(l + "- " + queryid + " -> " + pt.getUcid() + ": ");
        long start2 = System.currentTimeMillis();
        PatentQuery query = new PatentQuery(pt, boosts, filter, stopWords);
        BooleanQuery bQuery = (BooleanQuery) query.parse();
        if (bQuery.getClauses().length != 2
                || !(bQuery.getClauses()[1].getQuery() instanceof BooleanQuery)
                || ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses().length == 0
                || !(((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                        .getQuery() instanceof BooleanQuery)) {
            continue;
        }
        BooleanQuery bQuery2 = (BooleanQuery) ((BooleanQuery) bQuery.getClauses()[1].getQuery())
                .getClauses()[0].getQuery();
        for (int i = 0; i < bQuery2.clauses().size(); i++) {
            BooleanQuery bQueryFinal = new BooleanQuery();
            BooleanQuery bQuery3 = bQuery2.clone();
            BooleanClause removedBooleanClause = bQuery3.clauses().remove(i);
            bQueryFinal.add((Query) bQuery.getClauses()[0].getQuery(), BooleanClause.Occur.MUST);
            bQueryFinal.add(bQuery3, BooleanClause.Occur.MUST);
            //***************************
            // Get features
            //***************************
            IndexReader ir = searcher.getIndexSearch().getIndexReader();
            TermQuery term = (TermQuery) removedBooleanClause.getQuery();
            double tf = removedBooleanClause.getQuery().getBoost(); // Term frequency
            double ln_tf = Math.log(1 + tf); // Log of the term frequency
            int totalTF = ir.docFreq(term.getTerm());
            int docs = ir.getDocCount(term.getTerm().field());
            double idf = 0;
            if (totalTF != 0) {
                idf = Math.log10((double) docs / (totalTF)); // Inverse document frequency
            }
            double tfidf = ln_tf * idf; // TF-IDF
            int tLength = term.getTerm().text().length(); // Term length
            int qSize = 0;
            if (term.getTerm().field().endsWith(PatentDocument.Title)) {
                qSize = query.getTitleSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Abstract)) {
                qSize = query.getAbstractSize();
            } else if (term.getTerm().field().endsWith(PatentDocument.Description)) {
                qSize = query.getDescriptionSize();
            } else if (term.getTerm().field().endsWith(PatentDocument.Claims)) {
                qSize = query.getClaimsSize();
            }
            double ratioTerm = (double) tf / qSize;
            int nbrUniqTerms = bQuery2.getClauses().length;
            long totalTermFreq = ir.totalTermFreq(term.getTerm());
            double ln_totalTermFreq = Math.log(1 + totalTermFreq);
            double scq = ln_totalTermFreq * idf;
            double freqInTitle = query.getFreqInTitle(term.getTerm().text());
            double ratioInTitle = (double) freqInTitle / query.getTitleSize();
            double freqAbstract = query.getFreqInAbstract(term.getTerm().text());
            double ratioInAbstract = (double) freqAbstract / query.getAbstractSize();
            double freqDescription = query.getFreqInDescription(term.getTerm().text());
            double ratioInDescription = (double) freqDescription / query.getDescriptionSize();
            double freqClaims = query.getFreqInClaims(term.getTerm().text());
            double ratioInClaims = (double) freqClaims / query.getClaimsSize();
            double Pcoll = (double) totalTermFreq / ir.getSumTotalTermFreq(term.getTerm().field());
            double SCS = 0;
            double ictf = 0;
            List<TermFreqVector> docsTermVector = getDocsTerms(searcher.search(term), term.getTerm().field());
            double a1 = 0;
            for (TermFreqVector vec : docsTermVector) {
                a1 += Math.sqrt((double) vec.getFreq(term.getTerm().text()) / vec.numberOfTerms());
            }
            double clarity = 0;
            if (totalTermFreq != 0) {
                SCS = ratioTerm * Log2(ratioTerm / Pcoll); // Simplified Clarity Score
                ictf = Math.log10((double) docs / (totalTermFreq)); // Inverse collection term frequency
                clarity = a1 * Log2(a1 / Pcoll);
            }
            double QC = totalTF / (double) docs; // Query scope
            //***************************
            System.out.println(queryid + "\t" + removedBooleanClause + "\t" + tf + "\t" + ln_tf + "\t" + idf
                    + "\t" + tfidf + "\t" + tLength + "\t" + ratioTerm + "\t" + nbrUniqTerms + "\t" + qSize
                    + "\t" + scq + "\t" + SCS + "\t" + ictf + "\t" + QC + "\t" + clarity + "\t" + freqInTitle
                    + "\t" + ratioInTitle + "\t" + freqDescription + "\t" + ratioInDescription + "\t"
                    + freqClaims + "\t" + ratioInClaims);
        }
        long end2 = System.currentTimeMillis();
        // System.err.println(bQuery2.clauses().size() + " terms processed in " + Functions.getTimer(end2 - start2) + ".");
    }
    long end = System.currentTimeMillis();
    long millis = (end - start);
    System.err.println("#Global Execution time: " + Functions.getTimer(millis) + ".");
}
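Here getDocCount supplies the collection size, docs, that feeds the idf, ictf, and query-scope (QC) features, counting only the documents that actually contain the term's field rather than the whole index.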
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper.java
License:Apache License
public static void updateSuggester(Directory directory, Analyzer analyzer, IndexReader reader) throws IOException {
    File tempDir = null;
    try {
        // AnalyzingInfixSuggester takes a file parameter. It uses its path to getDirectory()
        // for actual storage of suggester data. BUT, while building, it also does getDirectory() to
        // a temporary location (original path + ".tmp"). So, instead, we create a temp dir plus a
        // placeholder non-existing sub-child that marks the location where we want to return
        // our internal suggestion OakDirectory. After the build is done, we delete the temp directory,
        // thereby removing any temp stuff the suggester created in the interim.
        tempDir = Files.createTempDir();
        File tempSubChild = new File(tempDir, "non-existing-sub-child");
        if (reader.getDocCount(FieldNames.SUGGEST) > 0) {
            Dictionary dictionary = new LuceneDictionary(reader, FieldNames.SUGGEST);
            getLookup(directory, analyzer, tempSubChild).build(dictionary);
        }
    } catch (RuntimeException e) {
        log.debug("could not update the suggester", e);
    } finally {
        // cleanup temp dir
        if (tempDir != null && !FileUtils.deleteQuietly(tempDir)) {
            log.error("Cleanup failed for temp dir {}", tempDir.getAbsolutePath());
        }
    }
}
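The getDocCount(FieldNames.SUGGEST) > 0 guard skips the comparatively expensive dictionary build entirely when no document in the index carries the suggest field.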
From source file:org.esa.beam.occci.LuceneQueryIndexMain.java
License:Open Source License
public static void main(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
    }
    File indexfile = new File(args[0]);
    File insituCSVtFile = new File(args[1]);
    if (!insituCSVtFile.exists()) {
        System.err.printf("insituList file '%s' does not exist%n", args[1]);
        printUsage();
    }
    int hours = 0;
    try {
        hours = Integer.parseInt(args[2]);
    } catch (NumberFormatException e) {
        e.printStackTrace();
        System.err.printf("cannot parse hours '%s'%n", args[2]);
        printUsage();
    }
    long maxTimeDifference = HOURS_IN_MILLIS * hours;
    final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS", ENGLISH);
    dateFormat.setCalendar(GregorianCalendar.getInstance(UTC, Locale.ENGLISH));
    List<SimpleRecord> insituRecords = ProductDBCheckerMain.readInsituRecords(insituCSVtFile);
    System.out.println("num insituRecords = " + insituRecords.size());
    Directory indexDirectory = FSDirectory.open(indexfile.toPath());
    IndexReader indexReader = DirectoryReader.open(indexDirectory);
    int numProductsInIndex = indexReader.getDocCount("name");
    System.out.println("numProductsInIndex = " + numProductsInIndex);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    DateRangePrefixTree dateRangePrefixTree = DateRangePrefixTree.INSTANCE;
    PrefixTreeStrategy strategy = new NumberRangePrefixTreeStrategy(dateRangePrefixTree, "productDateRange");
    SpatialOperation operation = SpatialOperation.Intersects;
    int hits = 0;
    long t1 = System.currentTimeMillis();
    Set<Integer> matches = new HashSet<>();
    Calendar calendar = dateRangePrefixTree.newCal();
    for (SimpleRecord insituRecord : insituRecords) {
        final long referenceTime = insituRecord.getTime();
        final long windowStartTime = referenceTime - maxTimeDifference;
        final long windowEndTime = referenceTime + maxTimeDifference;
        calendar.setTimeInMillis(windowStartTime);
        NumberRangePrefixTree.UnitNRShape leftShape = dateRangePrefixTree.toShape(calendar);
        calendar.setTimeInMillis(windowEndTime);
        NumberRangePrefixTree.UnitNRShape rightShape = dateRangePrefixTree.toShape(calendar);
        NumberRangePrefixTree.NRShape nrShape = dateRangePrefixTree.toRangeShape(leftShape, rightShape);
        SpatialArgs sargs = new SpatialArgs(operation, nrShape);
        Query query = strategy.makeQuery(sargs);
        TopDocs topDocs = indexSearcher.search(query, 1000);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (ScoreDoc scoreDoc : scoreDocs) {
            matches.add(scoreDoc.doc);
        }
        // Document doc = indexSearcher.doc(docID);
        // String productName = doc.get("name");
        // matches.add(productName);
        // }
        // System.out.println("topDocs.totalHits = " + topDocs.totalHits);
        // hits += topDocs.totalHits;
    }
    long t2 = System.currentTimeMillis();
    System.out.println("delta time test insitu = " + ((t2 - t1) / 1000f));
    System.out.println("hits = " + hits);
    System.out.println("matches = " + matches.size());
}
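Here getDocCount("name") serves as a quick sanity check of index size before the temporal queries run, reporting how many indexed products carry a "name" field.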