Example usage for org.apache.lucene.index IndexReader getDocCount

Introduction

On this page you can find example usages of org.apache.lucene.index.IndexReader.getDocCount.

Prototype

public abstract int getDocCount(String field) throws IOException;

Document

Returns the number of documents that have at least one term for this field.
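
For orientation, here is a minimal, self-contained sketch of calling getDocCount; the index path and the "contents" field name are placeholders rather than values taken from the examples below.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class GetDocCountExample {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index (the path is a placeholder).
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // Count the documents that have at least one term in the "contents" field.
            int docCount = reader.getDocCount("contents");
            System.out.println("documents with field 'contents': " + docCount);
        }
    }
}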

Usage

From source file:indexer.Retriever.java

private String getIDF(IndexReader reader, String word) throws IOException {
    ClassicSimilarity similarity = new ClassicSimilarity();

    Term term = new Term(documentField, word);
    // Number of documents that contain the term.
    int documentsFreq = reader.docFreq(term);
    // Number of documents that have at least one term in this field.
    int documentsCount = reader.getDocCount(documentField);
    float idf = similarity.idf(documentsFreq, documentsCount);

    return word + ": " + idf + " (in " + documentsFreq + " documents)";
}
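
As a point of reference, in the Lucene versions these examples target, ClassicSimilarity computes idf(docFreq, docCount) as 1 + ln(docCount / (docFreq + 1)), so getDocCount supplies the per-field collection size that anchors the classic TF-IDF weight.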

From source file:io.anserini.rerank.lib.AxiomReranker.java

License:Apache License

/**
 * Select {@code R*N} docs from the ranking results and the index as the reranking pool.
 * The process is:
 * 1. Keep the top R documents in the original ranking list
 * 2. Randomly pick {@code (N-1)*R} documents from the rest of the index so that in total we have {@code R*N} documents
 *
 * @param docs The initial ranking results
 * @param context An instance of RerankerContext
 * @return a Set of {@code R*N} document Ids
 */
private Set<Integer> selectDocs(ScoredDocuments docs, RerankerContext<T> context) throws IOException {
    Set<Integer> docidSet = new HashSet<>(Arrays
            .asList(ArrayUtils.toObject(Arrays.copyOfRange(docs.ids, 0, Math.min(this.R, docs.ids.length)))));
    long targetSize = this.R * this.N;

    if (docidSet.size() < targetSize) {
        IndexReader reader;
        IndexSearcher searcher;
        if (this.externalIndexPath != null) {
            Path indexPath = Paths.get(this.externalIndexPath);
            if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
                throw new IllegalArgumentException(
                        this.externalIndexPath + " does not exist, is not a directory, or is not readable.");
            }
            reader = DirectoryReader.open(FSDirectory.open(indexPath));
            searcher = new IndexSearcher(reader);
        } else {
            searcher = context.getIndexSearcher();
            reader = searcher.getIndexReader();
        }
        int availableDocsCnt = reader.getDocCount(this.field);
        if (this.deterministic) { // internal docids cannot be relied upon due to multi-threaded indexing,
                                  // so we have to rely on external docids here
            Random random = new Random(this.seed);
            while (docidSet.size() < targetSize) {
                if (this.externalDocidsCache != null) {
                    String docid = this.externalDocidsCache
                            .get(random.nextInt(this.externalDocidsCache.size()));
                    Query q = new TermQuery(new Term(LuceneDocumentGenerator.FIELD_ID, docid));
                    TopDocs rs = searcher.search(q, 1);
                    docidSet.add(rs.scoreDocs[0].doc);
                } else {
                    docidSet.add(this.internalDocidsCache[random.nextInt(this.internalDocidsCache.length)].doc);
                }
            }
        } else {
            Random random = new Random();
            while (docidSet.size() < targetSize) {
                docidSet.add(random.nextInt(availableDocsCnt));
            }
        }
    }

    return docidSet;
}
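
A note on the non-deterministic branch above: getDocCount(this.field) is used as an exclusive upper bound for random internal docids, which is only safe if every document in the index has the field and docids are contiguous; where that assumption may not hold, reader.maxDoc() would be the more defensive bound.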

From source file:nicta.com.au.patent.pac.terms.impact.FeaturesSelection.java

public void iterateOverQueryTerms() throws ParseException, Exception {
    long start = System.currentTimeMillis();
    int l = 0;
    //        System.out.println("queryid\tterm\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\t"
    //                + "nbrUniqTerms\tqSize\tscq\tisInTitle\tisInAbstract\tisInDescription\tisInClaims");

    System.out.println(
            "queryid\tremovedBooleanClause\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\tnbrUniqTerms\tqSize\tscq\tSCS\tictf\tQC\tclarity\tfreqInTitle\tratioInTitle\tfreqDescription\tratioInDescription\tfreqClaims\tratioInClaims");
    for (Map.Entry<String, PatentDocument> e : topics.getTopics().entrySet()) {
        l++;
        String queryid = e.getKey();
        PatentDocument pt = e.getValue();
        //            System.err.print(l + "- " + queryid + " -> " + pt.getUcid() + ": ");
        long start2 = System.currentTimeMillis();
        PatentQuery query = new PatentQuery(pt, boosts, filter, stopWords);
        BooleanQuery bQuery = (BooleanQuery) query.parse();
        if (bQuery.getClauses().length != 2 || !(bQuery.getClauses()[1].getQuery() instanceof BooleanQuery)
                || ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses().length == 0
                || !(((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                        .getQuery() instanceof BooleanQuery)) {
            continue;
        }
        BooleanQuery bQuery2 = (BooleanQuery) ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                .getQuery();
        for (int i = 0; i < bQuery2.clauses().size(); i++) {
            BooleanQuery bQueryFinal = new BooleanQuery();
            BooleanQuery bQuery3 = bQuery2.clone();
            BooleanClause removedBooleanClause = bQuery3.clauses().remove(i);
            bQueryFinal.add((Query) bQuery.getClauses()[0].getQuery(), BooleanClause.Occur.MUST);
            bQueryFinal.add(bQuery3, BooleanClause.Occur.MUST);
            //***************************
            // Get features
            //*************************** 
            IndexReader ir = searcher.getIndexSearch().getIndexReader();
            TermQuery term = (TermQuery) removedBooleanClause.getQuery();
            double tf = removedBooleanClause.getQuery().getBoost();// Term frequency
            double ln_tf = Math.log(1 + tf);// Get log of the term frequency
            int totalTF = ir.docFreq(term.getTerm());
            int docs = ir.getDocCount(term.getTerm().field());
            double idf = 0;
            if (totalTF != 0) {
                idf = Math.log10((double) docs / (totalTF));// Inverse document frequency
            }
            double tfidf = ln_tf * idf;// Compute the TFIDF
            int tLength = term.getTerm().text().length();// Term length
            int qSize = 0;
            if (term.getTerm().field().endsWith(PatentDocument.Title)) {
                qSize = query.getTitleSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Abstract)) {
                qSize = query.getAbstractSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Description)) {
                qSize = query.getDescriptionSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Claims)) {
                qSize = query.getClaimsSize(); // Query size
            }
            double ratioTerm = (double) tf / qSize;
            int nbrUniqTerms = bQuery2.getClauses().length;
            long totalTermFreq = ir.totalTermFreq(term.getTerm());
            double ln_totalTermFreq = Math.log(1 + totalTermFreq);
            double scq = ln_totalTermFreq * idf;
            double freqInTitle = query.getFreqInTitle(term.getTerm().text());
            double ratioInTitle = (double) freqInTitle / query.getTitleSize();
            double freqAbstract = query.getFreqInAbstract(term.getTerm().text());
            double ratioInAbstract = (double) freqAbstract / query.getAbstractSize();
            double freqDescription = query.getFreqInDescription(term.getTerm().text());
            double ratioInDescription = (double) freqDescription / query.getDescriptionSize();
            double freqClaims = query.getFreqInClaims(term.getTerm().text());
            double ratioInClaims = (double) freqClaims / query.getClaimsSize();
            double Pcoll = (double) totalTermFreq / ir.getSumTotalTermFreq(term.getTerm().field());
            double SCS = 0;
            double ictf = 0;
            List<TermFreqVector> docsTermVector = getDocsTerms(searcher.search(term), term.getTerm().field());
            double a1 = 0;
            for (TermFreqVector vec : docsTermVector) {
                a1 += Math.sqrt((double) vec.getFreq(term.getTerm().text()) / vec.numberOfTerms());
            }
            double clarity = 0;
            if (totalTermFreq != 0) {
                SCS = ratioTerm * Log2(ratioTerm / Pcoll);// Simplified Clarity Score
                ictf = Math.log10((double) docs / (totalTermFreq));// Inverse Collection Term Frequency
                clarity = a1 * Log2(a1 / Pcoll);
            }
            double QC = totalTF / (double) docs;// QueryScope

            //***************************
            System.out.println(queryid + "\t" + removedBooleanClause + "\t" + tf + "\t" + ln_tf + "\t" + idf
                    + "\t" + tfidf + "\t" + tLength + "\t" + ratioTerm + "\t" + nbrUniqTerms + "\t" + qSize
                    + "\t" + scq + "\t" + SCS + "\t" + ictf + "\t" + QC + "\t" + clarity + "\t" + freqInTitle
                    + "\t" + ratioInTitle + "\t" + freqDescription + "\t" + ratioInDescription + "\t"
                    + freqClaims + "\t" + ratioInClaims);
        }
        long end2 = System.currentTimeMillis();
        //            System.err.println(bQuery2.clauses().size() + " terms processed in " + Functions.getTimer(end2 - start2) + ".");
    }
    long end = System.currentTimeMillis();
    long millis = (end - start);
    System.err.println("#Global Execution time: " + Functions.getTimer(millis) + ".");
}
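
The Log2 helper invoked above is not part of this excerpt. A minimal sketch consistent with its usage (an assumption, not the original source):

    // Assumed helper: base-2 logarithm, matching the Log2(...) calls above.
    private static double Log2(double x) {
        return Math.log(x) / Math.log(2);
    }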

From source file:org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper.java

License:Apache License

public static void updateSuggester(Directory directory, Analyzer analyzer, IndexReader reader)
        throws IOException {
    File tempDir = null;
    try {
        //AnalyzingInfixSuggester takes a file parameter and uses its path via getDirectory()
        //for the actual storage of suggester data. However, while building, it also calls getDirectory()
        //on a temporary location (original path + ".tmp"). So instead we create a temp dir plus a
        //placeholder non-existing sub-child that marks the location at which we want to return our
        //internal suggestion OakDirectory. After the build is done, we delete the temp directory,
        //thereby removing any temporary files the suggester created in the interim.
        tempDir = Files.createTempDir();
        File tempSubChild = new File(tempDir, "non-existing-sub-child");

        if (reader.getDocCount(FieldNames.SUGGEST) > 0) {
            Dictionary dictionary = new LuceneDictionary(reader, FieldNames.SUGGEST);
            getLookup(directory, analyzer, tempSubChild).build(dictionary);
        }
    } catch (RuntimeException e) {
        log.debug("could not update the suggester", e);
    } finally {
        //cleanup temp dir
        if (tempDir != null && !FileUtils.deleteQuietly(tempDir)) {
            log.error("Cleanup failed for temp dir {}", tempDir.getAbsolutePath());
        }
    }
}
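
The reader.getDocCount(FieldNames.SUGGEST) > 0 guard ensures the suggester is only (re)built when at least one document in the index actually carries the suggest field, avoiding an unnecessary build over an empty dictionary.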

From source file:org.esa.beam.occci.LuceneQueryIndexMain.java

License:Open Source License

public static void main(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
    }
    File indexfile = new File(args[0]);
    File insituCSVtFile = new File(args[1]);
    if (!insituCSVtFile.exists()) {
        System.err.printf("insituList file '%s' does not exits%n", args[2]);
        printUsage();
    }
    int hours = 0;
    try {
        hours = Integer.parseInt(args[2]);
    } catch (NumberFormatException e) {
        e.printStackTrace();
        System.err.printf("cannot parse hours '%s' %n", args[3]);
        printUsage();
    }
    long maxTimeDifference = HOURS_IN_MILLIS * hours;

    final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS", ENGLISH);
    dateFormat.setCalendar(GregorianCalendar.getInstance(UTC, Locale.ENGLISH));

    List<SimpleRecord> insituRecords = ProductDBCheckerMain.readInsituRecords(insituCSVtFile);
    System.out.println("num insituRecords = " + insituRecords.size());

    Directory indexDirectory = FSDirectory.open(indexfile.toPath());
    IndexReader indexReader = DirectoryReader.open(indexDirectory);

    int numProductsInIndex = indexReader.getDocCount("name");
    System.out.println("numProductsInIndex = " + numProductsInIndex);

    IndexSearcher indexSearcher = new IndexSearcher(indexReader);

    DateRangePrefixTree dateRangePrefixTree = DateRangePrefixTree.INSTANCE;
    PrefixTreeStrategy strategy = new NumberRangePrefixTreeStrategy(dateRangePrefixTree, "productDateRange");

    SpatialOperation operation = SpatialOperation.Intersects;
    int hits = 0;
    long t1 = System.currentTimeMillis();
    Set<Integer> matches = new HashSet<>();

    Calendar calendar = dateRangePrefixTree.newCal();
    for (SimpleRecord insituRecord : insituRecords) {
        final long referenceTime = insituRecord.getTime();
        final long windowStartTime = referenceTime - maxTimeDifference;
        final long windowEndTime = referenceTime + maxTimeDifference;

        calendar.setTimeInMillis(windowStartTime);
        NumberRangePrefixTree.UnitNRShape leftShape = dateRangePrefixTree.toShape(calendar);
        calendar.setTimeInMillis(windowEndTime);
        NumberRangePrefixTree.UnitNRShape rightShape = dateRangePrefixTree.toShape(calendar);

        NumberRangePrefixTree.NRShape nrShape = dateRangePrefixTree.toRangeShape(leftShape, rightShape);
        SpatialArgs sargs = new SpatialArgs(operation, nrShape);
        Query query = strategy.makeQuery(sargs);

        TopDocs topDocs = indexSearcher.search(query, 1000);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (ScoreDoc scoreDoc : scoreDocs) {
            matches.add(scoreDoc.doc);
        }
        //                Document doc = indexSearcher.doc(docID);
        //                String productName = doc.get("name");
        //                matches.add(productName);
        //            }
        //            System.out.println("topDocs.totalHits = " + topDocs.totalHits);
        //            hits += topDocs.totalHits;
    }
    long t2 = System.currentTimeMillis();
    System.out.println("delta time test insitu = " + ((t2 - t1) / 1000f));

    System.out.println("hits = " + hits);
    System.out.println("matches = " + matches.size());

}
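
Here getDocCount("name") serves as a quick count of indexed products, which assumes every product document stores a "name" field. Also note that hits is printed but never incremented in this version; the accumulation (hits += topDocs.totalHits) is commented out, so only the matches set carries meaningful results.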