List of usage examples for org.apache.lucene.index IndexReader docFreq
public abstract int docFreq(Term term) throws IOException;
Parameter: term. The field/text pair to look up; docFreq returns the number of documents containing this term.
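Before the project examples below, here is a minimal, self-contained sketch of calling docFreq; the index path and the "contents" field name are placeholders, not taken from any of the sources that follow:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws Exception {
        // Open an existing index ("/path/to/index" is a placeholder).
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // docFreq returns the number of documents that contain the term
            // in the given field (a hypothetical "contents" field here).
            int df = reader.docFreq(new Term("contents", "lucene"));
            System.out.println("df(contents:lucene) = " + df);
        }
    }
}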
From source file:game.TermFreq.java
void loadTfVec() throws Exception {
    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);
    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;

    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null);
        // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // get the term frequency in the document
            int tf = docsEnum.freq();
            TermFreq tfq = new TermFreq(
                    new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf);
            tfvec.add(tfq);
            doclen += tf;
        }
    }

    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by doc length
        float idf = sumDf / reader.docFreq(tf.term);
        tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf));
    }
    Collections.sort(tfvec);
}
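Design note: the loop accumulates doclen from the term-vector frequencies and normalizes each tf by it; the final weight log(1 + LAMBDA/(1 - LAMBDA) * tf * idf) is a Jelinek-Mercer-style language-model term weight, with sumDf / docFreq standing in for the idf-like collection statistic. Note that sumDf / reader.docFreq(...) is integral division here, so the quotient is truncated before being assigned to float.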
From source file:indexer.Cell.java
boolean toSplit(IndexReader reader) throws Exception {
    Cell parentCell = getCellIdOfParentCell();
    int df = 0;
    int numDocs = 0;
    Term parentCellTerm = new Term(DocVector.FIELD_CELL_ID, parentCell.toString());
    Term thisCellTerm = new Term(DocVector.FIELD_CELL_ID, this.toString());

    // Find the number of cells in this strip, e.g.
    // a. if the current cell is 5_2,
    numDocs = parentCell.validCell() ? reader.docFreq(parentCellTerm) : reader.numDocs();
    df = reader.docFreq(thisCellTerm);
    int uniformCount = numDocs / DocVector.numIntervals;
    return df > uniformCount;
}
From source file:indexer.Retriever.java
private String getIDF(IndexReader reader, String word) throws IOException {
    ClassicSimilarity similarity = new ClassicSimilarity();
    int documentsFreq = 0;
    float idf = 0;

    Term term = new Term(documentField, word);
    int _documentsFreq = reader.docFreq(term);
    int documentsCount = reader.getDocCount(documentField);
    idf += similarity.idf(_documentsFreq, documentsCount);
    documentsFreq += _documentsFreq;

    String printString = word + ": " + idf + " (in " + documentsFreq + " documents)";
    return printString;
}
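For reference, ClassicSimilarity.idf(docFreq, docCount) in Lucene is computed as 1 + log((docCount + 1) / (docFreq + 1)), so the rarer the term (the smaller the value returned by reader.docFreq), the higher the idf.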
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
private void dumpPostings(IndexReader reader) throws IOException {
    // This is how you iterate through terms in the postings list.
    LeafReader leafReader = reader.leaves().get(0).reader();
    TermsEnum termsEnum = leafReader.terms("text").iterator();
    BytesRef bytesRef = termsEnum.next();
    while (bytesRef != null) {
        // This is the current term in the dictionary.
        String token = bytesRef.utf8ToString();
        Term term = new Term("text", token);
        System.out.print(token + " (df = " + reader.docFreq(term) + "):");

        PostingsEnum postingsEnum = leafReader.postings(term);
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            System.out.print(String.format(" (%s, %s)", postingsEnum.docID(), postingsEnum.freq()));
        }
        System.out.println("");

        bytesRef = termsEnum.next();
    }
}
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test
public void testReadingPostings() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test
public void testCloneIndex() throws Exception {
    System.out.println("Cloning index:");
    Directory dir1 = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir1);

    Directory dir2 = FSDirectory.open(tempDir2);
    IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir2, config);

    LeafReader leafReader = reader.leaves().get(0).reader();
    CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
    writer.addIndexes(new MyFilterCodecReader(codecReader));
    writer.commit();
    writer.forceMerge(1);
    writer.close();

    reader.close();

    // Open up the cloned index and verify it.
    reader = DirectoryReader.open(dir2);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * Calculate the scores (weights) of each term that occurred in the reranking pool.
 * The process:
 * 1. For each query term, calculate its score for each term in the reranking pool. The score
 *    is calculated as
 * <pre>
 * P(both occur)*log{P(both occur)/P(t1 occurs)/P(t2 occurs)}
 *  + P(both not occur)*log{P(both not occur)/P(t1 not occurs)/P(t2 not occurs)}
 *  + P(t1 occurs t2 not occurs)*log{P(t1 occurs t2 not occurs)/P(t1 occurs)/P(t2 not occurs)}
 *  + P(t1 not occurs t2 occurs)*log{P(t1 not occurs t2 occurs)/P(t1 not occurs)/P(t2 occurs)}
 * </pre>
 * 2. For each query term, the scores of every other term in the reranking pool are stored in a
 *    PriorityQueue; only the top {@code K} are kept.
 * 3. Add the scores of the same term together and pick the top {@code M} ones.
 *
 * @param termInvertedList a map from term to the set of docIds in which the term occurs
 * @param context an instance of RerankerContext
 * @return a map from the top terms to their weight scores
 */
private Map<String, Double> computeTermScore(Map<String, Set<Integer>> termInvertedList,
        RerankerContext<T> context) throws IOException {
    class ScoreComparator implements Comparator<Pair<String, Double>> {
        public int compare(Pair<String, Double> a, Pair<String, Double> b) {
            int cmp = Double.compare(b.getRight(), a.getRight());
            if (cmp == 0) {
                return a.getLeft().compareToIgnoreCase(b.getLeft());
            } else {
                return cmp;
            }
        }
    }

    // Get collection statistics so that we can compute idf later on.
    IndexReader reader;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
    } else {
        IndexSearcher searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    final long docCount = reader.numDocs() == -1 ? reader.maxDoc() : reader.numDocs();

    // Calculate the mutual information between each pool term and each query term.
    List<String> queryTerms = context.getQueryTokens();
    Map<String, Integer> queryTermsCounts = new HashMap<>();
    for (String qt : queryTerms) {
        queryTermsCounts.put(qt, queryTermsCounts.getOrDefault(qt, 0) + 1);
    }
    Set<Integer> allDocIds = new HashSet<>();
    for (Set<Integer> s : termInvertedList.values()) {
        allDocIds.addAll(s);
    }
    int docIdsCount = allDocIds.size();

    // Each priority queue corresponds to a query term: the p-queue itself stores all terms
    // in the reranking pool and their reranking scores to the query term.
    List<PriorityQueue<Pair<String, Double>>> allTermScoresPQ = new ArrayList<>();
    for (Map.Entry<String, Integer> q : queryTermsCounts.entrySet()) {
        String queryTerm = q.getKey();
        long df = reader.docFreq(new Term(LuceneDocumentGenerator.FIELD_BODY, queryTerm));
        if (df == 0L) {
            continue;
        }
        float idf = (float) Math.log((1 + docCount) / df);
        int qtf = q.getValue();
        if (termInvertedList.containsKey(queryTerm)) {
            PriorityQueue<Pair<String, Double>> termScorePQ = new PriorityQueue<>(new ScoreComparator());
            double selfMI = computeMutualInformation(termInvertedList.get(queryTerm),
                    termInvertedList.get(queryTerm), docIdsCount);
            for (Map.Entry<String, Set<Integer>> termEntry : termInvertedList.entrySet()) {
                double score;
                if (termEntry.getKey().equals(queryTerm)) {
                    // The mutual information to itself will always be 1
                    score = idf * qtf;
                } else {
                    double crossMI = computeMutualInformation(termInvertedList.get(queryTerm),
                            termEntry.getValue(), docIdsCount);
                    score = idf * beta * qtf * crossMI / selfMI;
                }
                termScorePQ.add(Pair.of(termEntry.getKey(), score));
            }
            allTermScoresPQ.add(termScorePQ);
        }
    }

    Map<String, Double> aggTermScores = new HashMap<>();
    for (PriorityQueue<Pair<String, Double>> termScores : allTermScoresPQ) {
        for (int i = 0; i < Math.min(termScores.size(), this.K); i++) {
            Pair<String, Double> termScore = termScores.poll();
            String term = termScore.getLeft();
            Double score = termScore.getRight();
            if (score - 0.0 > 1e-8) {
                aggTermScores.put(term, aggTermScores.getOrDefault(term, 0.0) + score);
            }
        }
    }

    PriorityQueue<Pair<String, Double>> termScoresPQ = new PriorityQueue<>(new ScoreComparator());
    for (Map.Entry<String, Double> termScore : aggTermScores.entrySet()) {
        termScoresPQ.add(Pair.of(termScore.getKey(), termScore.getValue() / queryTerms.size()));
    }
    Map<String, Double> resultTermScores = new HashMap<>();
    for (int i = 0; i < Math.min(termScoresPQ.size(), this.M); i++) {
        Pair<String, Double> termScore = termScoresPQ.poll();
        String term = termScore.getKey();
        double score = termScore.getValue();
        resultTermScores.put(term, score);
    }
    return resultTermScores;
}
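The computeMutualInformation helper called above is not part of this snippet. Below is a hypothetical sketch, inferred from the call sites and the four-part formula in the Javadoc; the probabilities are estimated from the two terms' posting sets over the reranking pool (assumes java.util.Set and java.util.HashSet):

private double computeMutualInformation(Set<Integer> docs1, Set<Integer> docs2, int totalDocs) {
    // Intersection: documents in which both terms occur.
    Set<Integer> both = new HashSet<>(docs1);
    both.retainAll(docs2);

    double n = totalDocs;
    double p1 = docs1.size() / n;        // P(t1 occurs)
    double p2 = docs2.size() / n;        // P(t2 occurs)
    double p11 = both.size() / n;        // P(both occur)
    double p10 = p1 - p11;               // P(t1 occurs, t2 does not)
    double p01 = p2 - p11;               // P(t1 does not, t2 occurs)
    double p00 = 1.0 - p11 - p10 - p01;  // P(neither occurs)

    // Sum only the terms whose joint probability is positive (0 * log 0 := 0).
    double mi = 0.0;
    if (p11 > 0) mi += p11 * Math.log(p11 / (p1 * p2));
    if (p10 > 0 && p2 < 1) mi += p10 * Math.log(p10 / (p1 * (1 - p2)));
    if (p01 > 0 && p1 < 1) mi += p01 * Math.log(p01 / ((1 - p1) * p2));
    if (p00 > 0 && p1 < 1 && p2 < 1) mi += p00 * Math.log(p00 / ((1 - p1) * (1 - p2)));
    return mi;
}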
From source file:io.anserini.rerank.lib.Rm3Reranker.java
License:Apache License
private FeatureVector createdFeatureVector(Terms terms, IndexReader reader, boolean tweetsearch) {
    FeatureVector f = new FeatureVector();
    try {
        int numDocs = reader.numDocs();
        TermsEnum termsEnum = terms.iterator();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();

            if (term.length() < 2 || term.length() > 20)
                continue;
            if (!term.matches("[a-z0-9]+"))
                continue;

            // This seemingly arbitrary logic needs some explanation. See following PR for details:
            //   https://github.com/castorini/Anserini/pull/289
            //
            // We have long known that stopwords have a big impact in RM3. If we include stopwords
            // in feedback, effectiveness is affected negatively. In the previous implementation, we
            // built custom stopwords lists by selecting top k terms from the collection. We only
            // had two stopwords lists, for gov2 and for Twitter. The gov2 list is used on all
            // collections other than Twitter.
            //
            // The logic below instead uses a df threshold: If a term appears in more than n percent
            // of the documents, then it is discarded as a feedback term. This heuristic has the
            // advantage of getting rid of collection-specific stopwords lists, but at the cost of
            // introducing an additional tuning parameter.
            //
            // Cognizant of the dangers of (essentially) tuning on test data, here's what I
            // (@lintool) did:
            //
            // + For newswire collections, I picked a number, 10%, that seemed right. This value
            //   actually increased effectiveness in most conditions across all newswire collections.
            //
            // + This 10% value worked fine on web collections; effectiveness didn't change much.
            //
            // Since this was the first and only heuristic value I selected, we're not really tuning
            // parameters.
            //
            // The 10% threshold, however, doesn't work well on tweets because tweets are much
            // shorter. Based on a list of terms in the collection by df: For the Tweets2011
            // collection, I found a threshold close to a nice round number that approximated the
            // length of the current stopwords list, by eyeballing the df values. This turned out
            // to be 1%. I did this again for the Tweets2013 collection, using the same approach,
            // and obtained a value of 0.7%.
            //
            // With both values, we obtained effectiveness pretty close to the old values with the
            // custom stopwords list.
            int df = reader.docFreq(new Term(FIELD_BODY, term));
            float ratio = (float) df / numDocs;
            if (tweetsearch) {
                if (numDocs > 100000000) { // Probably Tweets2013
                    if (ratio > 0.007f)
                        continue;
                } else {
                    if (ratio > 0.01f)
                        continue;
                }
            } else if (ratio > 0.1f)
                continue;

            int freq = (int) termsEnum.totalTermFreq();
            f.addFeatureWeight(term, (float) freq);
        }
    } catch (Exception e) {
        e.printStackTrace();
        // Return empty feature vector
        return f;
    }

    return f;
}
From source file:io.anserini.util.ExtractTopDfTerms.java
License:Apache License
public static void main(String[] args) throws Exception {
    Args myArgs = new Args();
    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: ExtractTopDfTerms" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    Directory dir = FSDirectory.open(Paths.get(myArgs.index));
    IndexReader reader = DirectoryReader.open(dir);
    int numDocs = reader.numDocs();

    Comparator<Pair> comp = new Comparator<Pair>() {
        @Override
        public int compare(Pair p1, Pair p2) {
            if (p1.value == p2.value) {
                return p1.key.compareTo(p2.key);
            } else {
                return (p1.value < p2.value) ? -1 : 1;
            }
        }
    };

    PriorityQueue<Pair> queue = new PriorityQueue<Pair>(myArgs.topK, comp);

    LOG.info("Starting to iterate through all terms...");
    Terms terms = MultiFields.getFields(reader).terms(myArgs.field);
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int cnt = 0;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        if (term.length() == 0)
            continue;

        Pair p = new Pair(term, reader.docFreq(new Term(myArgs.field, term)));
        if (queue.size() < myArgs.topK) {
            queue.add(p);
        } else {
            if (comp.compare(p, queue.peek()) > 0) {
                queue.poll();
                queue.add(p);
            }
        }

        cnt++;
        if (cnt % 1000000 == 0) {
            LOG.info("At term " + term);
        }
    }

    PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)));
    Pair pair;
    while ((pair = queue.poll()) != null) {
        out.println(pair.key + "\t" + pair.value + "\t" + numDocs + "\t" + ((float) pair.value / numDocs));
    }
    out.close();

    LOG.info("Done!");
}
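Design note: the comparator orders pairs by ascending df (ties broken by term), so the PriorityQueue acts as a bounded min-heap. Once it holds topK entries, a newly seen term evicts the current minimum only if its df is larger, which keeps memory at O(topK) while the loop scans the entire term dictionary once.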
From source file:IR.LuceneModel.java
public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    LuceneModel indexer = null;
    try {
        indexLocation = s;
        indexer = new LuceneModel(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from user until he enters q for quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }

            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call the
    // closeIndex, otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    IndexSearcher searcher = new IndexSearcher(reader);
    TopScoreDocCollector collector; // = TopScoreDocCollector.create(100, true);

    s = "";
    ScoreDoc[] hits;
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }

            File queryFile = new File(s);
            BufferedReader r = new BufferedReader(new FileReader(queryFile));
            String query; // = r.readLine();
            int count = 0;
            String q1 = "LuceneResults.txt";
            File luceneFile = new File(q1);
            luceneFile.createNewFile();
            FileWriter writer = new FileWriter(luceneFile);

            while ((query = r.readLine()) != null) {
                try {
                    count++;
                    collector = TopScoreDocCollector.create(100, true);
                    QueryParser parser = new QueryParser(Version.LUCENE_47, "contents", analyzer);
                    Query q = parser.parse(query.replace('/', ' '));
                    searcher.search(q, collector);
                    hits = collector.topDocs().scoreDocs;

                    int query_id;
                    query_id = count; // change this for new query
                    System.out.println("Found " + hits.length + " hits.");
                    for (int i = 0; i < hits.length; ++i) {
                        int docId = hits[i].doc;
                        Document d = searcher.doc(docId);
                        System.out.println(query_id + ". " + d.get("path").replaceAll(".html", "") + " "
                                + (i + 1) + " " + hits[i].score + " LuceneModel");
                        writer.write(String.format(query_id + " " + "Q0" + " "
                                + d.get("path").replaceAll(".html", "") + " " + (i + 1) + " "
                                + hits[i].score + " LuceneModel\n"));
                        writer.flush();
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                    continue;
                }

                // 5. term stats --> watch out for which "version" of the term
                // must be checked here instead!
                Term termInstance = new Term("contents", s);
                long termFreq = reader.totalTermFreq(termInstance);
                long docCount = reader.docFreq(termInstance);
                System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
            }
            r.close();
            writer.close();
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }
    }
}
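Note the distinction in the term-stats step above: reader.totalTermFreq(term) counts every occurrence of the term across all documents, while reader.docFreq(term) counts only the number of documents that contain it at least once.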