Example usage for org.apache.lucene.index IndexReader docFreq

Introduction

This page shows example usages of org.apache.lucene.index.IndexReader.docFreq, drawn from the source files listed below.

Prototype

public abstract int docFreq(Term term) throws IOException;

Document

Returns the number of documents containing the term.
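
For a quick orientation before the longer excerpts below, here is a minimal, self-contained sketch of the call itself. The index path, field name, and term are placeholders, and the FSDirectory.open(Path) signature of Lucene 5+ is assumed.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws Exception {
        // Open an existing index (placeholder path) and ask how many documents
        // contain the term "lucene" in the field "contents".
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            int df = reader.docFreq(new Term("contents", "lucene"));
            System.out.println("docFreq = " + df);
        }
    }
}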

Usage

From source file:game.TermFreq.java

void loadTfVec() throws Exception {

    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);

    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;

    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            //get the term frequency in the document
            int tf = docsEnum.freq();
            TermFreq tfq = new TermFreq(new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf);
            tfvec.add(tfq);

            doclen += tf;
        }
    }

    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by len
        float idf = (float) sumDf / reader.docFreq(tf.term); // cast avoids integer division
        tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf));
    }

    Collections.sort(tfvec);
}

From source file:indexer.Cell.java

boolean toSplit(IndexReader reader) throws Exception {
    Cell parentCell = getCellIdOfParentCell();
    int df = 0;
    int numDocs = 0;

    Term parentCellTerm = new Term(DocVector.FIELD_CELL_ID, parentCell.toString());
    Term thisCellTerm = new Term(DocVector.FIELD_CELL_ID, this.toString());

    // Number of documents in the parent cell (the enclosing strip); if there is
    // no valid parent cell, fall back to the total number of documents in the index.
    numDocs = parentCell.validCell() ? reader.docFreq(parentCellTerm) : reader.numDocs();
    df = reader.docFreq(thisCellTerm);

    int uniformCount = numDocs / DocVector.numIntervals;
    return df > uniformCount;
}

From source file:indexer.Retriever.java

private String getIDF(IndexReader reader, String word) throws IOException {
    ClassicSimilarity similarity = new ClassicSimilarity();
    int documentsFreq = 0;
    float idf = 0;

    Term term = new Term(documentField, word);
    int _documentsFreq = reader.docFreq(term);
    int documentsCount = reader.getDocCount(documentField);
    idf += similarity.idf(_documentsFreq, documentsCount);
    documentsFreq += _documentsFreq;

    String printString = word + ": " + idf + " (in " + documentsFreq + " documents)";
    return printString;
}
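
For context on what getIDF prints: ClassicSimilarity's idf(docFreq, docCount) is a log-damped inverse document frequency, documented in recent Lucene releases as roughly 1 + log((docCount + 1) / (docFreq + 1)); the exact formula has varied across versions. A hedged sketch that computes both values side by side (field name, term, and index path are placeholders):

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.FSDirectory;

public class IdfCheck {
    public static void main(String[] args) throws Exception {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            ClassicSimilarity similarity = new ClassicSimilarity();
            int docFreq = reader.docFreq(new Term("contents", "lucene"));
            int docCount = reader.getDocCount("contents");
            // Value from Lucene, and the textbook formula it should roughly match.
            float idf = similarity.idf(docFreq, docCount);
            double byHand = Math.log((docCount + 1) / (double) (docFreq + 1)) + 1.0;
            System.out.println(idf + " vs. " + byHand);
        }
    }
}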

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

private void dumpPostings(IndexReader reader) throws IOException {
    // This is how you iterate through terms in the postings list.
    LeafReader leafReader = reader.leaves().get(0).reader();
    TermsEnum termsEnum = leafReader.terms("text").iterator();
    BytesRef bytesRef = termsEnum.next();
    while (bytesRef != null) {
        // This is the current term in the dictionary.
        String token = bytesRef.utf8ToString();
        Term term = new Term("text", token);
        System.out.print(token + " (df = " + reader.docFreq(term) + "):");

        PostingsEnum postingsEnum = leafReader.postings(term);
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            System.out.print(String.format(" (%s, %s)", postingsEnum.docID(), postingsEnum.freq()));
        }
        System.out.println("");

        bytesRef = termsEnum.next();
    }
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

@Test
public void testReadingPostings() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

@Test
public void testCloneIndex() throws Exception {
    System.out.println("Cloning index:");
    Directory dir1 = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir1);

    Directory dir2 = FSDirectory.open(tempDir2);
    IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir2, config);

    LeafReader leafReader = reader.leaves().get(0).reader();
    CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
    writer.addIndexes(new MyFilterCodecReader(codecReader));
    writer.commit();
    writer.forceMerge(1);
    writer.close();

    reader.close();

    // Open up the cloned index and verify it.
    reader = DirectoryReader.open(dir2);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}

From source file:io.anserini.rerank.lib.AxiomReranker.java

License:Apache License

/**
 * Calculate the scores (weights) of each term that occurred in the reranking pool.
 * The Process:
 * 1. For each query term, calculate its score for each term in the reranking pool. The score
 * is calculated as
 * <pre>
 * P(both occurs)*log{P(both occurs)/P(t1 occurs)/P(t2 occurs)}
 * + P(both not occurs)*log{P(both not occurs)/P(t1 not occurs)/P(t2 not occurs)}
 * + P(t1 occurs t2 not occurs)*log{P(t1 occurs t2 not occurs)/P(t1 occurs)/P(t2 not occurs)}
 * + P(t1 not occurs t2 occurs)*log{P(t1 not occurs t2 occurs)/P(t1 not occurs)/P(t2 occurs)}
 * </pre>
 * 2. For each query term the scores of every other term in the reranking pool are stored in a
 * PriorityQueue, only the top {@code K} are kept.
 * 3. Add the scores of the same term together and pick the top {@code M} ones.
 *
 * @param termInvertedList A Map of <term -> Set<docId>> where the Set of docIds is where the term occurs
 * @param context An instance of RerankerContext
 * @return Map<String, Double> Top terms and their weight scores in a HashMap
 */
private Map<String, Double> computeTermScore(Map<String, Set<Integer>> termInvertedList,
        RerankerContext<T> context) throws IOException {
    class ScoreComparator implements Comparator<Pair<String, Double>> {
        public int compare(Pair<String, Double> a, Pair<String, Double> b) {
            int cmp = Double.compare(b.getRight(), a.getRight());
            if (cmp == 0) {
                return a.getLeft().compareToIgnoreCase(b.getLeft());
            } else {
                return cmp;
            }
        }
    }

    // get collection statistics so that we can get idf later on.
    IndexReader reader;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
    } else {
        IndexSearcher searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    final long docCount = reader.numDocs() == -1 ? reader.maxDoc() : reader.numDocs();

    //calculate the Mutual Information between term with each query term
    List<String> queryTerms = context.getQueryTokens();
    Map<String, Integer> queryTermsCounts = new HashMap<>();
    for (String qt : queryTerms) {
        queryTermsCounts.put(qt, queryTermsCounts.getOrDefault(qt, 0) + 1);
    }

    Set<Integer> allDocIds = new HashSet<>();
    for (Set<Integer> s : termInvertedList.values()) {
        allDocIds.addAll(s);
    }
    int docIdsCount = allDocIds.size();

    // Each priority queue corresponds to a query term: The p-queue itself stores all terms
    // in the reranking pool and their reranking scores to the query term.
    List<PriorityQueue<Pair<String, Double>>> allTermScoresPQ = new ArrayList<>();
    for (Map.Entry<String, Integer> q : queryTermsCounts.entrySet()) {
        String queryTerm = q.getKey();
        long df = reader.docFreq(new Term(LuceneDocumentGenerator.FIELD_BODY, queryTerm));
        if (df == 0L) {
            continue;
        }
        float idf = (float) Math.log((1 + docCount) / df);
        int qtf = q.getValue();
        if (termInvertedList.containsKey(queryTerm)) {
            PriorityQueue<Pair<String, Double>> termScorePQ = new PriorityQueue<>(new ScoreComparator());
            double selfMI = computeMutualInformation(termInvertedList.get(queryTerm),
                    termInvertedList.get(queryTerm), docIdsCount);
            for (Map.Entry<String, Set<Integer>> termEntry : termInvertedList.entrySet()) {
                double score;
                if (termEntry.getKey().equals(queryTerm)) { // The mutual information to itself will always be 1
                    score = idf * qtf;
                } else {
                    double crossMI = computeMutualInformation(termInvertedList.get(queryTerm),
                            termEntry.getValue(), docIdsCount);
                    score = idf * beta * qtf * crossMI / selfMI;
                }
                termScorePQ.add(Pair.of(termEntry.getKey(), score));
            }
            allTermScoresPQ.add(termScorePQ);
        }
    }

    Map<String, Double> aggTermScores = new HashMap<>();
    for (PriorityQueue<Pair<String, Double>> termScores : allTermScoresPQ) {
        for (int i = 0; i < Math.min(termScores.size(), this.K); i++) {
            Pair<String, Double> termScore = termScores.poll();
            String term = termScore.getLeft();
            Double score = termScore.getRight();
            if (score - 0.0 > 1e-8) {
                aggTermScores.put(term, aggTermScores.getOrDefault(term, 0.0) + score);
            }
        }
    }
    PriorityQueue<Pair<String, Double>> termScoresPQ = new PriorityQueue<>(new ScoreComparator());
    for (Map.Entry<String, Double> termScore : aggTermScores.entrySet()) {
        termScoresPQ.add(Pair.of(termScore.getKey(), termScore.getValue() / queryTerms.size()));
    }
    Map<String, Double> resultTermScores = new HashMap<>();
    for (int i = 0; i < Math.min(termScoresPQ.size(), this.M); i++) {
        Pair<String, Double> termScore = termScoresPQ.poll();
        String term = termScore.getKey();
        double score = termScore.getValue();
        resultTermScores.put(term, score);
    }

    return resultTermScores;
}
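
The excerpt above relies on computeMutualInformation, which is not part of this snippet. The following is only a sketch consistent with the four-term formula in the javadoc; the class name, zero-count handling, and probability estimates are assumptions, not the AxiomReranker source.

import java.util.Set;

class MutualInformationSketch {
    // docsT1/docsT2: docIds in the reranking pool containing each term; poolSize: pool size N.
    static double compute(Set<Integer> docsT1, Set<Integer> docsT2, int poolSize) {
        long n11 = docsT1.stream().filter(docsT2::contains).count(); // both terms occur
        long n10 = docsT1.size() - n11;                              // only t1 occurs
        long n01 = docsT2.size() - n11;                              // only t2 occurs
        long n00 = poolSize - n11 - n10 - n01;                       // neither occurs
        double p1 = (double) docsT1.size() / poolSize;               // P(t1 occurs)
        double p2 = (double) docsT2.size() / poolSize;               // P(t2 occurs)
        return cell(n11, poolSize, p1, p2)          // P(both) * log(P(both) / P(t1) / P(t2))
                + cell(n00, poolSize, 1 - p1, 1 - p2)
                + cell(n10, poolSize, p1, 1 - p2)
                + cell(n01, poolSize, 1 - p1, p2);
    }

    private static double cell(long count, int poolSize, double pA, double pB) {
        if (count == 0 || pA == 0 || pB == 0) {
            return 0.0; // skip degenerate cells; how the real implementation handles zeros is unknown
        }
        double pJoint = (double) count / poolSize;
        return pJoint * Math.log(pJoint / (pA * pB));
    }
}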

From source file:io.anserini.rerank.lib.Rm3Reranker.java

License:Apache License

private FeatureVector createdFeatureVector(Terms terms, IndexReader reader, boolean tweetsearch) {
    FeatureVector f = new FeatureVector();

    try {
        int numDocs = reader.numDocs();
        TermsEnum termsEnum = terms.iterator();

        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();

            if (term.length() < 2 || term.length() > 20)
                continue;
            if (!term.matches("[a-z0-9]+"))
                continue;

            // This seemingly arbitrary logic needs some explanation. See following PR for details:
            //   https://github.com/castorini/Anserini/pull/289
            //
            // We have long known that stopwords have a big impact in RM3. If we include stopwords
            // in feedback, effectiveness is affected negatively. In the previous implementation, we
            // built custom stopwords lists by selecting top k terms from the collection. We only
            // had two stopwords lists, for gov2 and for Twitter. The gov2 list is used on all
            // collections other than Twitter.
            //
            // The logic below instead uses a df threshold: If a term appears in more than n percent
            // of the documents, then it is discarded as a feedback term. This heuristic has the
            // advantage of getting rid of collection-specific stopwords lists, but at the cost of
            // introducing an additional tuning parameter.
            //
            // Cognizant of the dangers of (essentially) tuning on test data, here's what I
            // (@lintool) did:
            //
            // + For newswire collections, I picked a number, 10%, that seemed right. This value
            //   actually increased effectiveness in most conditions across all newswire collections.
            //
            // + This 10% value worked fine on web collections; effectiveness didn't change much.
            //
            // Since this was the first and only heuristic value I selected, we're not really tuning
            // parameters.
            //
            // The 10% threshold, however, doesn't work well on tweets because tweets are much
            // shorter. Based on a list of terms in the collection sorted by df: For the Tweets2011 collection,
            // I found a threshold close to a nice round number that approximated the length of the
            // current stopwords list, by eyeballing the df values. This turned out to be 1%. I did
            // this again for the Tweets2013 collection, using the same approach, and obtained a value
            // of 0.7%.
            //
            // With both values, we obtained effectiveness pretty close to the old values with the
            // custom stopwords list.
            int df = reader.docFreq(new Term(FIELD_BODY, term));
            float ratio = (float) df / numDocs;
            if (tweetsearch) {
                if (numDocs > 100000000) { // Probably Tweets2013
                    if (ratio > 0.007f)
                        continue;
                } else {
                    if (ratio > 0.01f)
                        continue;
                }
            } else if (ratio > 0.1f)
                continue;

            int freq = (int) termsEnum.totalTermFreq();
            f.addFeatureWeight(term, (float) freq);
        }
    } catch (Exception e) {
        e.printStackTrace();
        // Return empty feature vector
        return f;
    }

    return f;
}

From source file:io.anserini.util.ExtractTopDfTerms.java

License:Apache License

public static void main(String[] args) throws Exception {
    Args myArgs = new Args();
    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: ExtractTopDfTerms" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    Directory dir = FSDirectory.open(Paths.get(myArgs.index));
    IndexReader reader = DirectoryReader.open(dir);
    int numDocs = reader.numDocs();

    Comparator<Pair> comp = new Comparator<Pair>() {
        @Override
        public int compare(Pair p1, Pair p2) {
            if (p1.value == p2.value) {
                return p1.key.compareTo(p2.key);
            } else
                return (p1.value < p2.value) ? -1 : 1;
        }
    };

    PriorityQueue<Pair> queue = new PriorityQueue<Pair>(myArgs.topK, comp);

    LOG.info("Starting to iterate through all terms...");
    Terms terms = MultiFields.getFields(reader).terms(myArgs.field);
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int cnt = 0;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        if (term.length() == 0)
            continue;

        Pair p = new Pair(term, reader.docFreq(new Term(myArgs.field, term)));
        if (queue.size() < myArgs.topK) {
            queue.add(p);
        } else {
            if (comp.compare(p, queue.peek()) > 0) {
                queue.poll();
                queue.add(p);
            }
        }

        cnt++;
        if (cnt % 1000000 == 0) {
            LOG.info("At term " + term);
        }
    }

    PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)));
    Pair pair;
    while ((pair = queue.poll()) != null) {
        out.println(pair.key + "\t" + pair.value + "\t" + numDocs + "\t" + ((float) pair.value / numDocs));
    }
    out.close();

    LOG.info("Done!");
}

From source file:IR.LuceneModel.java

public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    LuceneModel indexer = null;
    try {
        indexLocation = s;
        indexer = new LuceneModel(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from the user until they enter q to quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }

            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call
    // closeIndex, otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    IndexSearcher searcher = new IndexSearcher(reader);
    TopScoreDocCollector collector;//= TopScoreDocCollector.create(100, true);
    s = "";
    ScoreDoc[] hits;
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }

            File queryFile = new File(s);
            BufferedReader r = new BufferedReader(new FileReader(queryFile));

            String query;//= r.readLine();
            int count = 0;
            String q1 = "LuceneResults.txt";

            File luceneFile = new File(q1);
            luceneFile.createNewFile();
            FileWriter writer = new FileWriter(luceneFile);

            while ((query = r.readLine()) != null) {
                try {
                    count++;
                    collector = TopScoreDocCollector.create(100, true);
                    QueryParser parser = new QueryParser(Version.LUCENE_47, "contents", analyzer);
                    Query q = parser.parse(query.replace('/', ' '));
                    searcher.search(q, collector);

                    hits = collector.topDocs().scoreDocs;

                    int query_id;
                    query_id = count; // change this for new query 

                    System.out.println("Found " + hits.length + " hits.");

                    for (int i = 0; i < hits.length; ++i) {
                        int docId = hits[i].doc;
                        Document d = searcher.doc(docId);
                        System.out.println(query_id + ". " + d.get("path").replaceAll(".html", "") + " "
                                + (i + 1) + " " + hits[i].score + " LuceneModel");
                        writer.write(String
                                .format(query_id + " " + "Q0" + " " + d.get("path").replaceAll(".html", "")
                                        + " " + (i + 1) + " " + hits[i].score + " LuceneModel\n"));
                        writer.flush();
                        //                    System.out.println(fmt.format(""+query_id,"Q0",""+d.get("path"),""+(i + 1),""+hits[i].score));

                    }

                } catch (Exception e) {
                    //            System.out.println(e.printStackTrace());
                    e.printStackTrace();
                    continue;
                }
                // 5. term stats --> watch out for which "version" of the term
                // must be checked here instead!
                Term termInstance = new Term("contents", s);
                long termFreq = reader.totalTermFreq(termInstance);
                long docCount = reader.docFreq(termInstance);
                System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
                //              r.close();
            }
            r.close();
            writer.close();
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }

    }

}