List of usage examples for org.apache.lucene.index.IndexReader.docFreq

public abstract int docFreq(Term term) throws IOException;

Returns the number of documents containing the given term.

Parameter: term - the term whose document frequency is returned
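A minimal sketch of the call itself before the project examples below (it assumes a Lucene 4.x index already exists on disk; the index path, the field name "contents", and the term text "lucene" are placeholders, not values taken from any of the projects):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws IOException {
        // Placeholder index location -- point this at a real index directory.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/tmp/index")));
        try {
            // Placeholder field and term text; docFreq returns how many documents contain the term.
            Term term = new Term("contents", "lucene");
            int df = reader.docFreq(term);
            System.out.println("document frequency of " + term + " = " + df);
        } finally {
            reader.close();
        }
    }
}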
From source file:it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java
License:Apache License
/**
 * Builds the TFIDF vector and its norm2
 *
 * @param tfidf
 *            - the vector containing for each term its TFIDF score, it will
 *            be populated by this method
 * @param freq
 *            - the vector containing for each term its frequency
 * @param field
 *            - the field on which to compute the inverse document frequency
 *
 * @return the norm of the TFIDF vector
 */
private double tfidfVector(Map<String, Double> tfidf, Map<String, Integer> freq, String field) {
    IndexReader reader = getReader();
    double norm = 0;
    for (Map.Entry<String, Integer> entry : freq.entrySet()) {
        Term t = new Term(field, entry.getKey());
        int df = 0;
        try {
            df = reader.docFreq(t);
        } catch (IOException e) {
            logger.error("computing tfidfVector ({}) ", e.toString());
            System.exit(-1);
        }
        double idf = Math.log(collectionSize / (double) df + 1) / Math.log(2) + 1;
        double tfidfValue = entry.getValue() * idf;
        norm += tfidfValue * tfidfValue;
        tfidf.put(entry.getKey(), tfidfValue);
    }
    return Math.sqrt(norm);
}
From source file:it.unibz.instasearch.indexing.SearchResultDoc.java
License:Open Source License
private float[] createTermScoreVector(TermFreqVector vect, IndexReader reader) throws IOException {
    if (vect == null)
        return new float[0];

    int[] termFrequencies = vect.getTermFrequencies();
    String[] terms = vect.getTerms();
    float[] scores = new float[terms.length];

    int numDocs = reader.maxDoc();
    Similarity sim = Searcher.SIMILARITY;

    for (int i = 0; i < terms.length; i++) {
        String termText = terms[i];
        Term term = new Term(Field.CONTENTS.toString(), termText);

        float termFreq = sim.tf(termFrequencies[i]);
        int docFreq = reader.docFreq(term);
        float idf = sim.idf(docFreq, numDocs);
        float tfIdf = termFreq * idf;

        scores[i] = tfIdf;
    }
    return scores;
}
From source file:it.unipd.dei.ims.falcon.indexing.Indexing.java
License:Apache License
/**
 * Indexes all the songs in the specified path.
 * The index is created in the specified directory "indexPath". If an index
 * already exists in that path, the songs are added to the existing index.
 * Each song is processed by the method
 * {@link it.unipd.dei.ims.falcon.indexing.Indexing#indexSong},
 * which maps the song into a set of segments, each of which is mapped to a
 * Lucene {@link org.apache.lucene.document.Document}.
 * The segments have a fixed length: each consists of "hashPerSegment" hashes,
 * and two consecutive segments may overlap by "hashInOverlap" hashes. The
 * number of hashes in the overlap must be smaller than the number of hashes
 * per segment, otherwise an
 * {@link it.unipd.dei.ims.falcon.indexing.IndexingException} is thrown.
 * <p>
 * Once the index has been created or updated, this method writes a map into a
 * file. The map associates a set of features to each hash, based on occurrence
 * statistics of the hash in the entire collection. In the event of an index
 * update the map is re-built and the map file is over-written.
 *
 * @param data Input file. If it is a directory, index all files inside it.
 * @param index Falcon index.
 * @param hashPerSegment Number of hashes per segment.
 * @param hashInOverlap Number of overlapping hashes per segment.
 * @throws IndexingException
 */
public static void index(File data, File index, final int hashPerSegment, final int hashInOverlap,
        final int subsampling, final int nranks, final double minkurtosis,
        final TranspositionEstimator transpEst, boolean verbose) throws IndexingException, IOException {
    long start_time = System.currentTimeMillis();
    if (hashPerSegment <= hashInOverlap)
        throw new IndexingException(
                "Number of hashes in the overlap cannot be equal to the number of hash per segment");
    if (!data.canRead())
        throw new IOException("cannot read input path");
    if (data.isDirectory()) {
        for (File f : data.listFiles())
            if (!f.canRead())
                throw new IOException("cannot read one or more input files");
    }
    if (!index.exists()) // if index is being created rather than updated
        index.mkdir();
    if (!index.canWrite())
        throw new IOException("cannot write to index directory");
    SimpleFSDirectory indexDir = new SimpleFSDirectory(index, null);
    // initialize Lucene Analyzer and IndexWriter
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    final IndexWriter writer = new IndexWriter(indexDir, analyzer, !IndexReader.indexExists(indexDir),
            IndexWriter.MaxFieldLength.UNLIMITED);
    writer.setSimilarity(new HashSimilarity());
    // transform chroma data into hashes and write into index
    File[] inputfiles = data.isDirectory() ? data.listFiles() : new File[] { data };
    int fileNo = 0;
    for (final File file : inputfiles) {
        // if the currently considered file exists and is not hidden
        if (file.exists() && !file.getName().startsWith(".")) {
            if (verbose)
                System.out.println(String.format("%10.3f%% - indexing %s",
                        fileNo * 100. / inputfiles.length, file.getAbsolutePath()));
            final List<OutputStream> fout = new LinkedList<OutputStream>();
            fout.add(new PipedOutputStream());
            final PipedInputStream fin = new PipedInputStream((PipedOutputStream) fout.get(0));
            Thread t = new Thread(new Runnable() {
                public void run() {
                    try {
                        ChromaMatrixUtils.convertChromaStreamIntoHashesStream(new FileReader(file), fout,
                                nranks, transpEst, minkurtosis, subsampling);
                    } catch (IOException ex) {
                        // TODO do something better for this exception ... (might hang all ...)
                        Logger.getLogger(Indexing.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            });
            t.start();
            indexSong(writer, fin, hashPerSegment, hashInOverlap, file.getAbsolutePath(),
                    file.getAbsolutePath());
            fileNo++;
        }
    }
    writer.optimize();
    writer.close();
    // additional falcon features
    PrintWriter pw = new PrintWriter(index.getAbsolutePath() + "/qpruning_features.map");
    IndexReader reader = IndexReader.open(new SimpleFSDirectory(index));
    int numSegments = reader.numDocs();
    long total_hcf = numSegments * hashPerSegment; // total number of hashes in the collection
    TermEnum hashes = reader.terms(); // distinct hashes in the collection
    while (hashes.next()) {
        if (!hashes.term().field().equals("CONTENT")) {
            continue;
        }
        Term curHash = hashes.term();
        pw.print(curHash.text() + "\t");
        pw.print((double) reader.docFreq(curHash) / numSegments + "\t"); // normalized document frequency
        TermDocs curHash_pl = reader.termDocs(curHash); // posting list for the current hash
        // computation of the frequency of the current hash in the
        // entire collection -- value initialization
        long hcf = 0;
        // initializes the normalized maximum frequency value
        double nmf = 0;
        // initializes the normalized frequency for max computation
        double cur_nf = 0;
        // processes posting list entries
        while (curHash_pl.next()) {
            // computation of the normalized frequency for the current hash
            cur_nf = (double) curHash_pl.freq() / hashPerSegment;
            // update max if necessary
            if (cur_nf > nmf) {
                nmf = cur_nf;
            }
            hcf += curHash_pl.freq();
        }
        // prints normalized total collection frequency and
        // normalized maximum frequency for the current hash
        pw.print((double) hcf / total_hcf + "\t" + nmf + "\n");
    }
    pw.flush();
    pw.close();
    long end_time = System.currentTimeMillis();
    if (verbose)
        System.out.println(
                String.format("[INDEXING] - elapsed time: %10.3f", (end_time - start_time) / 1000.));
}
From source file:lucene.searchengine.LuceneSearchEngine.java
public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    LuceneSearchEngine indexer = null;
    try {
        indexLocation = s;
        indexer = new LuceneSearchEngine(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from user until he enters q for quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call the
    // closeIndex, otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));

    //===========================================================
    // GET Term frequency
    //===========================================================
    // Creating an output file to store the term,term_frequency pairs.
    PrintWriter tfwriter = new PrintWriter("..\\term-frequency.csv");
    Fields fields = MultiFields.getFields(reader);
    HashMap<String, Long> tfmap = new HashMap<String, Long>();
    Terms terms = fields.terms("contents");
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef bref = null;
    while ((bref = termsEnum.next()) != null) {
        String term_name = new String(bref.bytes, bref.offset, bref.length);
        Term term_instance = new Term("contents", term_name);
        long termFrequency = reader.totalTermFreq(term_instance);
        tfmap.put(term_name, termFrequency);
    }
    System.out.println(tfmap.size());
    for (String key : tfmap.keySet()) {
        tfwriter.write(key + "," + tfmap.get(key));
        tfwriter.write("\n");
    }
    tfwriter.close();
    //====================================================================
    // Code END to fetch term frequency
    //====================================================================
    IndexSearcher searcher = new IndexSearcher(reader);
    s = "";
    while (!s.equalsIgnoreCase("q")) {
        TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            Query q = new QueryParser(Version.LUCENE_47, "contents", sAnalyzer).parse(s);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. display results
            System.out.println("Found " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("filename") + " score=" + hits[i].score);
            }
            // 5. term stats --> watch out for which "version" of the term
            // must be checked here instead!
            Term termInstance = new Term("contents", s);
            long termFreq = reader.totalTermFreq(termInstance);
            long docCount = reader.docFreq(termInstance);
            System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }
    }
}
From source file:lucenesearche.HW3.java
public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    HW3 indexer = null;
    try {
        indexLocation = s;
        indexer = new HW3(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from user until he enters q for quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call the
    // closeIndex, otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    IndexSearcher searcher = new IndexSearcher(reader);
    TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
    Formatter f = new Formatter();
    s = "";
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            Query q = new QueryParser(Version.LUCENE_47, "contents", sAnalyzer).parse(s);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. display results
            String query1, query2, query3, query4;
            query1 = "Lucene_Q1_top100.txt";
            query2 = "Lucene_Q2_top100.txt";
            query3 = "Lucene_Q3_top100.txt";
            query4 = "Lucene_Q4_top100.txt";
            File luceneFile = new File(query4); // change filename for each query
            int query_id;
            query_id = 4; // change this for new query
            luceneFile.createNewFile();
            FileWriter writer = new FileWriter(luceneFile);
            writer.write(String.format("%-10s %-10s %-80s %-10s %-40s %-20s", "Query ID", "Q0",
                    "Document Name", "Rank", "Cosine Similarity Score", "System Name\n"));
            System.out.println("Found " + hits.length + " hits.");
            System.out.println(f.format("%-10s %-10s %-80s %-10s %-40s %-20s", "Query ID", "Q0",
                    "Document Name", "Rank", "Cosine Similarity Score", "System Name"));
            for (int i = 0; i < hits.length; ++i) {
                Formatter fmt = new Formatter();
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                //System.out.println((i+1) +". " + d.get("path")+" "+ hits[i].score);
                writer.write(String.format("%-10s %-10s %-80s %-10s %-40s %-20s", "" + query_id, "Q0",
                        "" + d.get("path"), "" + (i + 1), "" + hits[i].score, "Shantanu-SYS-001\n"));
                writer.flush();
                System.out.println(fmt.format("%-10s %-10s %-80s %-10s %-40s %-20s", "" + query_id, "Q0",
                        "" + d.get("path"), "" + (i + 1), "" + hits[i].score, "Shantanu-SYS-001"));
            }
            writer.close();
            // 5. term stats --> watch out for which "version" of the term
            // must be checked here instead!
            Term termInstance = new Term("contents", s);
            long termFreq = reader.totalTermFreq(termInstance);
            long docCount = reader.docFreq(termInstance);
            System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }
    }
}
From source file:magoffin.matt.lucene.LuceneSearchService.java
License:Open Source License
/**
 * Delete a Document from the index.
 *
 * <p>Check out <a
 * href="http://nagoya.apache.org/eyebrowse/ReadMsg?listName=lucene-user@jakarta.apache.org&msgId=1190557"
 * >this post</a> for info on how this is done.
 * </p>
 *
 * @param type the index type
 * @param reader the index to delete from
 * @param id the ID of the Document to delete, using the <code>idField</code> field
 * @return the number of items deleted
 */
protected int deleteFromIndex(String type, IndexReader reader, Object id) {
    if (id == null) {
        throw new IllegalArgumentException("Null ID passed to deleteFromIndex");
    }
    try {
        Term idTerm = new Term(idField, id.toString());
        if (reader.docFreq(idTerm) > 0) {
            int result = reader.deleteDocuments(idTerm);
            if (traceLog.isInfoEnabled()) {
                traceLog.info(TraceOp.DELETE + "Deleted " + result + " Document for ID " + id
                        + " from reader " + reader + " (" + reader.directory().toString() + ")");
            }
            LuceneServiceUtils.publishIndexEvent(new IndexEvent(id, EventType.DELETE, type),
                    this.indexEventListeners);
            return result;
        }
    } catch (IOException e) {
        throw new RuntimeException("IOException deleting Document from Lucene index", e);
    }
    return 0;
}
From source file:net.dataninja.ee.textEngine.MoreLikeThisQuery.java
License:Apache License
/**
 * Generate a query that will produce "more documents like" the first
 * in the sub-query.
 */
public Query rewrite(IndexReader reader) throws IOException {
    // If field boosts were specified, make sure there are the same number of
    // boosts as there are fields.
    //
    if (fieldBoosts != null && fieldBoosts.length != fieldNames.length)
        throw new RuntimeException(
                "Error: different number of boosts than fields specified to MoreLikeThisQuery");

    // Determine the target document.
    IndexSearcher searcher = new IndexSearcher(reader);
    targetDoc = -1;
    HitCollector collector = new HitCollector() {
        public void collect(int doc, float score) {
            if (targetDoc < 0)
                targetDoc = doc;
        }
    };
    searcher.search(subQuery, collector);

    // If none, make a query that will definitely return nothing at all.
    if (targetDoc < 0)
        return new TermQuery(new Term("fribbleSnarf", "!*@&#(*&"));

    // Eliminate fields with zero boost. Along the way, make a boost map so we
    // have fast access to the boost per field.
    //
    String[] fields = this.fieldNames;
    if (fieldBoosts != null) {
        ArrayList filteredFields = new ArrayList();
        for (int i = 0; i < fieldNames.length; i++) {
            if (fieldBoosts[i] > 0.0f) {
                filteredFields.add(fieldNames[i]);
                boostMap.put(fieldNames[i], new Float(fieldBoosts[i]));
            }
        }
        fields = (String[]) filteredFields.toArray(new String[filteredFields.size()]);
    }

    // If we've been asked to calculate the max document frequency, do it now.
    if (maxDocFreq < 0) {
        int nDocs = reader.docFreq(new Term("docInfo", "1"));
        maxDocFreq = Math.max(5, nDocs / 20);
    }

    // Add facet fields, if any. For now, spot them by name.
    XTFTextAnalyzer analyzer = new XTFTextAnalyzer(null, pluralMap, accentMap);
    for (int i = 0; i < fields.length; i++) {
        if (fields[i].indexOf("facet") >= 0)
            analyzer.addFacetField(fields[i]);
    }

    // Determine which terms are "best" for querying.
    PriorityQueue bestTerms = retrieveTerms(reader, targetDoc, analyzer);

    // Make the "more like this" query from those terms.
    Query rawQuery = createQuery(reader, bestTerms);

    // Exclude the original document in the result set.
    Query ret = new MoreLikeWrapper(this, rawQuery);
    if (Trace.getOutputLevel() >= Trace.debug)
        Trace.debug("More-like query: " + ret);
    return ret;
}
From source file:net.dataninja.ee.textEngine.MoreLikeThisQuery.java
License:Apache License
/**
 * Create the More like query from a PriorityQueue
 */
private Query createQuery(IndexReader indexReader, PriorityQueue q) throws IOException {
    // Pop everything from the queue.
    QueryWord[] queryWords = new QueryWord[q.size()];
    for (int i = q.size() - 1; i >= 0; i--)
        queryWords[i] = (QueryWord) q.pop();

    BooleanQuery query = new BooleanQuery(true /*disable coord*/);

    // At the moment, there's no need to scale by the best score. It simply
    // clouds the query explanation. It doesn't affect the scores, since
    // Lucene applies a query normalization factor anyway.
    //
    //float bestScore = (queryWords.length > 0) ? queryWords[0].score : 0.0f;
    for (int i = 0; i < fieldNames.length; i++) {
        ArrayList fieldClauses = new ArrayList();
        for (int j = 0; j < queryWords.length; j++) {
            QueryWord qw = queryWords[j];
            Term term = new Term(fieldNames[i], qw.word);

            // Skip words not present in this field.
            int docFreq = indexReader.docFreq(term);
            if (docFreq == 0)
                continue;

            // Add it to the query.
            SpanTermQuery tq = new SpanTermQuery(term);
            if (boost)
                tq.setBoost(qw.score);
            fieldClauses.add(tq);
        } // for j

        // If no terms for this field, skip it.
        if (fieldClauses.isEmpty())
            continue;

        SpanQuery[] clauses = (SpanQuery[]) fieldClauses.toArray(new SpanQuery[fieldClauses.size()]);

        // Now make a special Or-Near query out of the clauses.
        SpanOrNearQuery fieldQuery = new SpanOrNearQuery(clauses, 10, false);

        // Boost if necessary.
        if (fieldBoosts != null)
            fieldQuery.setBoost(fieldBoosts[i]);

        // We currently don't support more-like-this queries on the full text.
        // It would involve de-chunking, and also fancier logic to pick the
        // "most interesting" terms in the first place.
        //
        if (fieldNames[i].equals("text"))
            throw new RuntimeException("MoreLikeThisQuery does not support 'text' field.");

        // And add to the main query.
        query.add(fieldQuery, BooleanClause.Occur.SHOULD);
    } // for i

    // All done.
    return query;
}
From source file:net.dataninja.ee.textEngine.MoreLikeThisQuery.java
License:Apache License
/**
 * Condense the same term in multiple fields into a single term with a
 * total score.
 *
 * @param words a map of words keyed on the word(String) with Int objects as the values.
 */
private Map condenseTerms(IndexReader indexReader, Map words) throws IOException {
    HashMap termScoreMap = new HashMap();

    // For reference in score calculations, get the total # of docs in index
    int numDocs = indexReader.numDocs();

    // For each term...
    Iterator it = words.keySet().iterator();
    while (it.hasNext()) {
        Term term = (Term) it.next();

        // Filter out words that don't occur enough times in the source doc
        int tf = ((Int) words.get(term)).x;
        if (minTermFreq > 0 && tf < minTermFreq)
            continue;

        // Filter out words that don't occur in enough docs
        int docFreq = indexReader.docFreq(term);
        if (minDocFreq > 0 && docFreq < minDocFreq)
            continue;

        // Filter out words that occur in too many docs
        if (maxDocFreq > 0 && docFreq > maxDocFreq)
            continue;

        // Handle potential index update problem
        if (docFreq == 0)
            continue;

        // Calculate a score for this term.
        float idf = similarity.idf(docFreq, numDocs);
        float score = tf * idf;

        // Boost if necessary.
        Float found = (Float) boostMap.get(term.field());
        if (found != null)
            score *= found.floatValue();

        // Add the score to our map.
        String word = term.text();
        if (!termScoreMap.containsKey(word))
            termScoreMap.put(word, new Flt());
        Flt cnt = (Flt) termScoreMap.get(word);
        cnt.x += score;
    }
    return termScoreMap;
}
From source file:net.sf.jtmt.summarizers.LuceneSummarizer.java
License:Apache License
/**
 * Compute top term query.
 *
 * @param ramdir the ramdir
 * @return the query
 * @throws Exception the exception
 */
private Query computeTopTermQuery(Directory ramdir) throws Exception {
    final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
    List<String> termlist = new ArrayList<String>();
    IndexReader reader = IndexReader.open(ramdir, true);
    TermEnum terms = reader.terms();
    while (terms.next()) {
        Term term = terms.term();
        String termText = term.text();
        int frequency = reader.docFreq(term);
        frequencyMap.put(termText, frequency);
        termlist.add(termText);
    }
    reader.close();

    // sort the term map by frequency descending
    Collections.sort(termlist,
            new ReverseComparator<String>(new ByValueComparator<String, Integer>(frequencyMap)));

    // retrieve the top terms based on topTermCutoff
    List<String> topTerms = new ArrayList<String>();
    float topFreq = -1.0F;
    for (String term : termlist) {
        if (topFreq < 0.0F) {
            // first term, capture the value
            topFreq = (float) frequencyMap.get(term);
            topTerms.add(term);
        } else {
            // not the first term, compute the ratio and discard if below
            // topTermCutoff score
            float ratio = (float) ((float) frequencyMap.get(term) / topFreq);
            if (ratio >= topTermCutoff) {
                topTerms.add(term);
            } else {
                break;
            }
        }
    }

    StringBuilder termBuf = new StringBuilder();
    BooleanQuery q = new BooleanQuery();
    for (String topTerm : topTerms) {
        termBuf.append(topTerm).append("(").append(frequencyMap.get(topTerm)).append(");");
        q.add(new TermQuery(new Term("text", topTerm)), Occur.SHOULD);
    }
    System.out.println(">>> top terms: " + termBuf.toString());
    System.out.println(">>> query: " + q.toString());
    return q;
}