Example usage for org.apache.lucene.index IndexReader docFreq

List of usage examples for org.apache.lucene.index IndexReader docFreq

Introduction

On this page you can find example usages of org.apache.lucene.index IndexReader docFreq.

Prototype

public abstract int docFreq(Term term) throws IOException;

Document

Returns the number of documents containing the term.
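
For orientation, here is a minimal, self-contained sketch (not taken from the examples below) that opens an existing index and prints the document frequency of a single term. It assumes a Lucene 4.x-style API, as used in several of the examples on this page; the index path and the "contents" field name are placeholders.

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws IOException {
        // Open an existing index; the path is a placeholder.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/tmp/index")));
        try {
            // The field name "contents" is a placeholder for whatever field was indexed.
            Term term = new Term("contents", "lucene");
            // docFreq returns how many documents contain the term at least once.
            int df = reader.docFreq(term);
            System.out.println("docFreq(" + term + ") = " + df);
        } finally {
            reader.close();
        }
    }
}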

Usage

From source file:it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java

License:Apache License

/**
 * Builds the TF-IDF vector and computes its L2 norm.
 *
 * @param tfidf
 *            - the vector containing the TF-IDF score of each term; it will
 *            be populated by this method
 * @param freq
 *            - the vector containing the frequency of each term
 * @param field
 *            - the field on which to compute the inverse document frequency
 * 
 * @return the norm of the TFIDF vector
 * 
 */
private double tfidfVector(Map<String, Double> tfidf, Map<String, Integer> freq, String field) {
    IndexReader reader = getReader();

    double norm = 0;
    for (Map.Entry<String, Integer> entry : freq.entrySet()) {
        Term t = new Term(field, entry.getKey());
        int df = 0;
        try {
            df = reader.docFreq(t);
        } catch (IOException e) {
            logger.error("computing tfidfVector ({}) ", e.toString());
            System.exit(-1);
        }
        double idf = Math.log(collectionSize / (double) df + 1) / Math.log(2) + 1;
        double tfidfValue = entry.getValue() * idf;
        norm += tfidfValue * tfidfValue;
        tfidf.put(entry.getKey(), tfidfValue);
    }
    return Math.sqrt(norm);

}
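
A note on the weighting above, assuming collectionSize (not shown in the snippet) holds the total number of documents N: for each term t with frequency tf(t) and document frequency df(t) = reader.docFreq(t), the method computes

    idf(t) = log2( N / df(t) + 1 ) + 1
    w(t)   = tf(t) * idf(t)

populates the tfidf map with the weights w(t), and returns the Euclidean norm sqrt( sum_t w(t)^2 ).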

From source file:it.unibz.instasearch.indexing.SearchResultDoc.java

License:Open Source License

private float[] createTermScoreVector(TermFreqVector vect, IndexReader reader) throws IOException {
    if (vect == null)
        return new float[0];

    int[] termFrequencies = vect.getTermFrequencies();
    String[] terms = vect.getTerms();
    float[] scores = new float[terms.length];

    int numDocs = reader.maxDoc();
    Similarity sim = Searcher.SIMILARITY;

    for (int i = 0; i < terms.length; i++) {
        String termText = terms[i];
        Term term = new Term(Field.CONTENTS.toString(), termText);

        float termFreq = sim.tf(termFrequencies[i]);

        int docFreq = reader.docFreq(term);
        float idf = sim.idf(docFreq, numDocs);

        float tfIdf = termFreq * idf;

        scores[i] = tfIdf;
    }

    return scores;
}

From source file:it.unipd.dei.ims.falcon.indexing.Indexing.java

License:Apache License

/**
 * Indexes all the songs in the specified path.
 * The index is created in the specified directory "indexPath". If an index
 * already exists in that path, adds the songs to the existing index.
 * Each song is processed by the method
 * {@link it.unipd.dei.ims.falcon.indexing.Indexing#indexSong}
 * which maps the song into a set of segments, each of which is mapped to a
 * Lucene {@link org.apache.lucene.document.Document}.
 * The segments have a fixed length; specifically, each consists of
 * "hashPerSegment" hashes. There can be an overlap of "hashInOverlap"
 * hashes between two segments. The number of hashes in the overlap must be
 * smaller than the number of hashes per segment, otherwise an
 * {@link it.unipd.dei.ims.falcon.indexing.IndexingException} is thrown.
 * <p>
 * Once the index has been created or updated, this method writes a map to a
 * file. The map associates a set of features with each hash; those features
 * are based on occurrence statistics of the hash in the entire collection.
 * In the event of an index update, the map is re-built and the map file is
 * overwritten.
 * @param data Input file. If it is a directory, index all files inside it.
 * @param index Falcon index.
 * @param hashPerSegment Number of hashes per segment.
 * @param hashInOverlap Number of overlapping hashes per segment.
 * @throws IndexingException 
 */
public static void index(File data, File index, final int hashPerSegment, final int hashInOverlap,
        final int subsampling, final int nranks, final double minkurtosis,
        final TranspositionEstimator transpEst, boolean verbose) throws IndexingException, IOException {

    long start_time = System.currentTimeMillis();

    if (hashPerSegment <= hashInOverlap)
        throw new IndexingException(
                "Number of hashes in the overlap must be smaller than the number of hashes per segment");

    if (!data.canRead())
        throw new IOException("cannot read input path");
    if (data.isDirectory()) {
        for (File f : data.listFiles())
            if (!f.canRead())
                throw new IOException("cannot read one or more input files");
    }

    if (!index.exists()) // if index is being created rather than updated
        index.mkdir();
    if (!index.canWrite())
        throw new IOException("cannot write to index directory");

    SimpleFSDirectory indexDir = new SimpleFSDirectory(index, null);

    // initialize Lucene Analyzer and IndexWriter
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    final IndexWriter writer = new IndexWriter(indexDir, analyzer, !IndexReader.indexExists(indexDir),
            IndexWriter.MaxFieldLength.UNLIMITED);
    writer.setSimilarity(new HashSimilarity());

    // transform chroma data into hashes and write into index
    File[] inputfiles = data.isDirectory() ? data.listFiles() : new File[] { data };
    int fileNo = 0;
    for (final File file : inputfiles) {
        // if the current considered files exists and is not hidden
        if (file.exists() && !file.getName().startsWith(".")) {
            if (verbose)
                System.out.println(String.format("%10.3f%% - indexing %s", fileNo * 100. / inputfiles.length,
                        file.getAbsolutePath()));
            final List<OutputStream> fout = new LinkedList<OutputStream>();
            fout.add(new PipedOutputStream());
            final PipedInputStream fin = new PipedInputStream((PipedOutputStream) fout.get(0));
            Thread t = new Thread(new Runnable() {
                public void run() {
                    try {
                        ChromaMatrixUtils.convertChromaStreamIntoHashesStream(new FileReader(file), fout,
                                nranks, transpEst, minkurtosis, subsampling);
                    } catch (IOException ex) {
                        // TODO do something better for this exception ... (might hang all ...)
                        Logger.getLogger(Indexing.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            });
            t.start();
            indexSong(writer, fin, hashPerSegment, hashInOverlap, file.getAbsolutePath(),
                    file.getAbsolutePath());
            fileNo++;
        }
    }
    writer.optimize();
    writer.close();

    // additional falcon features
    PrintWriter pw = new PrintWriter(index.getAbsolutePath() + "/qpruning_features.map");
    IndexReader reader = IndexReader.open(new SimpleFSDirectory(index));
    int numSegments = reader.numDocs();
    long total_hcf = (long) numSegments * hashPerSegment; // total number of hashes in the collection
    TermEnum hashes = reader.terms(); // distinct hashes in the collection

    while (hashes.next()) {
        if (!hashes.term().field().equals("CONTENT")) {
            continue;
        }
        Term curHash = hashes.term();
        pw.print(curHash.text() + "\t");
        pw.print((double) reader.docFreq(curHash) / numSegments + "\t"); // normalized document frequency
        TermDocs curHash_pl = reader.termDocs(curHash); // posting list for the current hash
        // computation of the frequency of the current hash in the
        // entire collection -- value initialization
        long hcf = 0;
        // initializes the normalized maximum frequency value
        double nmf = 0;
        // initializes the normalized frequency for max computation
        double cur_nf = 0;
        // processes posting list entries
        while (curHash_pl.next()) {
            // computation of the normalized frequency for
            // the current hash
            cur_nf = (double) curHash_pl.freq() / hashPerSegment;
            // update max if necessary
            if (cur_nf > nmf) {
                nmf = cur_nf;
            }
            hcf += curHash_pl.freq();
        }
        // prints normalized total collection frequency and
        // normalized maximum frequency for the current hash
        pw.print((double) hcf / total_hcf + "\t" + nmf + "\n");
    }
    pw.flush();
    pw.close();

    long end_time = System.currentTimeMillis();
    if (verbose)
        System.out.println(String.format("[INDEXING] - elapsed time: %10.3f", (end_time - start_time) / 1000.));

}

From source file:lucene.searchengine.LuceneSearchEngine.java

public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    LuceneSearchEngine indexer = null;
    try {
        indexLocation = s;
        indexer = new LuceneSearchEngine(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from user until he enters q for quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }

            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call the
    // closeIndex, otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));

    //===========================================================
    //  GET Term frequency
    //===========================================================
    // Create an output file to store the (term, term_frequency) pairs.
    PrintWriter tfwriter = new PrintWriter("..\\term-frequency.csv");

    Fields fields = MultiFields.getFields(reader);
    HashMap<String, Long> tfmap = new HashMap<String, Long>();
    Terms terms = fields.terms("contents");
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef bref = null;
    while ((bref = termsEnum.next()) != null) {
        String term_name = bref.utf8ToString(); // decode the UTF-8 bytes of the BytesRef
        Term term_instance = new Term("contents", term_name);
        long termFrequency = reader.totalTermFreq(term_instance);
        tfmap.put(term_name, termFrequency);
    }
    System.out.println(tfmap.size());
    for (String key : tfmap.keySet()) {
        tfwriter.write(key + "," + tfmap.get(key));
        tfwriter.write("\n");
    }
    tfwriter.close();
    //====================================================================
    // Code END to fetch term frequency
    //====================================================================
    IndexSearcher searcher = new IndexSearcher(reader);
    s = "";
    while (!s.equalsIgnoreCase("q")) {
        TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }

            Query q = new QueryParser(Version.LUCENE_47, "contents", sAnalyzer).parse(s);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. display results
            System.out.println("Found " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("filename") + " score=" + hits[i].score);
            }
            // 5. term stats --> watch out for which "version" of the term
            // must be checked here instead!
            Term termInstance = new Term("contents", s);
            long termFreq = reader.totalTermFreq(termInstance);
            long docCount = reader.docFreq(termInstance);
            System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }
    }
}

From source file:lucenesearche.HW3.java

public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    HW3 indexer = null;
    try {
        indexLocation = s;
        indexer = new HW3(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from user until he enters q for quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }

            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call the
    // closeIndex, otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    IndexSearcher searcher = new IndexSearcher(reader);

    s = "";
    while (!s.equalsIgnoreCase("q")) {
        // Create a fresh collector and formatter for each query; reusing them across
        // searches would accumulate results from earlier queries.
        TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
        Formatter f = new Formatter();
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }

            Query q = new QueryParser(Version.LUCENE_47, "contents", sAnalyzer).parse(s);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. display results
            String query1, query2, query3, query4;
            query1 = "Lucene_Q1_top100.txt";
            query2 = "Lucene_Q2_top100.txt";
            query3 = "Lucene_Q3_top100.txt";
            query4 = "Lucene_Q4_top100.txt";
            File luceneFile = new File(query4); // change filename for each query
            int query_id;
            query_id = 4; // change this for new query 
            luceneFile.createNewFile();
            FileWriter writer = new FileWriter(luceneFile);
            writer.write(String.format("%-10s %-10s %-80s %-10s %-40s %-20s", "Query ID", "Q0", "Document Name",
                    "Rank", "Cosine Similarity Score", "System Name\n"));
            System.out.println("Found " + hits.length + " hits.");
            System.out.println(f.format("%-10s %-10s %-80s %-10s %-40s %-20s", "Query ID", "Q0",
                    "Document Name", "Rank", "Cosine Similarity Score", "System Name"));
            for (int i = 0; i < hits.length; ++i) {
                Formatter fmt = new Formatter();
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                //System.out.println((i+1) +". " + d.get("path")+" "+ hits[i].score);
                writer.write(String.format("%-10s %-10s %-80s %-10s %-40s %-20s", "" + query_id, "Q0",
                        "" + d.get("path"), "" + (i + 1), "" + hits[i].score, "Shantanu-SYS-001\n"));
                writer.flush();
                System.out.println(fmt.format("%-10s %-10s %-80s %-10s %-40s %-20s", "" + query_id, "Q0",
                        "" + d.get("path"), "" + (i + 1), "" + hits[i].score, "Shantanu-SYS-001"));
            }
            writer.close();

            // 5. term stats --> watch out for which "version" of the term
            // must be checked here instead!
            Term termInstance = new Term("contents", s);
            long termFreq = reader.totalTermFreq(termInstance);
            long docCount = reader.docFreq(termInstance);
            System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);

        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }

    }

}

From source file:magoffin.matt.lucene.LuceneSearchService.java

License:Open Source License

/**
 * Delete a Document from the index.
 * 
 * <p>Check out <a 
 * href="http://nagoya.apache.org/eyebrowse/ReadMsg?listName=lucene-user@jakarta.apache.org&msgId=1190557"
 * >this post</a> for info on how this is done.
 * </p>
 * 
 * @param type the index type
 * @param reader the index to delete from
 * @param id the ID of the Document to delete, using the <code>idField</code> field
 * @return the number of items deleted
 */
protected int deleteFromIndex(String type, IndexReader reader, Object id) {
    if (id == null) {
        throw new IllegalArgumentException("Null ID passed to deleteFromIndex");
    }
    try {
        Term idTerm = new Term(idField, id.toString());
        if (reader.docFreq(idTerm) > 0) {
            int result = reader.deleteDocuments(idTerm);
            if (traceLog.isInfoEnabled()) {
                traceLog.info(TraceOp.DELETE + "Deleted " + result + " Document for ID " + id + " from reader "
                        + reader + " (" + reader.directory().toString() + ")");
            }
            LuceneServiceUtils.publishIndexEvent(new IndexEvent(id, EventType.DELETE, type),
                    this.indexEventListeners);
            return result;
        }
    } catch (IOException e) {
        throw new RuntimeException("IOException deleting Document from Lucene index", e);
    }
    return 0;
}

From source file:net.dataninja.ee.textEngine.MoreLikeThisQuery.java

License:Apache License

/**
 * Generate a query that will produce "more documents like" the first
 * in the sub-query.
 */
public Query rewrite(IndexReader reader) throws IOException {
    // If field boosts were specified, make sure there are the same number of
    // boosts as there are fields.
    //
    if (fieldBoosts != null && fieldBoosts.length != fieldNames.length)
        throw new RuntimeException(
                "Error: different number of boosts than fields specified to MoreLikeThisQuery");

    // Determine the target document.
    IndexSearcher searcher = new IndexSearcher(reader);
    targetDoc = -1;
    HitCollector collector = new HitCollector() {
        public void collect(int doc, float score) {
            if (targetDoc < 0)
                targetDoc = doc;
        }
    };

    searcher.search(subQuery, collector);

    // If none, make a query that will definitely return nothing at all.
    if (targetDoc < 0)
        return new TermQuery(new Term("fribbleSnarf", "!*@&#(*&"));

    // Eliminate fields with zero boost. Along the way, make a boost map so we
    // have fast access to the boost per field.
    //
    String[] fields = this.fieldNames;
    if (fieldBoosts != null) {
        ArrayList filteredFields = new ArrayList();
        for (int i = 0; i < fieldNames.length; i++) {
            if (fieldBoosts[i] > 0.0f) {
                filteredFields.add(fieldNames[i]);
                boostMap.put(fieldNames[i], new Float(fieldBoosts[i]));
            }
        }
        fields = (String[]) filteredFields.toArray(new String[filteredFields.size()]);
    }

    // If we've been asked to calculate the max document frequency, do it now.
    if (maxDocFreq < 0) {
        int nDocs = reader.docFreq(new Term("docInfo", "1"));
        maxDocFreq = Math.max(5, nDocs / 20);
    }

    // Add facet fields, if any. For now, spot them by name.
    XTFTextAnalyzer analyzer = new XTFTextAnalyzer(null, pluralMap, accentMap);
    for (int i = 0; i < fields.length; i++) {
        if (fields[i].indexOf("facet") >= 0)
            analyzer.addFacetField(fields[i]);
    }

    // Determine which terms are "best" for querying.
    PriorityQueue bestTerms = retrieveTerms(reader, targetDoc, analyzer);

    // Make the "more like this" query from those terms.
    Query rawQuery = createQuery(reader, bestTerms);

    // Exclude the original document in the result set.
    Query ret = new MoreLikeWrapper(this, rawQuery);
    if (Trace.getOutputLevel() >= Trace.debug)
        Trace.debug("More-like query: " + ret);

    return ret;
}

From source file:net.dataninja.ee.textEngine.MoreLikeThisQuery.java

License:Apache License

/**
 * Create the More like query from a PriorityQueue
 */
private Query createQuery(IndexReader indexReader, PriorityQueue q) throws IOException {
    // Pop everything from the queue.
    QueryWord[] queryWords = new QueryWord[q.size()];
    for (int i = q.size() - 1; i >= 0; i--)
        queryWords[i] = (QueryWord) q.pop();

    BooleanQuery query = new BooleanQuery(true /*disable coord*/);

    // At the moment, there's no need to scale by the best score. It simply
    // clouds the query explanation. It doesn't affect the scores, since
    // Lucene applies a query normalization factor anyway.
    //
    //float bestScore = (queryWords.length > 0) ? queryWords[0].score : 0.0f;
    for (int i = 0; i < fieldNames.length; i++) {
        ArrayList fieldClauses = new ArrayList();

        for (int j = 0; j < queryWords.length; j++) {
            QueryWord qw = queryWords[j];
            Term term = new Term(fieldNames[i], qw.word);

            // Skip words not present in this field.
            int docFreq = indexReader.docFreq(term);
            if (docFreq == 0)
                continue;

            // Add it to the query.
            SpanTermQuery tq = new SpanTermQuery(term);
            if (boost)
                tq.setBoost(qw.score);
            fieldClauses.add(tq);
        } // for j

        // If no terms for this field, skip it.
        if (fieldClauses.isEmpty())
            continue;

        SpanQuery[] clauses = (SpanQuery[]) fieldClauses.toArray(new SpanQuery[fieldClauses.size()]);

        // Now make a special Or-Near query out of the clauses.
        SpanOrNearQuery fieldQuery = new SpanOrNearQuery(clauses, 10, false);

        // Boost if necessary.
        if (fieldBoosts != null)
            fieldQuery.setBoost(fieldBoosts[i]);

        // We currently don't support more-like-this queries on the full text.
        // It would involve de-chunking, and also fancier logic to pick the
        // "most interesting" terms in the first place.
        //
        if (fieldNames[i].equals("text"))
            throw new RuntimeException("MoreLikeThisQuery does not support 'text' field.");

        // And add to the main query.
        query.add(fieldQuery, BooleanClause.Occur.SHOULD);
    } // for i

    // All done.
    return query;
}

From source file:net.dataninja.ee.textEngine.MoreLikeThisQuery.java

License:Apache License

/**
 * Condense the same term in multiple fields into a single term with a
 * total score.
 *
 * @param words a map keyed on the term (Term) with Int frequency objects as the values.
 */
private Map condenseTerms(IndexReader indexReader, Map words) throws IOException {
    HashMap termScoreMap = new HashMap();

    // For reference in score calculations, get the total # of docs in index
    int numDocs = indexReader.numDocs();

    // For each term...
    Iterator it = words.keySet().iterator();
    while (it.hasNext()) {
        Term term = (Term) it.next();

        // Filter out words that don't occur enough times in the source doc
        int tf = ((Int) words.get(term)).x;
        if (minTermFreq > 0 && tf < minTermFreq)
            continue;

        // Filter out words that don't occur in enough docs
        int docFreq = indexReader.docFreq(term);
        if (minDocFreq > 0 && docFreq < minDocFreq)
            continue;

        // Filter out words that occur in too many docs
        if (maxDocFreq > 0 && docFreq > maxDocFreq)
            continue;

        // Handle potential index update problem
        if (docFreq == 0)
            continue;

        // Calculate a score for this term.
        float idf = similarity.idf(docFreq, numDocs);
        float score = tf * idf;

        // Boost if necessary.
        Float found = (Float) boostMap.get(term.field());
        if (found != null)
            score *= found.floatValue();

        // Add the score to our map.
        String word = term.text();
        if (!termScoreMap.containsKey(word))
            termScoreMap.put(word, new Flt());
        Flt cnt = (Flt) termScoreMap.get(word);
        cnt.x += score;
    }

    return termScoreMap;
}

From source file:net.sf.jtmt.summarizers.LuceneSummarizer.java

License:Apache License

/**
 * Compute top term query.
 *
 * @param ramdir the ramdir
 * @return the query
 * @throws Exception the exception
 */
private Query computeTopTermQuery(Directory ramdir) throws Exception {
    final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
    List<String> termlist = new ArrayList<String>();
    IndexReader reader = IndexReader.open(ramdir, true);
    TermEnum terms = reader.terms();
    while (terms.next()) {
        Term term = terms.term();
        String termText = term.text();
        int frequency = reader.docFreq(term);
        frequencyMap.put(termText, frequency);
        termlist.add(termText);
    }
    reader.close();
    // sort the term map by frequency descending
    Collections.sort(termlist,
            new ReverseComparator<String>(new ByValueComparator<String, Integer>(frequencyMap)));
    // retrieve the top terms based on topTermCutoff
    List<String> topTerms = new ArrayList<String>();
    float topFreq = -1.0F;
    for (String term : termlist) {
        if (topFreq < 0.0F) {
            // first term, capture the value
            topFreq = (float) frequencyMap.get(term);
            topTerms.add(term);
        } else {
            // not the first term, compute the ratio and discard if below
            // topTermCutoff score
            float ratio = (float) ((float) frequencyMap.get(term) / topFreq);
            if (ratio >= topTermCutoff) {
                topTerms.add(term);
            } else {
                break;
            }
        }
    }
    StringBuilder termBuf = new StringBuilder();
    BooleanQuery q = new BooleanQuery();
    for (String topTerm : topTerms) {
        termBuf.append(topTerm).append("(").append(frequencyMap.get(topTerm)).append(");");
        q.add(new TermQuery(new Term("text", topTerm)), Occur.SHOULD);
    }
    System.out.println(">>> top terms: " + termBuf.toString());
    System.out.println(">>> query: " + q.toString());
    return q;
}