Example usage for org.apache.lucene.index TermsEnum totalTermFreq

List of usage examples for org.apache.lucene.index TermsEnum totalTermFreq

Introduction

This page collects example usages of the org.apache.lucene.index.TermsEnum.totalTermFreq() method.

Prototype

public abstract long totalTermFreq() throws IOException;

Source Link

Document

Returns the total number of occurrences of this term across all documents (the sum of the freq() for each doc that has this term).

Usage

From source file:alix.lucene.MoreLikeThis.java

License:Apache License

/**
 * Accumulates the term frequencies of a term vector into {@code termFreqMap}.
 * Terms for which {@code isNoiseWord} returns true are skipped.
 *
 * @param termFreqMap map from term text to its running frequency counter
 * @param vector terms and their frequencies for a doc/field
 * @throws IOException if the term vector cannot be read
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
    final CharsRefBuilder decoded = new CharsRefBuilder();
    final TermsEnum terms = vector.iterator();
    for (BytesRef raw = terms.next(); raw != null; raw = terms.next()) {
        decoded.copyUTF8Bytes(raw);
        final String term = decoded.toString();
        if (!isNoiseWord(term)) {
            // the long total is narrowed to int before it reaches the counter
            final int freq = (int) terms.totalTermFreq();

            Int counter = termFreqMap.get(term);
            if (counter != null) {
                // term already seen: add to the existing counter
                counter.x += freq;
            } else {
                // first occurrence: register a new counter for the term
                counter = new Int();
                termFreqMap.put(term, counter);
                counter.x = freq;
            }
        }
    }
}

From source file:alix.lucene.MoreLikeThis.java

License:Apache License

/**
 * Prints a term vector on standard output for debugging, as space-separated
 * {@code term:frequency} pairs ordered by decreasing frequency.
 *
 * @param vector terms and their frequencies for a doc/field; nothing is printed when null
 * @throws IOException if the term vector cannot be read
 */
@SuppressWarnings("unused")
private void print(Terms vector) throws IOException {
    if (vector == null) {
        return;
    }
    // Original author's note: a term vector acts like a single-document inverted
    // index, so termsEnum.docFreq() is 1 for every term.
    final HashMap<String, Long> freqByTerm = new HashMap<String, Long>();
    final CharsRefBuilder decoded = new CharsRefBuilder();
    final TermsEnum terms = vector.iterator();
    for (BytesRef raw = terms.next(); raw != null; raw = terms.next()) {
        decoded.copyUTF8Bytes(raw);
        freqByTerm.put(decoded.toString(), terms.totalTermFreq());
    }

    @SuppressWarnings("unchecked")
    final Map.Entry<String, Long>[] entries = freqByTerm.entrySet().toArray(new Map.Entry[0]);
    // highest frequency first
    final Comparator<Map.Entry<String, Long>> byFrequencyDescending = new Comparator<Map.Entry<String, Long>>() {
        public int compare(Map.Entry<String, Long> left, Map.Entry<String, Long> right) {
            return right.getValue().compareTo(left.getValue());
        }
    };
    Arrays.sort(entries, byFrequencyDescending);

    final StringBuilder line = new StringBuilder();
    for (Map.Entry<String, Long> entry : entries) {
        line.append(entry.getKey()).append(":").append(entry.getValue()).append(" ");
    }
    System.out.println(line);
}

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.NewsItemToTermsBolt.java

License:Apache License

/**
 * Adds a weighted TF-IDF style score to {@code termMap} for every term in the
 * term vector of the document whose "id" field equals {@code id}.
 *
 * Terms whose document frequency is not strictly between 0.01% and one third
 * of the number of documents are ignored. A warning is logged when no document
 * matches and a debug message when the document has no term vector for the field.
 *
 * @param reader   index reader used for term vectors and index-wide statistics
 * @param searcher searcher used to locate the document by id
 * @param termMap  accumulator from term text to summed score; updated in place
 * @param id       value of the "id" field of the document to process
 * @param field    field whose term vector is read
 * @param weight   multiplier applied to every score added by this call
 * @throws IOException if the index cannot be read
 */
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap,
        String id, String field, double weight) throws IOException {
    Query query = new TermQuery(new Term("id", id));
    TopDocs topdocs = searcher.search(query, 1);

    if (!(topdocs.totalHits > 0)) {
        logger.warn("No documents found with id=" + id);
        return;
    }

    int docNr = topdocs.scoreDocs[0].doc;
    Terms vector = reader.getTermVector(docNr, field);
    if (vector == null) {
        logger.debug("no term available for doc=" + docNr + " and field=" + field);
        return;
    }

    // Index-wide statistics are the same for every term: read them once.
    final int numDocs = reader.numDocs();
    final long sumTotalTermFreq = reader.getSumTotalTermFreq(field);
    // ignore really rare terms and really common terms
    final double minFreq = numDocs * 0.0001;
    // floating-point division: an integer division would truncate the upper bound
    final double maxFreq = numDocs / 3.0;

    TermsEnum termsEnum = vector.iterator(TermsEnum.EMPTY);
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        int docFreq = reader.docFreq(new Term(field, text));
        if (docFreq <= minFreq || docFreq >= maxFreq) {
            continue;
        }
        double tf = 1 + ((double) termsEnum.totalTermFreq()) / sumTotalTermFreq;
        double idf = Math.log((double) numDocs / docFreq);
        if (Double.isInfinite(idf)) {
            continue;
        }
        String term = text.utf8ToString();
        double score = tf * idf * weight;
        // a missing (or null) entry counts as "no score yet"
        Double previous = termMap.get(term);
        termMap.put(term, previous == null ? score : previous + score);
    }
}

From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.storm.bolts.NewsItemToTermsBolt.java

License:Apache License

/**
 * Adds a weighted TF-IDF style score to {@code termMap} for every term in the
 * term vector of the document whose numeric "id" field equals {@code id}.
 *
 * The rare/common term filter is effectively switched off here: the bounds are
 * 0 and {@code Double.MAX_VALUE}, so only terms with a document frequency of 0
 * are rejected.
 *
 * @param reader   index reader used for term vectors and index-wide statistics
 * @param searcher searcher used to locate the document by id
 * @param termMap  accumulator from term text to summed score; updated in place
 * @param id       numeric id of the document to process
 * @param field    field whose term vector is read
 * @param weight   multiplier applied to every score added by this call
 * @throws IOException if the index cannot be read
 */
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap, long id,
        String field, double weight) throws IOException {
    final TopDocs found = searcher.search(NumericRangeQuery.newLongRange("id", id, id, true, true), 1);
    if (!(found.totalHits > 0)) {
        logger.warn("No documents found with id=" + id);
        return;
    }

    final int docNr = found.scoreDocs[0].doc;
    final Terms vector = reader.getTermVector(docNr, field);
    if (vector == null) {
        logger.debug("no term available for doc=" + docNr + " and field=" + field);
        return;
    }

    final TermsEnum terms = vector.iterator(TermsEnum.EMPTY);
    for (BytesRef raw = terms.next(); raw != null; raw = terms.next()) {
        final String term = raw.utf8ToString();
        final int docFreq = reader.docFreq(new Term(field, raw));
        // bounds of the (currently disabled) rare/common term filter
        final double minFreq = 0;
        final double maxFreq = Double.MAX_VALUE;
        if (docFreq <= minFreq || docFreq >= maxFreq) {
            continue;
        }

        final double tf = 1 + ((double) terms.totalTermFreq()) / reader.getSumTotalTermFreq(field);
        final double idf = Math.log((double) reader.numDocs() / docFreq);
        if (Double.isInfinite(idf)) {
            continue;
        }

        if (termMap.containsKey(term)) {
            termMap.put(term, termMap.get(term) + tf * idf * weight);
        } else {
            termMap.put(term, tf * idf * weight);
        }
    }
}

From source file:cc.twittertools.index.ExtractTermStatisticsFromIndex.java

License:Apache License

/**
 * Command-line tool that prints, for every term of the status text field, a
 * tab-separated line {@code term, document frequency, collection frequency}
 * on standard output (UTF-8). Terms whose document frequency is below the
 * optional minimum are skipped and summarised on standard error.
 *
 * @param args command-line arguments; the index option is mandatory, the minimum option defaults to 1
 * @throws Exception if the index cannot be opened or read
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("min").create(MIN_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options);
        System.exit(-1);
    }

    String indexLocation = cmdline.getOptionValue(INDEX_OPTION);
    int min = cmdline.hasOption(MIN_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1;

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    long missingCnt = 0;
    int skippedTerms = 0;

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    try {
        Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name);
        if (terms == null) {
            // the field was never indexed: there is nothing to enumerate
            System.err.println("No terms found for field " + StatusField.TEXT.name);
        } else {
            TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);
            BytesRef bytes;
            while ((bytes = termsEnum.next()) != null) {
                // A BytesRef is a slice of a possibly shared array: the term starts at
                // bytes.offset, not necessarily at index 0.
                String term = new String(bytes.bytes, bytes.offset, bytes.length, "UTF-8");
                int df = termsEnum.docFreq();
                long cf = termsEnum.totalTermFreq();

                if (df < min) {
                    skippedTerms++;
                    missingCnt += cf;
                    continue;
                }

                out.println(term + "\t" + df + "\t" + cf);
            }
        }
    } finally {
        // release the index and flush the output even when enumeration fails
        reader.close();
        out.close();
    }
    System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt);
}

From source file:ci6226.facetsearch.java

/**
 * Interactive search console over the index in {@code ./myindex}.
 *
 * Reads queries line by line (UTF-8) until an empty line or end of input. For
 * each query it prints the search time, then the score and stored text of up
 * to 50 hits, and collects the term frequencies of every hit's term vector.
 *
 * @param args unused
 * @throws Exception if the index cannot be read or a query cannot be parsed
 */
public static void main(String[] args) throws Exception {
    String index = "./myindex";
    String field = "text";
    String queries = null;
    int hitsPerPage = 10;

    //http://lucene.apache.org/core/4_0_0/facet/org/apache/lucene/facet/doc-files/userguide.html#facet_accumulation

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
    BufferedReader in = null;
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        // :Post-Release-Update-Version.LUCENE_XY:

        // TODO: must be the same analyzer that was used to build the index
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
        if (queries != null) {
            in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
        } else {
            in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
        }
        // :Post-Release-Update-Version.LUCENE_XY:
        QueryParser parser = new QueryParser(Version.LUCENE_47, field, analyzer);
        while (true) {

            System.out.println("Enter query: ");
            String line = in.readLine();
            // readLine() returns null at end of input: stop instead of failing on trim()
            if (line == null) {
                break;
            }
            line = line.trim();
            if (line.length() == 0) {
                break;
            }
            Query query = parser.parse(line);
            System.out.println("Searching for: " + query.toString(field));
            // this first search is only run to be timed
            long start = System.currentTimeMillis();
            searcher.search(query, null, 100);
            long end = System.currentTimeMillis();
            System.out.println("Time: " + (end - start) + "ms");
            TopDocs results = searcher.search(query, 5 * hitsPerPage);
            ScoreDoc[] hits = results.scoreDocs;
            int numTotalHits = results.totalHits;

            //N = max docs
            //df = total matched docs
            //idf = log(N/df)

            for (int i = 0; i < hits.length; i++) {
                Document doc = searcher.doc(hits[i].doc);
                System.out.println(ANSI_BLUE + (i + 1) + ANSI_RESET + "\nScore=\t" + hits[i].score);
                String rtext = doc.get(field);
                System.out.println("Text=\t" + rtext);

                // the term vector must be looked up by document id, not by hit rank
                Terms vector = reader.getTermVector(hits[i].doc, field);
                if (vector == null) {
                    continue;
                }

                TermsEnum termsEnum = vector.iterator(null);
                // per-document term frequencies; collected here but not printed
                Map<String, Integer> frequencies = new HashMap<>();
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    int freq = (int) termsEnum.totalTermFreq();
                    frequencies.put(term, freq);
                }
            }

            System.out.println(numTotalHits + " total matching documents");

        }
    } finally {
        // always release the input and the index, even when a query fails
        try {
            if (in != null) {
                in.close();
            }
        } finally {
            reader.close();
        }
    }
}

From source file:com.github.flaxsearch.resources.PostingsResource.java

License:Apache License

/**
 * Returns statistics and the live posting list of a term.
 *
 * At most {@code count} document ids are returned. Documents that
 * {@code liveDocs} marks as deleted are left out, and the returned array holds
 * exactly the ids that were collected.
 *
 * @param segment segment to read from, passed through to the reader manager
 * @param field   field the term belongs to
 * @param term    term text
 * @param count   maximum number of postings to return; values below zero are treated as zero
 * @return the term with its document frequency, total term frequency and collected document ids
 * @throws IOException if the index cannot be read
 */
@GET
public TermData getPostings(@QueryParam("segment") Integer segment, @PathParam("field") String field,
        @PathParam("term") String term, @QueryParam("count") @DefaultValue("2147483647") int count)
        throws IOException {

    TermsEnum te = readerManager.findTermPostings(segment, field, term);
    Bits liveDocs = readerManager.getLiveDocs(segment);
    PostingsEnum pe = te.postings(null, PostingsEnum.NONE);

    int docFreq = te.docFreq();
    long totalTermFreq = te.totalTermFreq();

    // never allocate a negative size when the client sends a negative count
    int capacity = Math.max(0, Math.min(docFreq, count));
    int[] postings = new int[capacity];
    int filled = 0;
    while (filled < capacity) {
        int docId = pe.nextDoc();
        if (docId == PostingsEnum.NO_MORE_DOCS) {
            break;
        }
        if (liveDocs != null && !liveDocs.get(docId)) {
            continue;
        }
        postings[filled++] = docId;
    }
    // Skipped (deleted) documents leave unused slots at the end of the array;
    // drop them so they are not mistaken for document id 0.
    if (filled < capacity) {
        postings = java.util.Arrays.copyOf(postings, filled);
    }
    return new TermData(term, docFreq, totalTermFreq, postings);
}

From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java

License:Apache License

/**
 * {@inheritDoc}
 *
 * Seeds one weight per term of the text field with the term's total frequency,
 * builds the FST from those weights, then classifies every document that has a
 * value in the class field (and matches {@code query}, when given) and adjusts
 * the weights whenever the assigned class differs from the stored one.
 */
@Override
public void train(LeafReader leafReader, String textFieldName, String classFieldName, Analyzer analyzer,
        Query query) throws IOException {
    this.textTerms = MultiFields.getTerms(leafReader, textFieldName);

    if (textTerms == null) {
        throw new IOException("term vectors need to be available for field " + textFieldName);
    }

    this.analyzer = analyzer;
    this.textFieldName = textFieldName;

    if (threshold == null || threshold == 0d) {
        // no threshold configured: default to half of the field's summed document frequencies
        long sumDocFreq = leafReader.getSumDocFreq(textFieldName);
        if (sumDocFreq != -1) {
            this.threshold = (double) sumDocFreq / 2d;
        } else {
            throw new IOException("threshold cannot be assigned since term vectors for field " + textFieldName
                    + " do not exist");
        }
    }

    // TODO : remove this map as soon as we have a writable FST
    SortedMap<String, Double> weights = new TreeMap<>();

    // initial weight of each term = its total frequency in the field
    TermsEnum termsEnum = textTerms.iterator();
    BytesRef textTerm;
    while ((textTerm = termsEnum.next()) != null) {
        weights.put(textTerm.utf8ToString(), (double) termsEnum.totalTermFreq());
    }
    updateFST(weights);

    IndexSearcher indexSearcher = new IndexSearcher(leafReader);

    int batchCount = 0;

    // select documents that have any value in the class field, optionally narrowed by the caller's query
    BooleanQuery q = new BooleanQuery();
    q.add(new BooleanClause(new WildcardQuery(new Term(classFieldName, "*")), BooleanClause.Occur.MUST));
    if (query != null) {
        q.add(new BooleanClause(query, BooleanClause.Occur.MUST));
    }
    // run the search and use stored field values
    for (ScoreDoc scoreDoc : indexSearcher.search(q, Integer.MAX_VALUE).scoreDocs) {
        Document doc = indexSearcher.doc(scoreDoc.doc);

        IndexableField textField = doc.getField(textFieldName);

        // get the expected result
        IndexableField classField = doc.getField(classFieldName);

        if (textField != null && classField != null) {
            // assign class to the doc
            ClassificationResult<Boolean> classificationResult = assignClass(textField.stringValue());
            Boolean assignedClass = classificationResult.getAssignedClass();

            Boolean correctClass = Boolean.valueOf(classField.stringValue());
            // non-zero when the assigned class differs from the stored one; the sign gives the direction
            long modifier = correctClass.compareTo(assignedClass);
            if (modifier != 0) {
                // the last argument asks for an FST rebuild whenever batchCount is a multiple of batchSize
                updateWeights(leafReader, scoreDoc.doc, assignedClass, weights, modifier,
                        batchCount % batchSize == 0);
            }
            batchCount++;
        }
    }
    weights.clear(); // free memory while waiting for GC
}

From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java

License:Apache License

/**
 * Adjusts the weight of every term in a document's term vector by
 * {@code modifier} times the term's frequency in that document, and optionally
 * rebuilds the FST from the updated weights.
 *
 * @param leafReader    reader used to fetch the document's term vector
 * @param docId         id of the document whose terms are adjusted
 * @param assignedClass class assigned to the document; when null no weight is changed
 * @param weights       term weights, updated in place
 * @param modifier      signed factor applied to each term's local frequency
 * @param updateFST     whether to rebuild the FST from {@code weights} afterwards
 * @throws IOException if the document has no term vector for the text field, or the index cannot be read
 */
private void updateWeights(LeafReader leafReader, int docId, Boolean assignedClass,
        SortedMap<String, Double> weights, double modifier, boolean updateFST) throws IOException {
    // get the doc term vectors
    Terms terms = leafReader.getTermVector(docId, textFieldName);

    if (terms == null) {
        throw new IOException("term vectors must be stored for field " + textFieldName);
    }

    // without an assigned class there is nothing to adjust, so the vector is not walked
    if (assignedClass != null) {
        TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            long termFreqLocal = termsEnum.totalTermFreq();
            // Util.get returns null for a term missing from the FST;
            // start such a term from 0 instead of failing on unboxing
            Long previousValue = Util.get(fst, term);
            double base = previousValue == null ? 0d : previousValue;
            // update weights
            weights.put(term.utf8ToString(), base + modifier * termFreqLocal);
        }
    }
    if (updateFST) {
        updateFST(weights);
    }
}

From source file:com.meizu.nlp.classification.utils.DocToDoubleVectorUtils.java

License:Apache License

/**
 * create a sparse <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc
 * @param docTerms term vectors for a given document
 * @param fieldTerms field term vectors/*from  w  w  w.j  ava 2  s. c o  m*/
 * @return a sparse vector of <code>Double</code>s as an array
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException {
    TermsEnum fieldTermsEnum = fieldTerms.iterator();
    Double[] freqVector = null;
    if (docTerms != null && fieldTerms.size() > -1) {
        freqVector = new Double[(int) fieldTerms.size()];
        int i = 0;
        TermsEnum docTermsEnum = docTerms.iterator();
        BytesRef term;
        while ((term = fieldTermsEnum.next()) != null) {
            TermsEnum.SeekStatus seekStatus = docTermsEnum.seekCeil(term);
            if (seekStatus.equals(TermsEnum.SeekStatus.END)) {
                docTermsEnum = docTerms.iterator();
            }
            if (seekStatus.equals(TermsEnum.SeekStatus.FOUND)) {
                long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document
                freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
            } else {
                freqVector[i] = 0d;
            }
            i++;
        }
    }
    return freqVector;
}