Example usage for org.apache.lucene.index Terms getDocCount

List of usage examples for org.apache.lucene.index Terms getDocCount

Introduction

On this page you can find example usages of org.apache.lucene.index Terms getDocCount.

Prototype

public abstract int getDocCount() throws IOException;

Source Link

Document

Returns the number of documents that have at least one term for this field.

Usage

From source file:SimpleNaiveBayesClassifier.java

License:Apache License

/**
 * Returns the average number of unique terms per document (summed over all text fields)
 * times the number of docs belonging to the input class.
 *
 * @param term the term representing the class
 * @return the average number of unique terms times the class doc count
 * @throws IOException if a low level I/O problem happens
 */
private double getTextTermFreqForClass(Term term) throws IOException {
    double avgNumberOfUniqueTerms = 0;
    for (String textFieldName : textFieldNames) {
        Terms terms = MultiFields.getTerms(leafReader, textFieldName);
        // MultiFields.getTerms returns null for a field with no indexed terms;
        // skip such fields instead of throwing a NullPointerException.
        if (terms != null) {
            long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
            avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
        }
    }
    int docsWithC = leafReader.docFreq(term);
    return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text fields per doc * # docs with c
}

From source file:SimpleNaiveBayesDocumentClassifier.java

License:Apache License

/**
 * Returns the average number of unique terms times the number of docs belonging to the input class
 *
 * @param  term the class term/*from   w  w w .j a v a 2 s .c  o m*/
 * @return the average number of unique terms
 * @throws java.io.IOException If there is a low-level I/O error
 */
private double getTextTermFreqForClass(Term term, String fieldName) throws IOException {
    double avgNumberOfUniqueTerms;
    Terms terms = MultiFields.getTerms(leafReader, fieldName);
    long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
    avgNumberOfUniqueTerms = numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
    int docsWithC = leafReader.docFreq(term);
    return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text fields per doc * # docs with c
}

From source file:com.basistech.lucene.tools.LuceneQueryTool.java

License:Apache License

/**
 * Prints, for every known field, the number of documents that have at least
 * one term for that field, summed across all leaf readers.
 *
 * @throws IOException if reading the index fails
 */
private void countFields() throws IOException {
    // leaves() does not change per field; fetch it once outside the loop.
    List<LeafReaderContext> leaves = indexReader.leaves();
    for (String field : allFieldNames) {
        int count = 0;
        for (LeafReaderContext leaf : leaves) {
            Terms terms = leaf.reader().terms(field);
            if (terms == null) {
                continue; // field absent from this segment
            }
            count += terms.getDocCount();
        }
        // The original built a single-entry TreeMap per field only to iterate it
        // once; printing directly produces the same "field: count" output.
        defaultOut.println(field + ": " + count);
    }
}

From source file:com.github.flaxsearch.api.TermsData.java

License:Apache License

/**
 * Captures a snapshot of statistics from a {@link Terms} instance alongside a
 * pre-built list of term strings.
 *
 * @param terms     the terms to read statistics from
 * @param termsList the already-collected term strings to expose
 * @param encoding  encoding name passed to {@code BytesRefUtils.encode} for min/max terms
 * @throws IOException if reading the terms statistics fails
 */
public TermsData(Terms terms, List<String> termsList, String encoding) throws IOException {
    this.terms = termsList;
    this.docCount = terms.getDocCount();
    this.termCount = terms.size();
    this.minTerm = BytesRefUtils.encode(terms.getMin(), encoding);
    this.maxTerm = BytesRefUtils.encode(terms.getMax(), encoding);
}

From source file:com.meizu.nlp.classification.CachingNaiveBayesClassifier.java

License:Apache License

/**
 * Builds the frame of the cache. The cache stores word occurrences in memory
 * after they have been searched once; in proper use this can give a 2-100x
 * speedup, but it can consume a lot of memory. Memory use can be lowered by
 * filtering out words whose occurrence count in the index is very low. The
 * second parameter switches the term-searching mode: if true, only terms in
 * the skeleton are searched; if false, terms not in the cache are searched as
 * well (but not cached).
 *
 * @param minTermOccurrenceInCache Lower cache size with higher value.
 * @param justCachedTerms          The switch for fully excluding low-occurrence terms.
 * @throws IOException If there is a low-level I/O error.
 */
public void reInitCache(int minTermOccurrenceInCache, boolean justCachedTerms) throws IOException {
    this.justCachedTerms = justCachedTerms;

    this.docsWithClassSize = countDocsWithClass();
    // Reset all cached state before rebuilding.
    termCClassHitCache.clear();
    cclasses.clear();
    classTermFreq.clear();

    // build the cache for the word
    Map<String, Long> frequencyMap = new HashMap<>();
    for (String textFieldName : textFieldNames) {
        // NOTE(review): leafReader.terms(...) may return null for a field with no
        // indexed terms, which would NPE here — confirm all text fields are indexed.
        TermsEnum termsEnum = leafReader.terms(textFieldName).iterator();
        while (termsEnum.next() != null) {
            BytesRef term = termsEnum.term();
            String termText = term.utf8ToString();
            long frequency = termsEnum.docFreq();
            // Accumulate doc frequencies for the same term text across all text fields.
            Long lastfreq = frequencyMap.get(termText);
            if (lastfreq != null)
                frequency += lastfreq;
            frequencyMap.put(termText, frequency);
        }
    }
    // Pre-create a cache row only for terms frequent enough to be worth caching.
    for (Map.Entry<String, Long> entry : frequencyMap.entrySet()) {
        if (entry.getValue() > minTermOccurrenceInCache) {
            termCClassHitCache.put(entry.getKey(), new ConcurrentHashMap<BytesRef, Integer>());
        }
    }

    // fill the class list
    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    while ((termsEnum.next()) != null) {
        // deepCopyOf: the enum's BytesRef is reused on each next(), so copy it.
        cclasses.add(BytesRef.deepCopyOf(termsEnum.term()));
    }
    // fill the classTermFreq map
    for (BytesRef cclass : cclasses) {
        double avgNumberOfUniqueTerms = 0;
        for (String textFieldName : textFieldNames) {
            terms = MultiFields.getTerms(leafReader, textFieldName);
            long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
            // NOTE(review): a getDocCount() of 0 (or -1 when the statistic is
            // unavailable) makes this division produce Infinity/negative values —
            // confirm the text fields are always non-empty.
            avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount();
        }
        int docsWithC = leafReader.docFreq(new Term(classFieldName, cclass));
        classTermFreq.put(cclass, avgNumberOfUniqueTerms * docsWithC);
    }
}

From source file:com.meizu.nlp.classification.SimpleNaiveBayesClassifier.java

License:Apache License

/**
 * Returns the average number of unique terms per document (summed over all text fields)
 * times the number of docs belonging to the input class.
 *
 * @param c the class value
 * @return the average number of unique terms times the class doc count
 * @throws IOException if a low-level I/O error occurs
 */
private double getTextTermFreqForClass(BytesRef c) throws IOException {
    double avgNumberOfUniqueTerms = 0;
    for (String textFieldName : textFieldNames) {
        Terms terms = MultiFields.getTerms(leafReader, textFieldName);
        // MultiFields.getTerms returns null for a field with no indexed terms;
        // skip such fields instead of throwing a NullPointerException.
        if (terms != null) {
            long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
            avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
        }
    }
    int docsWithC = leafReader.docFreq(new Term(classFieldName, c));
    return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text fields per doc * # docs with c
}

From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java

License:Apache License

/**
 * Checks that collection-level statistics agree between two {@link Terms}
 * instances. A statistic reported as -1 (unavailable) on either side is
 * skipped rather than compared.
 */
public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception {
    final int leftDocCount = leftTerms.getDocCount();
    final int rightDocCount = rightTerms.getDocCount();
    if (leftDocCount != -1 && rightDocCount != -1) {
        assertEquals(leftDocCount, rightDocCount);
    }
    final long leftSumDocFreq = leftTerms.getSumDocFreq();
    final long rightSumDocFreq = rightTerms.getSumDocFreq();
    if (leftSumDocFreq != -1 && rightSumDocFreq != -1) {
        assertEquals(leftSumDocFreq, rightSumDocFreq);
    }
    final long leftSumTotalTermFreq = leftTerms.getSumTotalTermFreq();
    final long rightSumTotalTermFreq = rightTerms.getSumTotalTermFreq();
    if (leftSumTotalTermFreq != -1 && rightSumTotalTermFreq != -1) {
        assertEquals(leftSumTotalTermFreq, rightSumTotalTermFreq);
    }
    final long leftSize = leftTerms.size();
    final long rightSize = rightTerms.size();
    if (leftSize != -1 && rightSize != -1) {
        assertEquals(leftSize, rightSize);
    }
}

From source file:de.unihildesheim.iw.lucene.search.EmptyFieldFilter.java

License:Open Source License

/**
 * Builds a doc-id set containing only the accepted documents that have at
 * least one term in {@code this.field}.
 *
 * @param context    the leaf reader context to search
 * @param acceptDocs documents allowed to match, or {@code null} for all live docs
 * @return a {@link BitDocIdSet} of accepted documents with a value in the field
 * @throws IOException if reading the index fails
 */
@Override
public DocIdSet getDocIdSet(@NotNull final LeafReaderContext context, @Nullable final Bits acceptDocs)
        throws IOException {
    FixedBitSet checkBits;
    final LeafReader reader = context.reader();
    final int maxDoc = reader.maxDoc();

    BitSet finalBits = new SparseFixedBitSet(maxDoc);
    if (acceptDocs == null) {
        checkBits = BitsUtils.bits2FixedBitSet(reader.getLiveDocs());
        if (checkBits == null) {
            // all live
            checkBits = new FixedBitSet(maxDoc);
            checkBits.set(0, checkBits.length());
        }
    } else {
        checkBits = BitsUtils.bits2FixedBitSet(acceptDocs);
    }

    @Nullable
    final Terms terms = reader.terms(this.field);
    if (terms != null) {
        final int termsDocCount = terms.getDocCount();

        if (termsDocCount != 0) {
            if (termsDocCount == maxDoc) {
                // every document has the field — all accepted docs match
                finalBits = checkBits;
            } else {
                // Reuse the Terms already fetched above; the original re-fetched
                // reader.terms(this.field) and re-checked it for null redundantly.
                PostingsEnum pe = null;
                final TermsEnum te = terms.iterator(null);
                int docId;
                while (te.next() != null) {
                    pe = te.postings(checkBits, pe, (int) PostingsEnum.NONE);
                    while ((docId = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                        // getAndClear ensures each doc is transferred at most once
                        // even if it matches several terms.
                        if (checkBits.getAndClear(docId)) {
                            finalBits.set(docId);
                        }
                    }
                }
            }
        }
    }
    return new BitDocIdSet(finalBits);
}

From source file:org.apache.solr.handler.admin.LukeRequestHandler.java

License:Apache License

/**
 * Builds per-field index information (type, schema flags, dynamic base, index
 * flags for a sample document, and the field's doc count) for every field in
 * the index, optionally restricted by the "fl" request parameter.
 *
 * @param req the Solr request carrying the searcher and parameters
 * @return an ordered map keyed by field name
 * @throws Exception if reading the index or schema fails
 */
private static SimpleOrderedMap<Object> getIndexedFieldsInfo(SolrQueryRequest req) throws Exception {

    SolrIndexSearcher searcher = req.getSearcher();
    SolrParams params = req.getParams();

    // Optional comma/whitespace-separated list of fields restricting the output.
    Set<String> fields = null;
    String fl = params.get(CommonParams.FL);
    if (fl != null) {
        fields = new TreeSet<String>(Arrays.asList(fl.split("[,\\s]+")));
    }

    AtomicReader reader = searcher.getAtomicReader();
    IndexSchema schema = searcher.getSchema();

    // Don't be tempted to put this in the loop below, the whole point here is to alphabetize the fields!
    Set<String> fieldNames = new TreeSet<String>();
    for (FieldInfo fieldInfo : reader.getFieldInfos()) {
        fieldNames.add(fieldInfo.name);
    }

    // Walk the term enum and keep a priority queue for each map in our set
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();

    for (String fieldName : fieldNames) {
        if (fields != null && !fields.contains(fieldName) && !fields.contains("*")) {
            continue; //we're not interested in this field Still an issue here
        }

        SimpleOrderedMap<Object> fieldMap = new SimpleOrderedMap<Object>();

        SchemaField sfield = schema.getFieldOrNull(fieldName);
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        fieldMap.add("type", (ftype == null) ? null : ftype.getTypeName());
        fieldMap.add("schema", getFieldFlags(sfield));
        if (sfield != null && schema.isDynamicField(sfield.getName())
                && schema.getDynamicPattern(sfield.getName()) != null) {
            fieldMap.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
        }
        Terms terms = reader.fields().terms(fieldName);
        if (terms == null) { // Not indexed, so we need to report what we can (it made it through the fl param if specified)
            finfo.add(fieldName, fieldMap);
            continue;
        }

        if (sfield != null && sfield.indexed()) {
            // In the pre-4.0 days, this did a veeeery expensive range query. But we can be much faster now,
            // so just do this all the time.
            Document doc = getFirstLiveDoc(terms, reader);

            if (doc != null) {
                // Found a document with this field
                try {
                    IndexableField fld = doc.getField(fieldName);
                    if (fld != null) {
                        fieldMap.add("index", getFieldFlags(fld));
                    } else {
                        // it is a non-stored field...
                        fieldMap.add("index", "(unstored field)");
                    }
                } catch (Exception ex) {
                    log.warn("error reading field: " + fieldName);
                }
            }
            // Number of docs that have at least one term for this field.
            fieldMap.add("docs", terms.getDocCount());

        }
        if (fields != null && (fields.contains(fieldName) || fields.contains("*"))) {
            getDetailedFieldInfo(req, fieldName, fieldMap);
        }
        // Add the field
        finfo.add(fieldName, fieldMap);
    }
    return finfo;
}

From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java

License:Open Source License

/**
 * Builds per-field index information for every field in the index, optionally
 * restricted by the "fl" request parameter. Each field is reported twice: once
 * under its raw index name and once under its Alfresco property name, with the
 * two orderings concatenated in the result.
 *
 * @param req the Solr request carrying the searcher and parameters
 * @return an ordered map with value-name entries followed by Alfresco-name entries
 * @throws Exception if reading the index or schema fails
 */
private static SimpleOrderedMap<Object> getIndexedFieldsInfo(SolrQueryRequest req) throws Exception {

    SolrIndexSearcher searcher = req.getSearcher();
    SolrParams params = req.getParams();

    // Optional comma/whitespace-separated list of fields restricting the output.
    Set<String> fields = null;
    String fl = params.get(CommonParams.FL);
    if (fl != null) {
        fields = new TreeSet<>(Arrays.asList(fl.split("[,\\s]+")));
    }

    LeafReader reader = searcher.getSlowAtomicReader();
    IndexSchema schema = searcher.getSchema();

    // Don't be tempted to put this in the loop below, the whole point here
    // is to alphabetize the fields!
    Set<String> fieldNames = new TreeSet<>();
    for (FieldInfo fieldInfo : reader.getFieldInfos()) {
        fieldNames.add(fieldInfo.name);
    }

    // Walk the term enum and keep a priority queue for each map in our set.
    // vInfo is keyed by the raw field name, aInfo by the Alfresco property name.
    SimpleOrderedMap<Object> vInfo = new SimpleOrderedMap<>();
    SimpleOrderedMap<Object> aInfo = new SimpleOrderedMap<>();

    for (String fieldName : fieldNames) {
        if (fields != null && !fields.contains(fieldName) && !fields.contains("*")) {
            continue; // we're not interested in this field Still an issue
                      // here
        }

        SimpleOrderedMap<Object> fieldMap = new SimpleOrderedMap<>();

        SchemaField sfield = schema.getFieldOrNull(fieldName);
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        fieldMap.add("type", (ftype == null) ? null : ftype.getTypeName());
        fieldMap.add("schema", getFieldFlags(sfield));
        if (sfield != null && schema.isDynamicField(sfield.getName())
                && schema.getDynamicPattern(sfield.getName()) != null) {
            fieldMap.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
        }
        Terms terms = reader.fields().terms(fieldName);
        if (terms == null) { // Not indexed, so we need to report what we
                             // can (it made it through the fl param if
                             // specified)
            vInfo.add(AlfrescoSolrDataModel.getInstance().getAlfrescoPropertyFromSchemaField(fieldName),
                    fieldMap);
            aInfo.add(fieldName, fieldMap);
            continue;
        }

        if (sfield != null && sfield.indexed()) {
            if (params.getBool(INCLUDE_INDEX_FIELD_FLAGS, true)) {
                Document doc = getFirstLiveDoc(terms, reader);

                if (doc != null) {
                    // Found a document with this field
                    try {
                        IndexableField fld = doc.getField(fieldName);
                        if (fld != null) {
                            fieldMap.add("index", getFieldFlags(fld));
                        } else {
                            // it is a non-stored field...
                            fieldMap.add("index", "(unstored field)");
                        }
                    } catch (Exception ex) {
                        log.warn("error reading field: " + fieldName);
                    }
                }
            }
            // Number of docs that have at least one term for this field.
            fieldMap.add("docs", terms.getDocCount());

        }
        if (fields != null && (fields.contains(fieldName) || fields.contains("*"))) {
            getDetailedFieldInfo(req, fieldName, fieldMap);
        }
        // Add the field
        vInfo.add(fieldName, fieldMap);
        aInfo.add(AlfrescoSolrDataModel.getInstance().getAlfrescoPropertyFromSchemaField(fieldName), fieldMap);
    }

    // Concatenate both orderings into a single result map.
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();
    finfo.addAll(vInfo);
    // finfo.add("mimetype()", finfo.get("cm:content.mimetype"));
    // finfo.add("contentSize()", finfo.get("cm:content.size"));
    finfo.addAll(aInfo);
    return finfo;
}