Example usage for org.apache.lucene.index LeafReader getSumDocFreq

List of usage examples for org.apache.lucene.index LeafReader getSumDocFreq

Introduction

On this page you can find an example usage of org.apache.lucene.index.LeafReader.getSumDocFreq.

Prototype

@Override
    public final long getSumDocFreq(String field) throws IOException 

Source Link

Usage

From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java

License:Apache License

/**
 * {@inheritDoc}
 */
@Override
public void train(LeafReader leafReader, String textFieldName, String classFieldName, Analyzer analyzer,
        Query query) throws IOException {
    this.textTerms = MultiFields.getTerms(leafReader, textFieldName);
    if (this.textTerms == null) {
        throw new IOException("term vectors need to be available for field " + textFieldName);
    }

    this.analyzer = analyzer;
    this.textFieldName = textFieldName;

    // Derive a threshold automatically when none was supplied:
    // half of the field's total document frequency.
    if (threshold == null || threshold == 0d) {
        long sumDocFreq = leafReader.getSumDocFreq(textFieldName);
        if (sumDocFreq == -1) {
            throw new IOException("threshold cannot be assigned since term vectors for field " + textFieldName
                    + " do not exist");
        }
        this.threshold = (double) sumDocFreq / 2d;
    }

    // Seed every term's weight with its total term frequency.
    // TODO : remove this map as soon as we have a writable FST
    SortedMap<String, Double> weights = new TreeMap<>();
    TermsEnum termsEnum = textTerms.iterator();
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
        weights.put(term.utf8ToString(), (double) termsEnum.totalTermFreq());
    }
    updateFST(weights);

    IndexSearcher searcher = new IndexSearcher(leafReader);

    // Match every document that has a class assigned, optionally narrowed by the caller's query.
    BooleanQuery trainingQuery = new BooleanQuery();
    trainingQuery.add(new BooleanClause(new WildcardQuery(new Term(classFieldName, "*")), BooleanClause.Occur.MUST));
    if (query != null) {
        trainingQuery.add(new BooleanClause(query, BooleanClause.Occur.MUST));
    }

    // Run the search and use stored field values.
    int batchCount = 0;
    for (ScoreDoc hit : searcher.search(trainingQuery, Integer.MAX_VALUE).scoreDocs) {
        Document doc = searcher.doc(hit.doc);

        IndexableField textField = doc.getField(textFieldName);
        // The expected result for this document.
        IndexableField classField = doc.getField(classFieldName);

        if (textField == null || classField == null) {
            continue;
        }

        // Assign a class to the doc, then correct the weights on a wrong prediction.
        ClassificationResult<Boolean> classificationResult = assignClass(textField.stringValue());
        Boolean assignedClass = classificationResult.getAssignedClass();
        Boolean correctClass = Boolean.valueOf(classField.stringValue());
        long modifier = correctClass.compareTo(assignedClass);
        if (modifier != 0) {
            updateWeights(leafReader, hit.doc, assignedClass, weights, modifier,
                    batchCount % batchSize == 0);
        }
        batchCount++;
    }
    weights.clear(); // free memory while waiting for GC
}