List of usage examples for org.apache.lucene.index.LeafReader#getSumDocFreq
@Override public final long getSumDocFreq(String field) throws IOException
From source file: com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License: Apache License
/** * {@inheritDoc}//from w ww.j av a2s .com */ @Override public void train(LeafReader leafReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query) throws IOException { this.textTerms = MultiFields.getTerms(leafReader, textFieldName); if (textTerms == null) { throw new IOException("term vectors need to be available for field " + textFieldName); } this.analyzer = analyzer; this.textFieldName = textFieldName; if (threshold == null || threshold == 0d) { // automatic assign a threshold long sumDocFreq = leafReader.getSumDocFreq(textFieldName); if (sumDocFreq != -1) { this.threshold = (double) sumDocFreq / 2d; } else { throw new IOException("threshold cannot be assigned since term vectors for field " + textFieldName + " do not exist"); } } // TODO : remove this map as soon as we have a writable FST SortedMap<String, Double> weights = new TreeMap<>(); TermsEnum termsEnum = textTerms.iterator(); BytesRef textTerm; while ((textTerm = termsEnum.next()) != null) { weights.put(textTerm.utf8ToString(), (double) termsEnum.totalTermFreq()); } updateFST(weights); IndexSearcher indexSearcher = new IndexSearcher(leafReader); int batchCount = 0; BooleanQuery q = new BooleanQuery(); q.add(new BooleanClause(new WildcardQuery(new Term(classFieldName, "*")), BooleanClause.Occur.MUST)); if (query != null) { q.add(new BooleanClause(query, BooleanClause.Occur.MUST)); } // run the search and use stored field values for (ScoreDoc scoreDoc : indexSearcher.search(q, Integer.MAX_VALUE).scoreDocs) { Document doc = indexSearcher.doc(scoreDoc.doc); IndexableField textField = doc.getField(textFieldName); // get the expected result IndexableField classField = doc.getField(classFieldName); if (textField != null && classField != null) { // assign class to the doc ClassificationResult<Boolean> classificationResult = assignClass(textField.stringValue()); Boolean assignedClass = classificationResult.getAssignedClass(); Boolean correctClass = 
Boolean.valueOf(classField.stringValue()); long modifier = correctClass.compareTo(assignedClass); if (modifier != 0) { updateWeights(leafReader, scoreDoc.doc, assignedClass, weights, modifier, batchCount % batchSize == 0); } batchCount++; } } weights.clear(); // free memory while waiting for GC }