List of usage examples for the org.apache.lucene.classification.ClassificationResult constructor
public ClassificationResult(T assignedClass, double score)
From source file:SimpleNaiveBayesClassifier.java
License:Apache License
/** * Calculate probabilities for all classes for a given input text * @param inputDocument the input text as a {@code String} * @return a {@code List} of {@code ClassificationResult}, one for each existing class * @throws IOException if assigning probabilities fails */// w ww . ja v a2s .c o m protected List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException { List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>(); Terms classes = MultiFields.getTerms(leafReader, classFieldName); TermsEnum classesEnum = classes.iterator(); BytesRef next; String[] tokenizedText = tokenize(inputDocument); int docsWithClassSize = countDocsWithClass(); while ((next = classesEnum.next()) != null) { if (next.length > 0) { Term term = new Term(this.classFieldName, next); double clVal = calculateLogPrior(term, docsWithClassSize) + calculateLogLikelihood(tokenizedText, term, docsWithClassSize); assignedClasses.add(new ClassificationResult<>(term.bytes(), clVal)); } } // normalization; the values transforms to a 0-1 range return normClassificationResults(assignedClasses); }
From source file:SimpleNaiveBayesClassifier.java
License:Apache License
/** * Normalize the classification results based on the max score available * @param assignedClasses the list of assigned classes * @return the normalized results/* w w w. j a va2s. co m*/ */ protected ArrayList<ClassificationResult<BytesRef>> normClassificationResults( List<ClassificationResult<BytesRef>> assignedClasses) { // normalization; the values transforms to a 0-1 range ArrayList<ClassificationResult<BytesRef>> returnList = new ArrayList<>(); if (!assignedClasses.isEmpty()) { Collections.sort(assignedClasses); // this is a negative number closest to 0 = a double smax = assignedClasses.get(0).getScore(); double sumLog = 0; // log(sum(exp(x_n-a))) for (ClassificationResult<BytesRef> cr : assignedClasses) { // getScore-smax <=0 (both negative, smax is the smallest abs() sumLog += Math.exp(cr.getScore() - smax); } // loga=a+log(sum(exp(x_n-a))) = log(sum(exp(x_n))) double loga = smax; loga += Math.log(sumLog); // 1/sum*x = exp(log(x))*1/sum = exp(log(x)-log(sum)) for (ClassificationResult<BytesRef> cr : assignedClasses) { double scoreDiff = cr.getScore() - loga; returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(scoreDiff))); } } return returnList; }
From source file:KNearestNeighborClassifier.java
License:Apache License
/** * build a list of classification results from search results * @param topDocs the search results as a {@link TopDocs} object * @return a {@link List} of {@link ClassificationResult}, one for each existing class * @throws IOException if it's not possible to get the stored value of class field */// ww w .java 2 s.c o m protected List<ClassificationResult<BytesRef>> buildListFromTopDocs(TopDocs topDocs) throws IOException { Map<BytesRef, Integer> classCounts = new HashMap<>(); Map<BytesRef, Double> classBoosts = new HashMap<>(); // this is a boost based on class ranking positions in topDocs float maxScore = topDocs.getMaxScore(); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { IndexableField storableField = indexSearcher.doc(scoreDoc.doc).getField(classFieldName); if (storableField != null) { BytesRef cl = new BytesRef(storableField.stringValue()); //update count Integer count = classCounts.get(cl); if (count != null) { classCounts.put(cl, count + 1); } else { classCounts.put(cl, 1); } //update boost, the boost is based on the best score Double totalBoost = classBoosts.get(cl); double singleBoost = scoreDoc.score / maxScore; if (totalBoost != null) { classBoosts.put(cl, totalBoost + singleBoost); } else { classBoosts.put(cl, singleBoost); } } } List<ClassificationResult<BytesRef>> returnList = new ArrayList<>(); List<ClassificationResult<BytesRef>> temporaryList = new ArrayList<>(); int sumdoc = 0; for (Map.Entry<BytesRef, Integer> entry : classCounts.entrySet()) { Integer count = entry.getValue(); Double normBoost = classBoosts.get(entry.getKey()) / count; //the boost is normalized to be 0<b<1 temporaryList.add(new ClassificationResult<>(entry.getKey().clone(), (count * normBoost) / (double) k)); sumdoc += count; } //correction if (sumdoc < k) { for (ClassificationResult<BytesRef> cr : temporaryList) { returnList.add( new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc)); } } else { returnList = temporaryList; } return returnList; }
From source file:SimpleNaiveBayesDocumentClassifier.java
License:Apache License
/**
 * Scores every class term in the index against an analyzed seed document,
 * field by field, and returns the normalized results.
 *
 * @param inputDocument the seed document to classify
 * @return normalized classification results, one per class term
 * @throws IOException if reading terms or statistics from the index fails
 */
private List<ClassificationResult<BytesRef>> assignNormClasses(Document inputDocument) throws IOException {
    List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
    // per-field token arrays and boosts, filled in by analyzeSeedDocument;
    // LinkedHashMap keeps field iteration in insertion order
    Map<String, List<String[]>> fieldName2tokensArray = new LinkedHashMap<>();
    Map<String, Float> fieldName2boost = new LinkedHashMap<>();
    Terms classes = MultiFields.getTerms(leafReader, classFieldName);
    // NOTE(review): no null check on classes here — getTerms can return null
    // when the class field has no terms, which would NPE below; the sibling
    // assignClassListJaccard variant guards this. Confirm whether callers
    // guarantee the field exists.
    TermsEnum classesEnum = classes.iterator();
    BytesRef c;
    analyzeSeedDocument(inputDocument, fieldName2tokensArray, fieldName2boost);
    int docsWithClassSize = countDocsWithClass();
    while ((c = classesEnum.next()) != null) {
        double classScore = 0;
        Term term = new Term(this.classFieldName, c);
        for (String fieldName : textFieldNames) {
            List<String[]> tokensArrays = fieldName2tokensArray.get(fieldName);
            double fieldScore = 0;
            for (String[] fieldTokensArray : tokensArrays) {
                // NOTE(review): by operator precedence the boost multiplies
                // only the log-likelihood term, not the prior, and the log
                // prior is added once per tokens array (so fields analyzed
                // into multiple token arrays add the prior repeatedly).
                // Looks intentional in the original but worth confirming.
                fieldScore += calculateLogPrior(term, docsWithClassSize)
                        + calculateLogLikelihood(fieldTokensArray, fieldName, term, docsWithClassSize)
                                * fieldName2boost.get(fieldName);
            }
            classScore += fieldScore;
        }
        assignedClasses.add(new ClassificationResult<>(term.bytes(), classScore));
    }
    // normalization into a 0-1 range
    return normClassificationResults(assignedClasses);
}
From source file:com.github.tteofili.looseen.MinHashClassifier.java
License:Apache License
/**
 * Builds a list of classification results from search results.
 * Each class's score combines its frequency among the top {@code k} hits
 * with the average ranking boost of its documents, scaled into a 0-1
 * range; if fewer than {@code k} hits carried a class value, the scores
 * are rescaled accordingly.
 *
 * @param searcher the searcher used to load stored fields for each hit
 * @param categoryFieldName the stored field holding the class value
 * @param topDocs the search results as a {@link TopDocs} object
 * @param k the number of neighbors the scores are scaled against
 * @return a {@link List} of {@link ClassificationResult}, one for each existing class
 * @throws IOException if it's not possible to get the stored value of class field
 */
List<ClassificationResult<BytesRef>> buildListFromTopDocs(IndexSearcher searcher, String categoryFieldName,
        TopDocs topDocs, int k) throws IOException {
    Map<BytesRef, Integer> classCounts = new HashMap<>();
    // per-class boost based on the documents' ranking scores in topDocs
    Map<BytesRef, Double> classBoosts = new HashMap<>();
    float maxScore = topDocs.getMaxScore();
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        IndexableField storableField = searcher.doc(scoreDoc.doc).getField(categoryFieldName);
        if (storableField != null) {
            BytesRef cl = new BytesRef(storableField.stringValue());
            // Map.merge replaces the explicit get / null-check / put dance
            classCounts.merge(cl, 1, Integer::sum);
            // this document's boost contribution, normalized by the best score
            double singleBoost = scoreDoc.score / maxScore;
            classBoosts.merge(cl, singleBoost, Double::sum);
        }
    }
    List<ClassificationResult<BytesRef>> temporaryList = new ArrayList<>();
    int sumdoc = 0;
    for (Map.Entry<BytesRef, Integer> entry : classCounts.entrySet()) {
        int count = entry.getValue();
        // average boost per document of this class; normalized to 0<b<1
        double normBoost = classBoosts.get(entry.getKey()) / count;
        temporaryList.add(new ClassificationResult<>(entry.getKey().clone(), (count * normBoost) / (double) k));
        sumdoc += count;
    }
    // correction: fewer than k hits had a class value, rescale by k/sumdoc
    if (sumdoc < k) {
        List<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
        for (ClassificationResult<BytesRef> cr : temporaryList) {
            returnList.add(
                    new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc));
        }
        return returnList;
    }
    return temporaryList;
}
From source file:com.github.tteofili.looseen.QueryingClassifier.java
License:Apache License
/**
 * Assigns the class whose associated query scores best against the index.
 * Each class's score is either its query's hit count ({@code useCounts})
 * or the top document's score; the highest-scoring class wins.
 *
 * @param text the input text (unused: scoring relies only on the per-class queries)
 * @return the best-scoring class, or {@code null} if there are no class queries
 * @throws IOException if searching the index fails
 */
@Override
public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
    ClassificationResult<BytesRef> best = null;
    for (Map.Entry<String, Query> classQuery : queriesPerClass.entrySet()) {
        TopDocs hits = indexSearcher.search(classQuery.getValue(), 1);
        // score the class either by total hit count or by the best document score
        float candidateScore = useCounts ? hits.totalHits : hits.getMaxScore();
        // keep the running maximum (first class seen initializes it)
        if (best == null || candidateScore > best.getScore()) {
            best = new ClassificationResult<>(new BytesRef(classQuery.getKey()), candidateScore);
        }
    }
    return best;
}
From source file:Others.SampleLuceneClassifier.java
/** * Calculate probabilities for all classes for a given input text * @param inputDocument the input text as a {@code String} * @return a {@code List} of {@code ClassificationResult}, one for each existing class * @throws IOException if assigning probabilities fails *///from w w w .j a va 2s. com protected List<ClassificationResult<BytesRef>> assignClassListJaccard(String inputDocument) throws IOException { List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>(); Terms classes = MultiFields.getTerms(indexReader, classFieldName); if (classes != null) { TermsEnum classesEnum = classes.iterator(); BytesRef next; String[] tokenizedText = tokenize(inputDocument); int docsWithClassSize = countDocsWithClass(); while ((next = classesEnum.next()) != null) { if (next.length > 0) { Term term = new Term(this.classFieldName, next); double clVal = calculateLogPrior(term, docsWithClassSize) + calculateLogLikelihood(tokenizedText, term, docsWithClassSize); assignedClasses.add(new ClassificationResult<>(term.bytes(), clVal)); } } } // normalization; the values transforms to a 0-1 range return normClassificationResults(assignedClasses); }