Example usage for org.apache.lucene.classification ClassificationResult ClassificationResult

List of usage examples for org.apache.lucene.classification ClassificationResult ClassificationResult

Introduction

On this page you can find example usages of the org.apache.lucene.classification ClassificationResult constructor.

Prototype

public ClassificationResult(T assignedClass, double score) 

Source Link

Document

Constructor

Usage

From source file:SimpleNaiveBayesClassifier.java

License:Apache License

/**
 * Calculate probabilities for all classes for a given input text
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 * @throws IOException if assigning probabilities fails
 */// w ww  . ja  v  a2s  .c  o  m
/**
 * Calculates per-class log-probability scores for a given input text.
 *
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 * @throws IOException if assigning probabilities fails
 */
protected List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument)
        throws IOException {
    List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();

    Terms classes = MultiFields.getTerms(leafReader, classFieldName);
    // MultiFields.getTerms returns null when no document has the class field;
    // guard it to avoid a NullPointerException (the Jaccard variant below does the same).
    if (classes != null) {
        TermsEnum classesEnum = classes.iterator();
        BytesRef next;
        String[] tokenizedText = tokenize(inputDocument);
        int docsWithClassSize = countDocsWithClass();
        while ((next = classesEnum.next()) != null) {
            if (next.length > 0) {
                Term term = new Term(this.classFieldName, next);
                // score = log(P(class)) + log(P(tokens | class))
                double clVal = calculateLogPrior(term, docsWithClassSize)
                        + calculateLogLikelihood(tokenizedText, term, docsWithClassSize);
                assignedClasses.add(new ClassificationResult<>(term.bytes(), clVal));
            }
        }
    }

    // normalization; the values are transformed to a 0-1 range
    return normClassificationResults(assignedClasses);
}

From source file:SimpleNaiveBayesClassifier.java

License:Apache License

/**
 * Normalize the classification results based on the max score available
 * @param assignedClasses the list of assigned classes
 * @return the normalized results/* w  w  w.  j  a  va2s. co m*/
 */
/**
 * Normalizes the classification results based on the max score available,
 * using the log-sum-exp trick: each log-score x_n becomes
 * exp(x_n - log(sum(exp(x_i)))), i.e. a value in the 0-1 range, computed
 * without underflowing {@code Math.exp} on large negative log-scores.
 * NOTE: the input list is sorted in place as a side effect.
 *
 * @param assignedClasses the list of assigned classes with raw log-scores
 * @return the normalized results
 */
protected ArrayList<ClassificationResult<BytesRef>> normClassificationResults(
        List<ClassificationResult<BytesRef>> assignedClasses) {
    // normalization; the values are transformed to a 0-1 range
    ArrayList<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
    if (!assignedClasses.isEmpty()) {
        // sort so that get(0) holds the maximum score
        // (relies on ClassificationResult.compareTo ordering best-first — TODO confirm)
        Collections.sort(assignedClasses);
        // a = max score; scores are negative logs, so this is the one closest to 0
        double smax = assignedClasses.get(0).getScore();

        double sumLog = 0;
        // sum(exp(x_n - a)); each exponent is <= 0, so Math.exp stays in (0, 1]
        for (ClassificationResult<BytesRef> cr : assignedClasses) {
            sumLog += Math.exp(cr.getScore() - smax);
        }
        // loga = a + log(sum(exp(x_n - a))) = log(sum(exp(x_n)))
        double loga = smax;
        loga += Math.log(sumLog);

        // normalized score: exp(x_n - log(sum)) = exp(x_n) / sum(exp(x_i))
        for (ClassificationResult<BytesRef> cr : assignedClasses) {
            double scoreDiff = cr.getScore() - loga;
            returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(scoreDiff)));
        }
    }
    return returnList;
}

From source file:KNearestNeighborClassifier.java

License:Apache License

/**
 * build a list of classification results from search results
 * @param topDocs the search results as a {@link TopDocs} object
 * @return a {@link List} of {@link ClassificationResult}, one for each existing class
 * @throws IOException if it's not possible to get the stored value of class field
 *///  ww w  .java 2  s.c o m
/**
 * Builds a list of classification results from search results.
 * Each class's score is its vote count times its normalized boost, divided by k;
 * when fewer than k hits carried a class value, scores are rescaled by k/sumdoc.
 *
 * @param topDocs the search results as a {@link TopDocs} object
 * @return a {@link List} of {@link ClassificationResult}, one for each existing class
 * @throws IOException if it's not possible to get the stored value of class field
 */
protected List<ClassificationResult<BytesRef>> buildListFromTopDocs(TopDocs topDocs) throws IOException {
    Map<BytesRef, Integer> classCounts = new HashMap<>();
    // boost per class, accumulated from each hit's score normalized by the best score
    Map<BytesRef, Double> classBoosts = new HashMap<>();
    float maxScore = topDocs.getMaxScore();
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        IndexableField storableField = indexSearcher.doc(scoreDoc.doc).getField(classFieldName);
        if (storableField != null) {
            BytesRef cl = new BytesRef(storableField.stringValue());
            // update count (Map.merge replaces the manual get/null-check/put dance)
            classCounts.merge(cl, 1, Integer::sum);
            // update boost; the boost is based on the best score
            double singleBoost = scoreDoc.score / maxScore;
            classBoosts.merge(cl, singleBoost, Double::sum);
        }
    }
    List<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
    List<ClassificationResult<BytesRef>> temporaryList = new ArrayList<>();
    int sumdoc = 0;
    for (Map.Entry<BytesRef, Integer> entry : classCounts.entrySet()) {
        Integer count = entry.getValue();
        // the boost is normalized to be 0<b<1
        Double normBoost = classBoosts.get(entry.getKey()) / count;
        temporaryList.add(new ClassificationResult<>(entry.getKey().clone(), (count * normBoost) / (double) k));
        sumdoc += count;
    }

    // correction: fewer than k hits had a class value, rescale the scores accordingly
    if (sumdoc < k) {
        for (ClassificationResult<BytesRef> cr : temporaryList) {
            returnList.add(
                    new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc));
        }
    } else {
        returnList = temporaryList;
    }
    return returnList;
}

From source file:SimpleNaiveBayesDocumentClassifier.java

License:Apache License

/**
 * Calculates normalized per-class scores for the given document, summing each
 * analyzed text field's naive Bayes score with the field boost applied to the
 * log-likelihood term.
 *
 * @param inputDocument the document to classify
 * @return a {@code List} of {@code ClassificationResult}, one per existing class
 * @throws IOException if index access fails
 */
private List<ClassificationResult<BytesRef>> assignNormClasses(Document inputDocument) throws IOException {
    List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
    Map<String, List<String[]>> fieldName2tokensArray = new LinkedHashMap<>();
    Map<String, Float> fieldName2boost = new LinkedHashMap<>();
    Terms classes = MultiFields.getTerms(leafReader, classFieldName);
    // MultiFields.getTerms returns null when no document has the class field;
    // guard it as the sibling classifiers do before calling iterator().
    if (classes != null) {
        TermsEnum classesEnum = classes.iterator();
        BytesRef c;

        analyzeSeedDocument(inputDocument, fieldName2tokensArray, fieldName2boost);

        int docsWithClassSize = countDocsWithClass();
        while ((c = classesEnum.next()) != null) {
            double classScore = 0;
            Term term = new Term(this.classFieldName, c);
            for (String fieldName : textFieldNames) {
                List<String[]> tokensArrays = fieldName2tokensArray.get(fieldName);
                double fieldScore = 0;
                for (String[] fieldTokensArray : tokensArrays) {
                    // note: the field boost scales only the likelihood; the prior
                    // is added once per stored field value
                    fieldScore += calculateLogPrior(term, docsWithClassSize)
                            + calculateLogLikelihood(fieldTokensArray, fieldName, term, docsWithClassSize)
                                    * fieldName2boost.get(fieldName);
                }
                classScore += fieldScore;
            }
            assignedClasses.add(new ClassificationResult<>(term.bytes(), classScore));
        }
    }
    return normClassificationResults(assignedClasses);
}

From source file:com.github.tteofili.looseen.MinHashClassifier.java

License:Apache License

/**
 * Builds a list of classification results from the given search results.
 *
 * @param searcher the searcher used to load the stored class field of each hit
 * @param categoryFieldName the stored field holding the class value
 * @param topDocs the search results
 * @param k the number of neighbors considered in the kNN vote
 * @return one {@link ClassificationResult} per class found among the hits
 * @throws IOException if the stored class field cannot be read
 */
List<ClassificationResult<BytesRef>> buildListFromTopDocs(IndexSearcher searcher, String categoryFieldName,
        TopDocs topDocs, int k) throws IOException {
    Map<BytesRef, Integer> votes = new HashMap<>();
    // per-class boost, accumulated from each hit's score normalized by the top score
    Map<BytesRef, Double> boosts = new HashMap<>();
    float topScore = topDocs.getMaxScore();
    for (ScoreDoc hit : topDocs.scoreDocs) {
        IndexableField field = searcher.doc(hit.doc).getField(categoryFieldName);
        if (field == null) {
            continue;
        }
        BytesRef clazz = new BytesRef(field.stringValue());
        votes.put(clazz, votes.getOrDefault(clazz, 0) + 1);
        double hitBoost = hit.score / topScore;
        boosts.put(clazz, boosts.getOrDefault(clazz, 0.0) + hitBoost);
    }

    List<ClassificationResult<BytesRef>> scored = new ArrayList<>();
    int assignedDocs = 0;
    for (Map.Entry<BytesRef, Integer> entry : votes.entrySet()) {
        Integer count = entry.getValue();
        Double normBoost = boosts.get(entry.getKey()) / count; // normalized to 0<b<1
        scored.add(new ClassificationResult<>(entry.getKey().clone(), (count * normBoost) / (double) k));
        assignedDocs += count;
    }

    // correction: when fewer than k hits carried a class value, rescale the scores
    if (assignedDocs < k) {
        List<ClassificationResult<BytesRef>> rescaled = new ArrayList<>();
        for (ClassificationResult<BytesRef> cr : scored) {
            rescaled.add(
                    new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) assignedDocs));
        }
        return rescaled;
    }
    return scored;
}

From source file:com.github.tteofili.looseen.QueryingClassifier.java

License:Apache License

@Override
public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
    ClassificationResult<BytesRef> result = null;
    for (Map.Entry<String, Query> entry : queriesPerClass.entrySet()) {
        TopDocs search = indexSearcher.search(entry.getValue(), 1);
        float score;
        if (useCounts) {
            score = search.totalHits;/*  ww w.  j  av a  2  s . com*/
        } else {
            score = search.getMaxScore();
        }

        if (result == null) {
            result = new ClassificationResult<>(new BytesRef(entry.getKey()), score);
        } else if (score > result.getScore()) {
            result = new ClassificationResult<>(new BytesRef(entry.getKey()), score);
        }
    }
    return result;
}

From source file:Others.SampleLuceneClassifier.java

/**
 * Calculate probabilities for all classes for a given input text
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 * @throws IOException if assigning probabilities fails
 *///from w  w  w .j  a  va  2s.  com
/**
 * Calculates probabilities for all classes for a given input text.
 *
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 * @throws IOException if assigning probabilities fails
 */
protected List<ClassificationResult<BytesRef>> assignClassListJaccard(String inputDocument) throws IOException {
    List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
    Terms classes = MultiFields.getTerms(indexReader, classFieldName);
    if (classes == null) {
        // no class terms in the index: nothing to score, normalize the empty list
        return normClassificationResults(assignedClasses);
    }
    TermsEnum classesEnum = classes.iterator();
    String[] tokenizedText = tokenize(inputDocument);
    int docsWithClassSize = countDocsWithClass();
    BytesRef next;
    while ((next = classesEnum.next()) != null) {
        if (next.length == 0) {
            continue;
        }
        Term term = new Term(this.classFieldName, next);
        // log-prior plus log-likelihood of the tokenized text under this class
        double clVal = calculateLogPrior(term, docsWithClassSize)
                + calculateLogLikelihood(tokenizedText, term, docsWithClassSize);
        assignedClasses.add(new ClassificationResult<>(term.bytes(), clVal));
    }
    // normalization; the values are transformed to a 0-1 range
    return normClassificationResults(assignedClasses);
}