Example usage of the org.apache.lucene.search.similarities.ClassicSimilarity constructor ClassicSimilarity()

List of usage examples for the org.apache.lucene.search.similarities.ClassicSimilarity constructor ClassicSimilarity()

Introduction

On this page you can find example usages of the constructor ClassicSimilarity() of the class org.apache.lucene.search.similarities.ClassicSimilarity.

Prototype

public ClassicSimilarity() 

Source Link

Document

Sole constructor: parameter-free

Usage

From source file:IrqaQuery.java

License:Apache License

/**
 * Opens an {@link IndexWriter} on {@code indexPath} and stores it in the {@code writer} field.
 *
 * <p>Text is analyzed with an {@link EnglishAnalyzer} whose stop set is built from the result of
 * {@code mygetStopwords(stopPath)}.
 *
 * @param indexPath file-system location of the index directory
 * @param stopPath  argument handed to {@code mygetStopwords} to obtain the stop words
 * @param sim       similarity name; {@code "TFIDF"} selects {@link ClassicSimilarity}, every other
 *                  value (including {@code "BM25"} and {@code null}) selects {@link BM25Similarity}
 * @throws IOException if the index directory or the writer cannot be opened
 */
public static void makeIndexWriter(String indexPath, String stopPath, String sim) throws IOException {
    System.out.println("[makeIndexWriter] started");
    System.out.println("[makeIndexWriter]" + stopPath);
    Directory dir = FSDirectory.open(Paths.get(indexPath));
    Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stopPath)));
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

    // Constant-first comparison keeps a null "sim" from throwing. BM25 is the fallback for every
    // name other than "TFIDF", so the former "BM25" and "else" branches are merged.
    if ("TFIDF".equals(sim)) {
        iwc.setSimilarity(new ClassicSimilarity());
    } else {
        iwc.setSimilarity(new BM25Similarity());
    }

    try {
        writer = new IndexWriter(dir, iwc);
    } catch (IOException | RuntimeException e) {
        // Do not leave the directory open when the writer could not be created.
        dir.close();
        throw e;
    }
}

From source file:IrqaQuery.java

License:Apache License

/**
 * Searches the {@code "contents"} field of the index stored at {@code index} and returns the
 * stored documents of the top hits, best match first.
 *
 * <p>The question is escaped before parsing, so query-syntax characters in it are treated as
 * literal text.
 *
 * @param index     file-system location of the index directory
 * @param stoppath  argument handed to {@code mygetStopwords} to obtain the stop words
 * @param question  free-text question to search for
 * @param numResult maximum number of documents to return
 * @param sim       similarity name; {@code "TFIDF"} selects {@link ClassicSimilarity}, every other
 *                  value (including {@code "BM25"} and {@code null}) selects {@link BM25Similarity}
 * @return the matching documents; never more than {@code numResult} entries
 * @throws Exception if the index cannot be read or the question cannot be parsed
 */
public static List<Document> query(String index, String stoppath, String question, int numResult, String sim)
        throws Exception {
    // The directory and reader are only needed while searching: searcher.doc() materializes the
    // stored fields into the returned Document, so both can be released before returning.
    try (FSDirectory dir = FSDirectory.open(Paths.get(index));
            IndexReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);

        Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stoppath)));

        // Constant-first comparison tolerates a null "sim"; BM25 is the fallback for any other name.
        if ("TFIDF".equals(sim)) {
            searcher.setSimilarity(new ClassicSimilarity());
        } else {
            searcher.setSimilarity(new BM25Similarity());
        }

        String field = "contents";
        QueryParser parser = new QueryParser(field, analyzer);
        Query query = parser.parse(QueryParser.escape(question));

        // scoreDocs already holds at most numResult hits, so it can be walked directly without
        // consulting totalHits.
        ScoreDoc[] hits = searcher.search(query, numResult).scoreDocs;
        List<Document> docs = new ArrayList<>(hits.length);
        for (ScoreDoc hit : hits) {
            docs.add(searcher.doc(hit.doc));
        }
        return docs;
    }
}

From source file:KNearestNeighborClassifier.java

License:Apache License

/**
 * Builds a {@link KNearestNeighborClassifier} backed by a {@link MoreLikeThis} instance and an
 * {@link IndexSearcher}, both created over the given reader.
 *
 * @param leafReader     the reader on the index to be used for classification
 * @param similarity     the {@link Similarity} installed on the underlying {@link IndexSearcher};
 *                       when {@code null}, a
 *                       {@link org.apache.lucene.search.similarities.ClassicSimilarity} is used
 * @param analyzer       the {@link Analyzer} handed to {@link MoreLikeThis} to analyze unseen text
 * @param query          a {@link Query} to eventually filter the docs used for training the
 *                       classifier, or {@code null} if all the indexed docs should be used
 * @param k              the no. of docs to select in the MLT results to find the nearest neighbor
 * @param minDocsFreq    {@link MoreLikeThis#minDocFreq} parameter; only applied when greater than 0
 * @param minTermFreq    {@link MoreLikeThis#minTermFreq} parameter; only applied when greater than 0
 * @param classFieldName the name of the field used as the output for the classifier
 * @param textFieldNames the name of the fields used as the inputs for the classifier, they can
 *                       contain boosting indication e.g. title^10
 */
public KNearestNeighborClassifier(IndexReader leafReader, Similarity similarity, Analyzer analyzer, Query query,
        int k, int minDocsFreq, int minTermFreq, String classFieldName, String... textFieldNames) {
    // Plain configuration values.
    this.k = k;
    this.query = query;
    this.classFieldName = classFieldName;
    this.textFieldNames = textFieldNames;

    // Searcher over the index, falling back to classic TF-IDF scoring when no similarity is given.
    this.indexSearcher = new IndexSearcher(leafReader);
    this.indexSearcher.setSimilarity(similarity == null ? new ClassicSimilarity() : similarity);

    // MoreLikeThis setup; non-positive frequency thresholds leave the MoreLikeThis defaults alone.
    this.mlt = new MoreLikeThis(leafReader);
    this.mlt.setAnalyzer(analyzer);
    this.mlt.setFieldNames(textFieldNames);
    if (minDocsFreq > 0) {
        this.mlt.setMinDocFreq(minDocsFreq);
    }
    if (minTermFreq > 0) {
        this.mlt.setMinTermFreq(minTermFreq);
    }
}

From source file:luceneInterface.java

License:Apache License

/**
 * Searches the {@code "contents"} and {@code "sec"} fields of the index stored at {@code index}
 * and returns the stored documents of the top hits, best match first.
 *
 * <p>Two optional term clauses (one per field) are built from the escaped question; the combined
 * query is then rendered with {@code toString()} and parsed again, so the text passes through the
 * query parser before the search runs.
 *
 * @param index     file-system location of the index directory
 * @param stoppath  argument handed to {@code mygetStopwords} to obtain the stop words
 * @param question  free-text question to search for
 * @param numResult maximum number of documents to return
 * @param sim       similarity name; {@code "TFIDF"} selects {@link ClassicSimilarity}, every other
 *                  value (including {@code "BM25"} and {@code null}) selects {@link BM25Similarity}
 * @return the matching documents; never more than {@code numResult} entries
 * @throws Exception if the index cannot be read or the question cannot be parsed
 */
public static List<Document> query(String index, String stoppath, String question, int numResult, String sim)
        throws Exception {
    // The directory and reader are only needed while searching: searcher.doc() materializes the
    // stored fields into the returned Document, so both can be released before returning.
    try (FSDirectory dir = FSDirectory.open(Paths.get(index));
            IndexReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);

        Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stoppath)));

        // Constant-first comparison tolerates a null "sim"; BM25 is the fallback for any other name.
        if ("TFIDF".equals(sim)) {
            searcher.setSimilarity(new ClassicSimilarity());
        } else {
            searcher.setSimilarity(new BM25Similarity());
        }

        String field = "contents";
        QueryParser parser = new QueryParser(field, analyzer);
        String escaped = QueryParser.escape(question);

        // Kept from the original flow: a question that cannot be parsed on its own is rejected
        // here, before the combined query is assembled. The parsed result itself is not used.
        parser.parse(escaped);

        BooleanQuery.Builder bqb = new BooleanQuery.Builder();
        bqb.add(new TermQuery(new Term("contents", escaped)), BooleanClause.Occur.SHOULD);
        bqb.add(new TermQuery(new Term("sec", escaped)), BooleanClause.Occur.SHOULD);

        // scoreDocs already holds at most numResult hits, so it can be walked directly without
        // consulting totalHits.
        ScoreDoc[] hits = searcher.search(parser.parse(bqb.build().toString()), numResult).scoreDocs;
        List<Document> docs = new ArrayList<>(hits.length);
        for (ScoreDoc hit : hits) {
            docs.add(searcher.doc(hit.doc));
        }
        return docs;
    }
}

From source file:ai.castor.idf.FetchTermIDF.java

License:Apache License

/**
 * Computes the {@link ClassicSimilarity} inverse document frequency of one term in
 * {@code FIELD_BODY}, using the document frequency and document count of {@code reader}.
 *
 * <p>The term is escaped and parsed with an {@link EnglishAnalyzer} that has an empty stop set.
 * On success a tab-separated trace line is printed to standard output. Any exception raised while
 * parsing or reading the index is reported on standard error and {@code 0.0} is returned.
 *
 * @param term the raw term text
 * @return the IDF of the parsed term, or {@code 0.0} when it could not be computed
 * @throws ParseException declared by the signature; parse failures inside the lookup are caught
 */
public double getTermIDF(String term) throws ParseException {
    final QueryParser parser = new QueryParser(FIELD_BODY, new EnglishAnalyzer(CharArraySet.EMPTY_SET));
    final ClassicSimilarity classic = new ClassicSimilarity();
    final String escaped = parser.escape(term);

    double idf = 0.0;
    try {
        // The cast fails (and is reported below) when the text does not parse to a single term.
        final TermQuery termQuery = (TermQuery) parser.parse(escaped);
        final Term parsed = termQuery.getTerm();
        idf = classic.idf(reader.docFreq(parsed), reader.numDocs());

        System.out.println(term + '\t' + escaped + '\t' + termQuery + '\t' + parsed + '\t' + idf);
    } catch (Exception e) {
        System.err.println("Exception in fetching IDF(" + term + "): " + e.toString());
    }
    return idf;
}

From source file:ai.castor.idf.IDFScorer.java

License:Apache License

/**
 * Sums the {@link ClassicSimilarity} IDF of every distinct answer term that also occurs in the
 * parsed question.
 *
 * <p>The question is escaped, parsed against {@code FIELD_BODY} and its string form is split on
 * whitespace to obtain the question terms. The answer is split on whitespace and each token is
 * parsed on its own; tokens that fail to parse to a single term are skipped.
 *
 * @param query   the question text
 * @param answer  the answer text
 * @param analyze {@code true} to use an {@link EnglishAnalyzer} built from {@code stopWords},
 *                {@code false} to use a {@link WhitespaceAnalyzer}
 * @return the accumulated IDF, {@code 0.0} when no answer term matches
 * @throws ParseException if the escaped question cannot be parsed
 */
public double calcIDF(String query, String answer, boolean analyze) throws ParseException {
    final Analyzer analyzer = analyze
            ? new EnglishAnalyzer(StopFilter.makeStopSet(stopWords))
            : new WhitespaceAnalyzer();

    final QueryParser parser = new QueryParser(FIELD_BODY, analyzer);
    final ClassicSimilarity classic = new ClassicSimilarity();

    final String parsedQuestion = parser.parse(parser.escape(query)).toString().trim();
    final HashSet<String> questionTerms = new HashSet<>(Arrays.asList(parsedQuestion.split("\\s+")));
    final HashSet<String> counted = new HashSet<>();

    double total = 0.0;
    for (String token : answer.split("\\s+")) {
        try {
            final Term parsed = ((TermQuery) parser.parse(token)).getTerm();
            final String key = parsed.toString();

            // Only terms shared with the question count, and each of them only once.
            if (!questionTerms.contains(key) || counted.contains(key)) {
                continue;
            }
            total += classic.idf(reader.docFreq(parsed), reader.numDocs());
            counted.add(key);
        } catch (Exception e) {
            // Best effort: a token that cannot be parsed or looked up contributes nothing.
        }
    }
    return total;
}

From source file:com.github.tteofili.looseen.Test20NewsgroupsClassification.java

License:Apache License

/**
 * Runs a collection of Lucene classifiers against the 20 Newsgroups corpus and prints one report
 * per classifier.
 *
 * <p>The {@code index} and {@code split} flags can be overridden with the system properties of the
 * same name. When {@code index} is set the index under {@code INDEX + "/original"} is rebuilt from
 * {@code PREFIX + "/20n/20_newsgroups"}; when {@code split} is also set that index is split into
 * train, test and cross-validation indexes and the classifiers are created on the train index.
 *
 * @throws Exception if indexing, splitting or evaluation fails
 */
@Test
public void test20Newsgroups() throws Exception {

    // Boolean.parseBoolean never throws, so the property can be applied without a try/catch.
    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        index = Boolean.parseBoolean(indexProperty);
    }

    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        split = Boolean.parseBoolean(splitProperty);
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    IndexReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }

    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    List<Classifier<BytesRef>> classifiers = new LinkedList<>();
    try {
        Analyzer analyzer = new StandardAnalyzer();
        if (index) {

            System.out.format("Indexing 20 Newsgroups...%n");

            long startIndex = System.currentTimeMillis();
            // try-with-resources releases the writer (and its lock) even when indexing fails.
            try (IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {

                buildIndex(new File(PREFIX + "/20n/20_newsgroups"), indexWriter);

                long endIndex = System.currentTimeMillis();
                System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(),
                        (endIndex - startIndex) / 1000);
            }

        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");

            long startSplit = System.currentTimeMillis();
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            datasetSplitter.split(reader, train, test, cv, analyzer, false, CATEGORY_FIELD, BODY_FIELD,
                    SUBJECT_FIELD, CATEGORY_FIELD);
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD,
                BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new AxiomaticF1EXP(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new AxiomaticF1LOG(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3,
                1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer,
                null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 3, 1, 1, CATEGORY_FIELD,
                BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()),
                analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 30, 3, 300));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer, null,
                1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers
                .add(new KNearestFuzzyClassifier(reader, null, analyzer, null, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers
                .add(new KNearestFuzzyClassifier(reader, null, analyzer, null, 3, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new AxiomaticF1EXP(), analyzer, null, 3,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new AxiomaticF1LOG(), analyzer, null, 3,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));

        int maxdoc;

        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }

        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        try {
            List<Future<String>> futures = new LinkedList<>();
            for (Classifier<BytesRef> classifier : classifiers) {
                testClassifier(reader, startTime, testReader, service, futures, classifier);
            }
            for (Future<String> f : futures) {
                System.out.println(f.get());
            }
        } finally {
            // Always stop accepting work so the pool threads cannot outlive a failed run.
            service.shutdown();
        }
        // Bounded wait instead of a fixed 10 s sleep: returns as soon as outstanding tasks finish.
        service.awaitTermination(10, java.util.concurrent.TimeUnit.SECONDS);

    } finally {
        // Release in dependency order: classifiers, then readers, then the directories they read.
        for (Classifier<BytesRef> c : classifiers) {
            if (c instanceof Closeable) {
                ((Closeable) c).close();
            }
        }
        if (reader != null) {
            reader.close();
        }
        if (testReader != null) {
            testReader.close();
        }
        directory.close();
        if (test != null) {
            test.close();
        }
        if (train != null) {
            train.close();
        }
        if (cv != null) {
            cv.close();
        }
    }
}

From source file:com.github.tteofili.looseen.TestWikipediaClassification.java

License:Apache License

/**
 * Runs a collection of Lucene classifiers against an Italian Wikipedia dump and prints accuracy,
 * precision, recall, F1, average classification time and elapsed time for each of them.
 *
 * <p>The {@code index} and {@code split} flags can be overridden with the system properties of the
 * same name. When {@code index} is set the index under {@code INDEX + "/original"} is rebuilt from
 * four dump files below {@code PREFIX + "/itwiki"}; when {@code split} is also set that index is
 * split into train, test and cross-validation indexes, the classifiers are created on the train
 * index and the confusion matrix is computed on the test index.
 *
 * @throws Exception if indexing, splitting or evaluation fails
 */
@Test
public void testItalianWikipedia() throws Exception {

    // Boolean.parseBoolean never throws, so the property can be applied without a try/catch.
    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        index = Boolean.parseBoolean(indexProperty);
    }

    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        split = Boolean.parseBoolean(splitProperty);
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    DirectoryReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }

    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    try {
        Collection<String> stopWordsList = Arrays.asList("di", "a", "da", "in", "per", "tra", "fra", "il", "lo",
                "la", "i", "gli", "le");
        CharArraySet stopWords = new CharArraySet(stopWordsList, true);
        Analyzer analyzer = new ItalianAnalyzer(stopWords);
        if (index) {

            System.out.format("Indexing Italian Wikipedia...%n");

            long startIndex = System.currentTimeMillis();
            // try-with-resources releases the writer (and its lock) even when an import fails.
            try (IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {

                importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current1.xml"), indexWriter);
                importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current2.xml"), indexWriter);
                importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current3.xml"), indexWriter);
                importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current4.xml"), indexWriter);

                long endIndex = System.currentTimeMillis();
                System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(),
                        (endIndex - startIndex) / 1000);
            }

        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");

            long startSplit = System.currentTimeMillis();
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            for (LeafReaderContext context : reader.leaves()) {
                datasetSplitter.split(context.reader(), train, test, cv, analyzer, false, CATEGORY_FIELD,
                        TEXT_FIELD, CATEGORY_FIELD);
            }
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

        List<Classifier<BytesRef>> classifiers = new LinkedList<>();
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new BM25Similarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD,
                TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3,
                1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer,
                null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 1, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()),
                analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 300));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 3, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));

        int maxdoc;

        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }

        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        try {
            List<Future<String>> futures = new LinkedList<>();
            for (Classifier<BytesRef> classifier : classifiers) {

                // "reader" and "testReader" are reassigned above, so the lambda needs final copies.
                final IndexReader finalReader = reader;
                final DirectoryReader finalTestReader = testReader;
                futures.add(service.submit(() -> {
                    ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix;
                    if (split) {
                        confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalTestReader, classifier,
                                CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                    } else {
                        confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalReader, classifier,
                                CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                    }

                    final long endTime = System.currentTimeMillis();
                    // Divide before narrowing so a long run cannot overflow the int cast.
                    final int elapse = (int) ((endTime - startTime) / 1000);

                    return " * " + classifier + " \n    * accuracy = " + confusionMatrix.getAccuracy()
                            + "\n    * precision = " + confusionMatrix.getPrecision() + "\n    * recall = "
                            + confusionMatrix.getRecall() + "\n    * f1-measure = " + confusionMatrix.getF1Measure()
                            + "\n    * avgClassificationTime = " + confusionMatrix.getAvgClassificationTime()
                            + "\n    * time = " + elapse + " (sec)\n ";
                }));

            }
            // Every submitted task is awaited here, so no extra sleep is needed before shutdown.
            for (Future<String> f : futures) {
                System.out.println(f.get());
            }
        } finally {
            // Always stop the pool so its threads cannot outlive a failed run.
            service.shutdown();
        }

    } finally {
        try {
            // Release readers before the directories they read from.
            if (reader != null) {
                reader.close();
            }
            if (testReader != null) {
                testReader.close();
            }
            if (directory != null) {
                directory.close();
            }
            if (test != null) {
                test.close();
            }
            if (train != null) {
                train.close();
            }
            if (cv != null) {
                cv.close();
            }
        } catch (Throwable e) {
            // Cleanup failures are reported but must not hide the outcome of the test itself.
            e.printStackTrace();
        }
    }
}

From source file:com.mycompany.lucenedemo.SearchFiles.java

/**
 * Simple command-line based search demo.
 *
 * <p>Recognized options: {@code -index dir}, {@code -field f}, {@code -repeat n},
 * {@code -queries file}, {@code -query string}, {@code -raw}, {@code -paging hitsPerPage} and
 * {@code -sim vsm|bm25} (any value other than {@code vsm} selects BM25). Unrecognized arguments
 * are ignored. Queries are read from {@code -query}, from the {@code -queries} file, or
 * interactively from standard input; an empty line or end of input ends the session.
 *
 * @param args command-line arguments as described above
 * @throws Exception if the index cannot be opened or a query cannot be parsed or executed
 */
public static void main(String[] args) throws Exception {
    String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage] [-sim vsm or bm25]\n\nSee http://lucene.apache.org/core/4_1_0/demo/ for details.";
    if (args.length == 0 || "-h".equals(args[0]) || "-help".equals(args[0])) {
        System.out.println(usage);
        System.exit(0);
    }

    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;
    SimilarityScore score = SimilarityScore.DEFAULT;

    for (int i = 0; i < args.length; i++) {
        String option = args[i];
        if ("-raw".equals(option)) {
            raw = true;
            continue;
        }
        boolean takesValue = "-index".equals(option) || "-field".equals(option) || "-queries".equals(option)
                || "-query".equals(option) || "-repeat".equals(option) || "-paging".equals(option)
                || "-sim".equals(option);
        if (!takesValue) {
            continue; // unrecognized arguments are ignored
        }
        // A trailing option without its value used to fail with ArrayIndexOutOfBoundsException.
        if (i + 1 >= args.length) {
            System.err.println("Missing value for option " + option);
            System.err.println(usage);
            System.exit(1);
        }
        // Every value-taking option, "-sim" included, consumes the argument that follows it.
        String value = args[++i];
        switch (option) {
        case "-index":
            index = value;
            break;
        case "-field":
            field = value;
            break;
        case "-queries":
            queries = value;
            break;
        case "-query":
            queryString = value;
            break;
        case "-repeat":
            repeat = Integer.parseInt(value);
            break;
        case "-paging":
            hitsPerPage = Integer.parseInt(value);
            if (hitsPerPage <= 0) {
                System.err.println("There must be at least 1 hit per page.");
                System.exit(1);
            }
            break;
        case "-sim":
            score = "vsm".equals(value) ? SimilarityScore.VSM : SimilarityScore.BM25;
            break;
        default:
            break;
        }
    }

    // try-with-resources releases the directory, the reader and the query input on every exit path.
    try (FSDirectory directory = FSDirectory.open(Paths.get(index));
            IndexReader reader = DirectoryReader.open(directory);
            BufferedReader in = queries != null
                    ? Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8)
                    : new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8))) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer();
        QueryParser parser = new QueryParser(field, analyzer);

        // The scoring model never changes between queries, so it is configured once up front.
        switch (score) {
        case VSM:
            searcher.setSimilarity(new ClassicSimilarity());
            break;
        case BM25:
            searcher.setSimilarity(new BM25Similarity());
            break;
        case DEFAULT:
        default:
            break;
        }

        while (true) {
            if (queries == null && queryString == null) { // prompt the user
                System.out.println("Enter query: ");
            }

            String line = queryString != null ? queryString : in.readLine();
            if (line == null) {
                break; // end of input
            }

            line = line.trim();
            if (line.isEmpty()) {
                break;
            }

            Query query = parser.parse(line);
            System.out.println("Searching for: " + query.toString(field));

            if (repeat > 0) { // repeat & time as benchmark
                long start = System.currentTimeMillis();
                for (int i = 0; i < repeat; i++) {
                    searcher.search(query, 100);
                }
                long end = System.currentTimeMillis();
                System.out.println("Time: " + (end - start) + "ms");
            }

            doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

            if (queryString != null) {
                break; // a single -query is executed once
            }
        }
    }
}

From source file:com.o19s.es.explore.ExplorerQuery.java

License:Apache License

/**
 * Creates the weight for this explorer query.
 *
 * <p>When scores are not needed the wrapped query's weight is returned unchanged. Otherwise the
 * terms of the wrapped query are extracted and one of two paths is taken:
 * <ul>
 *   <li>collection-scoped types: document frequency, classic IDF and total term frequency are
 *       gathered for every term, and the statistic named by {@code type} becomes a constant score
 *       for every document (0 when the query has no terms);</li>
 *   <li>types ending in {@code _raw_tf}: the terms are rewritten into a boolean query of
 *       {@link PostingsExplorerQuery} clauses wrapped in an {@code ExplorerWeight}.</li>
 * </ul>
 *
 * @throws RuntimeException         if a collection-scoped {@code type} names no known statistic
 * @throws IllegalArgumentException if {@code type} belongs to neither path
 */
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    if (!needsScores) {
        return searcher.createWeight(query, false, boost);
    }

    final Weight innerWeight = searcher.createWeight(query, true, boost);
    final Set<Term> queryTerms = new HashSet<>();
    innerWeight.extractTerms(queryTerms);

    if (!isCollectionScoped()) {
        if (!type.endsWith("_raw_tf")) {
            throw new IllegalArgumentException("Unknown ExplorerQuery type [" + type + "]");
        }
        // Rewrite this into a boolean query where we can inject our PostingsExplorerQuery
        final BooleanQuery.Builder tfQuery = new BooleanQuery.Builder();
        for (Term term : queryTerms) {
            tfQuery.add(new BooleanClause(new PostingsExplorerQuery(term, PostingsExplorerQuery.Type.TF),
                    BooleanClause.Occur.SHOULD));
        }
        // FIXME: completely refactor this class and stop accepting a random query but a list of terms directly
        // rewriting at this point is wrong, additionally we certainly build the TermContext twice for every terms
        // problem is that we rely on extractTerms which happen too late in the process
        final Query rewritten = tfQuery.build().rewrite(searcher.getIndexReader());
        return new ExplorerQuery.ExplorerWeight(this, searcher.createWeight(rewritten, true, boost), type);
    }

    // Collection-scoped statistics: one sample per query term.
    final ClassicSimilarity classic = new ClassicSimilarity();
    final StatisticsHelper docFreqStats = new StatisticsHelper();
    final StatisticsHelper idfStats = new StatisticsHelper();
    final StatisticsHelper totalTermFreqStats = new StatisticsHelper();

    for (Term term : queryTerms) {
        final TermContext termContext = TermContext.build(searcher.getTopReaderContext(), term);
        final TermStatistics termStats = searcher.termStatistics(term, termContext);
        docFreqStats.add(termStats.docFreq());
        idfStats.add(classic.idf(termStats.docFreq(), searcher.getIndexReader().numDocs()));
        totalTermFreqStats.add(termStats.totalTermFreq());
    }

    final float constantScore;
    if (queryTerms.isEmpty()) {
        // If no terms are parsed in the query we opt for returning 0 instead of throwing an
        // exception that could break various pipelines.
        constantScore = 0.0f;
    } else {
        switch (type) {
        case "sum_classic_idf":
            constantScore = idfStats.getSum();
            break;
        case "mean_classic_idf":
            constantScore = idfStats.getMean();
            break;
        case "max_classic_idf":
            constantScore = idfStats.getMax();
            break;
        case "min_classic_idf":
            constantScore = idfStats.getMin();
            break;
        case "stddev_classic_idf":
            constantScore = idfStats.getStdDev();
            break;
        case "sum_raw_df":
            constantScore = docFreqStats.getSum();
            break;
        case "min_raw_df":
            constantScore = docFreqStats.getMin();
            break;
        case "max_raw_df":
            constantScore = docFreqStats.getMax();
            break;
        case "mean_raw_df":
            constantScore = docFreqStats.getMean();
            break;
        case "stddev_raw_df":
            constantScore = docFreqStats.getStdDev();
            break;
        case "sum_raw_ttf":
            constantScore = totalTermFreqStats.getSum();
            break;
        case "min_raw_ttf":
            constantScore = totalTermFreqStats.getMin();
            break;
        case "max_raw_ttf":
            constantScore = totalTermFreqStats.getMax();
            break;
        case "mean_raw_ttf":
            constantScore = totalTermFreqStats.getMean();
            break;
        case "stddev_raw_ttf":
            constantScore = totalTermFreqStats.getStdDev();
            break;
        case "unique_terms_count":
            constantScore = queryTerms.size();
            break;
        default:
            throw new RuntimeException("Invalid stat type specified.");
        }
    }

    return new ConstantScoreWeight(ExplorerQuery.this, constantScore) {

        @Override
        public Explanation explain(LeafReaderContext context, int doc) throws IOException {
            final Scorer constantScorer = scorer(context);
            final int landedOn = constantScorer.iterator().advance(doc);
            assert landedOn == doc; // this is a DocIdSetIterator.all
            return Explanation.match(constantScorer.score(), "Stat Score: " + type);
        }

        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
            // Every document of the segment matches and receives the same score.
            final DocIdSetIterator everyDoc = DocIdSetIterator.all(context.reader().maxDoc());
            return new ConstantScoreScorer(this, constantScore, everyDoc);
        }

        @Override
        public boolean isCacheable(LeafReaderContext ctx) {
            return true;
        }

    };
}