Example usage for org.apache.lucene.analysis CharArraySet CharArraySet

Introduction

This page collects example usages of the org.apache.lucene.analysis.CharArraySet constructor CharArraySet(Collection<?> c, boolean ignoreCase).

Prototype

public CharArraySet(Collection<?> c, boolean ignoreCase) 

Document

Creates a set from a Collection of objects. When ignoreCase is true, entries and lookups are lowercased internally, so membership tests are case-insensitive.
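
Before the project-specific usages below, here is a minimal, self-contained sketch of the constructor's two modes. The class name CharArraySetDemo and the sample words are illustrative only; lucene-core must be on the classpath.

import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.CharArraySet;

public class CharArraySetDemo {
    public static void main(String[] args) {
        List<String> words = Arrays.asList("The", "And");

        // ignoreCase = true: entries and lookups are lowercased internally
        CharArraySet ignoringCase = new CharArraySet(words, true);
        System.out.println(ignoringCase.contains("the")); // true
        System.out.println(ignoringCase.contains("AND")); // true

        // ignoreCase = false: membership tests are case-sensitive
        CharArraySet caseSensitive = new CharArraySet(words, false);
        System.out.println(caseSensitive.contains("the")); // false
        System.out.println(caseSensitive.contains("The")); // true
    }
}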

Usage

From source file: com.github.tteofili.looseen.TestWikipediaClassification.java

License: Apache License

@Test
public void testItalianWikipedia() throws Exception {

    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        // parseBoolean never throws; unrecognized values simply yield false
        index = Boolean.parseBoolean(indexProperty);
    }

    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        split = Boolean.parseBoolean(splitProperty);
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    DirectoryReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }

    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    try {
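        // a small list of Italian stop words; ignoreCase=true lowercases both the
        // entries and later lookups, so the analyzer matches them in any case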
        Collection<String> stopWordsList = Arrays.asList("di", "a", "da", "in", "per", "tra", "fra", "il", "lo",
                "la", "i", "gli", "le");
        CharArraySet stopWords = new CharArraySet(stopWordsList, true);
        Analyzer analyzer = new ItalianAnalyzer(stopWords);
        if (index) {

            System.out.format("Indexing Italian Wikipedia...%n");

            long startIndex = System.currentTimeMillis();
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer));

            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current1.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current2.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current3.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current4.xml"), indexWriter);

            long endIndex = System.currentTimeMillis();
            System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(),
                    (endIndex - startIndex) / 1000);

            indexWriter.close();

        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");

            long startSplit = System.currentTimeMillis();
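            // hold out 10% of the documents for the test set, none for cross-validation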
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            for (LeafReaderContext context : reader.leaves()) {
                datasetSplitter.split(context.reader(), train, test, cv, analyzer, false, CATEGORY_FIELD,
                        TEXT_FIELD, CATEGORY_FIELD);
            }
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

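        // a suite of classifiers to compare: k-NN with several similarities,
        // MinHash, fuzzy k-NN, and naive Bayes variants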
        List<Classifier<BytesRef>> classifiers = new LinkedList<>();
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new BM25Similarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD,
                TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3,
                1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer,
                null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 1, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()),
                analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 300));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 3, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));

        int maxdoc;

        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }

        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        List<Future<String>> futures = new LinkedList<>();
        for (Classifier<BytesRef> classifier : classifiers) {

            final IndexReader finalReader = reader;
            final DirectoryReader finalTestReader = testReader;
            futures.add(service.submit(() -> {
                ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix;
                if (split) {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalTestReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                } else {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                }

                final long endTime = System.currentTimeMillis();
                final int elapsedSeconds = (int) ((endTime - startTime) / 1000);

                return " * " + classifier + " \n    * accuracy = " + confusionMatrix.getAccuracy()
                        + "\n    * precision = " + confusionMatrix.getPrecision() + "\n    * recall = "
                        + confusionMatrix.getRecall() + "\n    * f1-measure = " + confusionMatrix.getF1Measure()
                        + "\n    * avgClassificationTime = " + confusionMatrix.getAvgClassificationTime()
                        + "\n    * time = " + elapsedSeconds + " (sec)\n ";
            }));

        }
        for (Future<String> f : futures) {
            System.out.println(f.get());
        }

        Thread.sleep(10000);
        service.shutdown();

    } finally {
        try {
            if (reader != null) {
                reader.close();
            }
            if (directory != null) {
                directory.close();
            }
            if (test != null) {
                test.close();
            }
            if (train != null) {
                train.close();
            }
            if (cv != null) {
                cv.close();
            }
            if (testReader != null) {
                testReader.close();
            }
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}

From source file: com.shaie.annots.annotator.OneWordAnnotator.java

License: Apache License

public OneWordAnnotator(String... words) {
    Objects.requireNonNull(words, "words cannot be null");
    for (final String word : words) {
        Objects.requireNonNull(word, "Word cannot be null");
    }
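    // store the accepted words in a case-insensitive set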
    this.words = new CharArraySet(Arrays.asList(words), true);
}

From source file: com.shaie.annots.filter.AnnotatorTokenFilterTest.java

License: Apache License

@Test
public void returns_tokens_when_underlying_stream_skips_over_tokens() throws IOException {
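    // ONE is removed by the StopFilter (exact-case match), so the stream seen by
    // the AnnotatorTokenFilter starts at position 1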
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE), false));
            TokenFilter f = new AnnotatorTokenFilter(stop, annotator)) {
        stubAnnotator(TWO);
        tok.setReader(new StringReader(ONE_TWO));
        assertTokenInfos(f, new TokenInfo(TWO, 1));
    }
}

From source file: com.shaie.annots.filter.AnnotatorTokenFilterTest.java

License: Apache License

@Test
public void returns_token_when_underlying_stream_skips_multiple_tokens() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE, THREE), false));
            TokenFilter f = new AnnotatorTokenFilter(stop, annotator)) {
        stubAnnotator(TWO, FOUR);
        tok.setReader(new StringReader(ONE_TWO_THREE_FOUR));
        assertTokenInfos(f, new TokenInfo(TWO, 1), new TokenInfo(FOUR, 3));
    }
}

From source file: com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java

License: Apache License

@Test
public void returns_tokens_when_underlying_stream_skips_over_tokens() throws IOException {
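    // the trailing ints passed to PreAnnotatedTokenFilter appear to be
    // (position, length) pairs describing the pre-annotated spans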
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE), false));
            TokenFilter f = new PreAnnotatedTokenFilter(stop, 1, 1)) {
        tok.setReader(new StringReader(ONE_TWO));
        assertTokenInfos(f, new TokenInfo(ANY_ANNOTATION_TERM, 1, 1), new TokenInfo(TWO, 1));
    }
}

From source file: com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java

License: Apache License

@Test
public void returns_token_when_underlying_stream_skips_multiple_tokens() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE, THREE), false));
            TokenFilter f = new PreAnnotatedTokenFilter(stop, 1, 1, 3, 1)) {
        tok.setReader(new StringReader(ONE_TWO_THREE_FOUR));
        assertTokenInfos(f, new TokenInfo(ANY_ANNOTATION_TERM, 1, 1), new TokenInfo(TWO, 1),
                new TokenInfo(ANY_ANNOTATION_TERM, 3, 1), new TokenInfo(FOUR, 3));
    }
}

From source file: com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java

License: Apache License

@Test
public void returns_tokens_when_annotated_tokens_are_filtered() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(TWO), false));
            TokenFilter f = new PreAnnotatedTokenFilter(stop, 0, 1, 1, 1, 0, 3)) {
        tok.setReader(new StringReader(ONE_TWO_THREE));
        assertTokenInfos(f, new TokenInfo(ANY_ANNOTATION_TERM, 0, 3), new TokenInfo(ONE, 0),
                new TokenInfo(THREE, 2));
    }
}

From source file: com.shaie.annots.filter.SimplePreAnnotatedTokenFilterTest.java

License: Apache License

@Test
public void returns_tokens_when_underlying_stream_skips_over_tokens() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE), false));
            TokenFilter f = new SimplePreAnnotatedTokenFilter(stop, 1, 1)) {
        tok.setReader(new StringReader(ONE_TWO));
        assertTokenInfos(f, new TokenInfo(TWO, 1));
    }
}

From source file: com.shaie.annots.filter.SimplePreAnnotatedTokenFilterTest.java

License: Apache License

@Test
public void returns_token_when_underlying_stream_skips_multiple_tokens() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE, THREE), false));
            TokenFilter f = new SimplePreAnnotatedTokenFilter(stop, 1, 1, 3, 1)) {
        tok.setReader(new StringReader(ONE_TWO_THREE_FOUR));
        assertTokenInfos(f, new TokenInfo(TWO, 1), new TokenInfo(FOUR, 3));
    }
}

From source file: com.shaie.annots.filter.SimplePreAnnotatedTokenFilterTest.java

License: Apache License

@Test
public void returns_tokens_when_annotated_tokens_are_filtered() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(TWO), false));
            TokenFilter f = new SimplePreAnnotatedTokenFilter(stop, 0, 1, 1, 1, 0, 3)) {
        tok.setReader(new StringReader(ONE_TWO_THREE));
        assertTokenInfos(f, new TokenInfo(ONE, 0), new TokenInfo(THREE, 2));
    }
}