Example usage for org.apache.lucene.analysis CharArraySet CharArraySet

Introduction

This page collects example usages of the org.apache.lucene.analysis.CharArraySet constructor CharArraySet(Collection<?> c, boolean ignoreCase).

Prototype

public CharArraySet(Collection<?> c, boolean ignoreCase) 

Document

Creates a set from a Collection of objects. When ignoreCase is true, entries and lookups are lowercased internally, so membership tests are case-insensitive.
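
Before the project-specific usages below, here is a minimal, self-contained sketch of the constructor's two modes. The class name CharArraySetDemo and the sample words are illustrative only; lucene-core must be on the classpath.

import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.CharArraySet;

public class CharArraySetDemo {
    public static void main(String[] args) {
        List<String> words = Arrays.asList("The", "And");

        // ignoreCase = true: entries and lookups are lowercased internally
        CharArraySet ignoringCase = new CharArraySet(words, true);
        System.out.println(ignoringCase.contains("the")); // true
        System.out.println(ignoringCase.contains("AND")); // true

        // ignoreCase = false: membership tests are case-sensitive
        CharArraySet caseSensitive = new CharArraySet(words, false);
        System.out.println(caseSensitive.contains("the")); // false
        System.out.println(caseSensitive.contains("The")); // true
    }
}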

Usage

From source file: com.github.tteofili.looseen.TestWikipediaClassification.java

License: Apache License

@Test
public void testItalianWikipedia() throws Exception {

    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        // parseBoolean never throws; unrecognized values simply yield false
        index = Boolean.parseBoolean(indexProperty);
    }

    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        split = Boolean.parseBoolean(splitProperty);
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    DirectoryReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }

    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    try {
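        // a small list of Italian stop words; ignoreCase=true lowercases both the
        // entries and later lookups, so the analyzer matches them in any case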
        Collection<String> stopWordsList = Arrays.asList("di", "a", "da", "in", "per", "tra", "fra", "il", "lo",
                "la", "i", "gli", "le");
        CharArraySet stopWords = new CharArraySet(stopWordsList, true);
        Analyzer analyzer = new ItalianAnalyzer(stopWords);
        if (index) {

            System.out.format("Indexing Italian Wikipedia...%n");

            long startIndex = System.currentTimeMillis();
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer));

            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current1.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current2.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current3.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current4.xml"), indexWriter);

            long endIndex = System.currentTimeMillis();
            System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(),
                    (endIndex - startIndex) / 1000);

            indexWriter.close();

        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");

            long startSplit = System.currentTimeMillis();
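            // hold out 10% of the documents for the test set, none for cross-validation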
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            for (LeafReaderContext context : reader.leaves()) {
                datasetSplitter.split(context.reader(), train, test, cv, analyzer, false, CATEGORY_FIELD,
                        TEXT_FIELD, CATEGORY_FIELD);
            }
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

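        // a suite of classifiers to compare: k-NN with several similarities,
        // MinHash, fuzzy k-NN, and naive Bayes variants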
        List<Classifier<BytesRef>> classifiers = new LinkedList<>();
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new BM25Similarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD,
                TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3,
                1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer,
                null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 1, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()),
                analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 300));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 3, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));

        int maxdoc;

        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }

        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        List<Future<String>> futures = new LinkedList<>();
        for (Classifier<BytesRef> classifier : classifiers) {

            final IndexReader finalReader = reader;
            final DirectoryReader finalTestReader = testReader;
            futures.add(service.submit(() -> {
                ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix;
                if (split) {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalTestReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                } else {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                }

                final long endTime = System.currentTimeMillis();
                final int elapsedSeconds = (int) ((endTime - startTime) / 1000);

                return " * " + classifier + " \n    * accuracy = " + confusionMatrix.getAccuracy()
                        + "\n    * precision = " + confusionMatrix.getPrecision() + "\n    * recall = "
                        + confusionMatrix.getRecall() + "\n    * f1-measure = " + confusionMatrix.getF1Measure()
                        + "\n    * avgClassificationTime = " + confusionMatrix.getAvgClassificationTime()
                        + "\n    * time = " + elapsedSeconds + " (sec)\n ";
            }));

        }
        for (Future<String> f : futures) {
            System.out.println(f.get());
        }

        Thread.sleep(10000);
        service.shutdown();

    } finally {
        try {
            if (reader != null) {
                reader.close();
            }
            if (directory != null) {
                directory.close();
            }
            if (test != null) {
                test.close();
            }
            if (train != null) {
                train.close();
            }
            if (cv != null) {
                cv.close();
            }
            if (testReader != null) {
                testReader.close();
            }
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}

From source file: com.shaie.annots.annotator.OneWordAnnotator.java

License: Apache License

public OneWordAnnotator(String... words) {
    Objects.requireNonNull(words, "words cannot be null");
    for (final String word : words) {
        Objects.requireNonNull(word, "Word cannot be null");
    }
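    // store the accepted words in a case-insensitive set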
    this.words = new CharArraySet(Arrays.asList(words), true);
}

From source file: com.shaie.annots.filter.AnnotatorTokenFilterTest.java

License: Apache License

@Test
public void returns_tokens_when_underlying_stream_skips_over_tokens() throws IOException {
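    // ONE is removed by the StopFilter (exact-case match), so the stream seen by
    // the AnnotatorTokenFilter starts at position 1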
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE), false));
            TokenFilter f = new AnnotatorTokenFilter(stop, annotator)) {
        stubAnnotator(TWO);
        tok.setReader(new StringReader(ONE_TWO));
        assertTokenInfos(f, new TokenInfo(TWO, 1));
    }
}

From source file: com.shaie.annots.filter.AnnotatorTokenFilterTest.java

License: Apache License

@Test
public void returns_token_when_underlying_stream_skips_multiple_tokens() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE, THREE), false));
            TokenFilter f = new AnnotatorTokenFilter(stop, annotator)) {
        stubAnnotator(TWO, FOUR);
        tok.setReader(new StringReader(ONE_TWO_THREE_FOUR));
        assertTokenInfos(f, new TokenInfo(TWO, 1), new TokenInfo(FOUR, 3));
    }
}

From source file: com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java

License: Apache License

@Test
public void returns_tokens_when_underlying_stream_skips_over_tokens() throws IOException {
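    // the trailing ints passed to PreAnnotatedTokenFilter appear to be
    // (position, length) pairs describing the pre-annotated spans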
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE), false));
            TokenFilter f = new PreAnnotatedTokenFilter(stop, 1, 1)) {
        tok.setReader(new StringReader(ONE_TWO));
        assertTokenInfos(f, new TokenInfo(ANY_ANNOTATION_TERM, 1, 1), new TokenInfo(TWO, 1));
    }
}

From source file: com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java

License: Apache License

@Test
public void returns_token_when_underlying_stream_skips_multiple_tokens() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE, THREE), false));
            TokenFilter f = new PreAnnotatedTokenFilter(stop, 1, 1, 3, 1)) {
        tok.setReader(new StringReader(ONE_TWO_THREE_FOUR));
        assertTokenInfos(f, new TokenInfo(ANY_ANNOTATION_TERM, 1, 1), new TokenInfo(TWO, 1),
                new TokenInfo(ANY_ANNOTATION_TERM, 3, 1), new TokenInfo(FOUR, 3));
    }
}

From source file: com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java

License: Apache License

@Test
public void returns_tokens_when_annotated_tokens_are_filtered() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(TWO), false));
            TokenFilter f = new PreAnnotatedTokenFilter(stop, 0, 1, 1, 1, 0, 3)) {
        tok.setReader(new StringReader(ONE_TWO_THREE));
        assertTokenInfos(f, new TokenInfo(ANY_ANNOTATION_TERM, 0, 3), new TokenInfo(ONE, 0),
                new TokenInfo(THREE, 2));
    }
}

From source file: com.shaie.annots.filter.SimplePreAnnotatedTokenFilterTest.java

License: Apache License

@Test
public void returns_tokens_when_underlying_stream_skips_over_tokens() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE), false));
            TokenFilter f = new SimplePreAnnotatedTokenFilter(stop, 1, 1)) {
        tok.setReader(new StringReader(ONE_TWO));
        assertTokenInfos(f, new TokenInfo(TWO, 1));
    }
}

From source file: com.shaie.annots.filter.SimplePreAnnotatedTokenFilterTest.java

License: Apache License

@Test
public void returns_token_when_underlying_stream_skips_multiple_tokens() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE, THREE), false));
            TokenFilter f = new SimplePreAnnotatedTokenFilter(stop, 1, 1, 3, 1)) {
        tok.setReader(new StringReader(ONE_TWO_THREE_FOUR));
        assertTokenInfos(f, new TokenInfo(TWO, 1), new TokenInfo(FOUR, 3));
    }
}

From source file: com.shaie.annots.filter.SimplePreAnnotatedTokenFilterTest.java

License: Apache License

@Test
public void returns_tokens_when_annotated_tokens_are_filtered() throws IOException {
    try (Tokenizer tok = new WhitespaceTokenizer();
            TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(TWO), false));
            TokenFilter f = new SimplePreAnnotatedTokenFilter(stop, 0, 1, 1, 1, 0, 3)) {
        tok.setReader(new StringReader(ONE_TWO_THREE));
        assertTokenInfos(f, new TokenInfo(ONE, 0), new TokenInfo(THREE, 2));
    }
}