Example usage for org.apache.lucene.analysis BaseTokenStreamTestCase assertTokenStreamContents

List of usage examples for org.apache.lucene.analysis BaseTokenStreamTestCase assertTokenStreamContents

Introduction

On this page you can find example usages of org.apache.lucene.analysis BaseTokenStreamTestCase.assertTokenStreamContents.

Prototype

public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException 

Source Link

Usage

From source file:io.github.msurdi.redeye.core.lucene.RedeyeAnalizerTest.java

License:Apache License

/**
 * Runs every (input, expected-output) pair in {@code testValues} through the analyzer
 * and asserts the produced token sequence matches the whitespace-separated expectation.
 */
@Test
public void testGeneratedTokens() throws IOException {
    for (String[] pair : testValues) {
        String input = pair[0];
        String[] expectedTokens = pair[1].split(" ");
        BaseTokenStreamTestCase.assertTokenStreamContents(
                analyzer.tokenStream(FIELD, input), expectedTokens);
    }
}

From source file:jp.co.mixi.rd.lucene.analysis.StandardPlusTokenizerFactoryTest.java

License:Apache License

/**
 * Tokenizes a phrase containing a combining acute accent (U+0301) and checks that
 * the accented word survives as a single token while punctuation is split off.
 */
@Test
public void testStandardPlusTokenizer() throws Exception {
    Map<String, String> args = new HashMap<String, String>();
    args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM,
            BaseTokenStreamTestCase.TEST_VERSION_CURRENT.toString());
    StandardPlusTokenizerFactory factory = new StandardPlusTokenizerFactory(args);
    try (Reader reader = new StringReader("Wha\u0301t's this thing do?");
            Tokenizer stream = factory.create(reader)) {
        BaseTokenStreamTestCase.assertTokenStreamContents(stream,
                new String[] { "Wha\u0301t's", "this", "thing", "do", "?" });
    }
}

From source file:jp.co.mixi.rd.lucene.analysis.StandardPlusTokenizerFactoryTest.java

License:Apache License

/**
 * Verifies that a 700-character "word" is kept whole when {@code maxTokenLength}
 * is raised to 1000, instead of being truncated at the default limit.
 */
@Test
public void testStandardPlusTokenizerMaxTokenLength() throws Exception {
    // 7 chars x 100 repetitions = a single 700-character token.
    StringBuilder sb = new StringBuilder(700);
    for (int i = 0; i < 100; i++) {
        sb.append("abcdefg");
    }
    String longWord = sb.toString();
    String content = "one two three " + longWord + " four five six";

    Map<String, String> args = new HashMap<String, String>();
    args.put("maxTokenLength", "1000");
    args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM,
            BaseTokenStreamTestCase.TEST_VERSION_CURRENT.toString());
    StandardPlusTokenizerFactory factory = new StandardPlusTokenizerFactory(args);

    try (Reader reader = new StringReader(content);
            Tokenizer stream = factory.create(reader)) {
        BaseTokenStreamTestCase.assertTokenStreamContents(stream,
                new String[] { "one", "two", "three", longWord, "four", "five", "six" });
    }
}

From source file:jp.co.mixi.rd.lucene.analysis.StandardPlusTokenizerFactoryTest.java

License:Apache License

@Test
public void testStandardPlusTokenizerNihongo() throws Exception {
    // Exercises the tokenizer against Japanese ("nihongo") text.
    // NOTE(review): the input literal and the expected-token array below appear to be
    // mojibake — non-ASCII characters have been replaced with '?' and empty strings,
    // presumably by an encoding-lossy copy. Recover the original Japanese literals
    // from version control before trusting this test.
    try (Reader reader = new StringReader("???   ? ?")) {
        Map<String, String> args = new HashMap<String, String>();
        // The factory requires an explicit Lucene match version parameter.
        args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM,
                BaseTokenStreamTestCase.TEST_VERSION_CURRENT.toString());
        StandardPlusTokenizerFactory factory = new StandardPlusTokenizerFactory(args);
        try (Tokenizer stream = factory.create(reader)) {
            BaseTokenStreamTestCase.assertTokenStreamContents(stream, new String[] { "", "?", "", "",
                    "", "", "", "", "?", "?", "?", "?" });
        }
    }
}

From source file:jp.co.mixi.rd.lucene.analysis.StandardPlusTokenizerFactoryTest.java

License:Apache License

@Test
public void testStandardPlusTokenizerSpace() throws Exception {
    // Checks that a newline separates two tokens.
    // NOTE(review): the '?' characters in the input and expected tokens look like
    // mojibake from a lossy encoding conversion — confirm the original literals
    // (likely non-ASCII) against version control.
    try (Reader reader = new StringReader("?\n?")) {
        Map<String, String> args = new HashMap<String, String>();
        // The factory requires an explicit Lucene match version parameter.
        args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM,
                BaseTokenStreamTestCase.TEST_VERSION_CURRENT.toString());
        StandardPlusTokenizerFactory factory = new StandardPlusTokenizerFactory(args);
        try (Tokenizer stream = factory.create(reader)) {
            BaseTokenStreamTestCase.assertTokenStreamContents(stream, new String[] { "?", "?" });
        }
    }
}

From source file:org.apache.solr.analysis.ko.TestKoreanTokenizerFactory.java

License:Apache License

/**
 * Tokenizes {@code input} with the Korean tokenizer, runs the result through the
 * filter under test, and asserts the emitted token sequence equals {@code tokens}.
 */
private void assertFilter(String input, String... tokens) throws IOException {
    BaseTokenStreamTestCase.assertTokenStreamContents(filterK(tokenizerK(input)), tokens);
}

From source file:org.apache.solr.analysis.TestWordDelimiterFilterFactory.java

License:Apache License

/**
 * Verifies WordDelimiterFilterFactory both with its default character-type table and
 * with a custom type mapping loaded from {@code wdftypes.txt}: by default currency and
 * punctuation split tokens apart; the custom mapping keeps "$5,400.00" and "25%" whole.
 */
@Test
public void testCustomTypes() throws Exception {
    String testText = "I borrowed $5,400.00 at 25% interest-rate";
    ResourceLoader loader = new SolrResourceLoader("solr/collection1");

    /* default behavior */
    WordDelimiterFilterFactory factoryDefault = new WordDelimiterFilterFactory(baseDelimiterArgs());
    factoryDefault.inform(loader);

    TokenStream ts = factoryDefault
            .create(new MockTokenizer(new StringReader(testText), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "I", "borrowed", "5", "400", "00",
            "540000", "at", "25", "interest", "rate", "interestrate" });

    // U+200D (zero-width joiner) splits words under the default type table.
    ts = factoryDefault
            .create(new MockTokenizer(new StringReader("foo\u200Dbar"), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "foo", "bar", "foobar" });

    /* custom behavior: same base options plus a custom type mapping */
    Map<String, String> args = baseDelimiterArgs();
    args.put("types", "wdftypes.txt");
    WordDelimiterFilterFactory factoryCustom = new WordDelimiterFilterFactory(args);
    factoryCustom.inform(loader);

    ts = factoryCustom.create(new MockTokenizer(new StringReader(testText), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts,
            new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "rate", "interestrate" });

    /* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */
    ts = factoryCustom
            .create(new MockTokenizer(new StringReader("foo\u200Dbar"), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "foo\u200Dbar" });
}

/** Builds the word-delimiter option map shared by both configurations in testCustomTypes. */
private static Map<String, String> baseDelimiterArgs() {
    Map<String, String> args = new HashMap<String, String>();
    args.put("generateWordParts", "1");
    args.put("generateNumberParts", "1");
    args.put("catenateWords", "1");
    args.put("catenateNumbers", "1");
    args.put("catenateAll", "0");
    args.put("splitOnCaseChange", "1");
    return args;
}

From source file:org.elasticsearch.index.analysis.SimplePhoneticAnalysisTests.java

License:Apache License

/**
 * Beider-Morse phonetic filter with no language hint: a single input token
 * ("ABADIAS") expands into the full set of candidate phonetic encodings.
 */
public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException {
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilter");
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("ABADIAS"));
    String[] expected = { "abYdias", "abYdios", "abadia", "abadiaS", "abadias", "abadio",
            "abadioS", "abadios", "abodia", "abodiaS", "abodias", "abodio", "abodioS", "abodios", "avadias",
            "avadios", "avodias", "avodios", "obadia", "obadiaS", "obadias", "obadio", "obadioS", "obadios",
            "obodia", "obodiaS", "obodias", "obodioS" };
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}

From source file:org.elasticsearch.index.analysis.SimplePhoneticAnalysisTests.java

License:Apache License

/**
 * Beider-Morse phonetic filter restricted to French: "Rimbault" expands into the
 * candidate encodings produced under the French rule set only.
 */
public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilterfrench");
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("Rimbault"));
    String[] expected = { "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt", "rimbu",
            "rimbult", "rmbD", "rmbDlt", "rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}

From source file:org.elasticsearch.index.analysis.SimplePhoneticAnalysisTests.java

License:Apache License

/**
 * Daitch-Mokotoff soundex filter: "chauptman" yields the two codes 473660 and 573660,
 * and the factory must produce a DaitchMokotoffSoundexFilter instance.
 * (Method name misspells "Mokotoff"; kept unchanged for compatibility.)
 */
public void testPhoneticTokenFilterDaitchMotokoff() throws IOException {
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("daitch_mokotoff");
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("chauptman"));
    assertThat(filterFactory.create(tokenizer), instanceOf(DaitchMokotoffSoundexFilter.class));
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer),
            new String[] { "473660", "573660" });
}