Example usage for org.apache.lucene.analysis MockTokenizer KEYWORD

Introduction

This page collects example usages of org.apache.lucene.analysis.MockTokenizer.KEYWORD drawn from open-source projects.

Prototype

CharacterRunAutomaton KEYWORD

Document

Acts similarly to KeywordTokenizer: the entire input is emitted as a single token.
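
A minimal standalone sketch of the constant in action (Lucene test-framework classes; the input string is illustrative):

import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
tokenizer.setReader(new StringReader("Quick Brown Fox"));
CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
tokenizer.reset();
while (tokenizer.incrementToken()) {
    // KEYWORD never splits: this prints "Quick Brown Fox" exactly once
    System.out.println(term.toString());
}
tokenizer.end();
tokenizer.close();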

Usage

From source file: org.apache.solr.analysis.MockTokenizerFactory.java

License: Apache License

/** Creates a new MockTokenizerFactory */
public MockTokenizerFactory(Map<String, String> args) {
    super(args);
    String patternArg = get(args, "pattern", Arrays.asList("keyword", "simple", "whitespace"));
    if ("keyword".equalsIgnoreCase(patternArg)) {
        pattern = MockTokenizer.KEYWORD;
    } else if ("simple".equalsIgnoreCase(patternArg)) {
        pattern = MockTokenizer.SIMPLE;
    } else {
        pattern = MockTokenizer.WHITESPACE;
    }

    enableChecks = getBoolean(args, "enableChecks", true);
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}
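
A hedged sketch of exercising the factory directly, outside a Solr schema. The constructor consumes recognized keys from the map, so any leftover entry triggers the IllegalArgumentException above; the no-arg create() is assumed from the TokenizerFactory base class:

Map<String, String> args = new HashMap<>();
args.put("pattern", "keyword");   // selects MockTokenizer.KEYWORD
args.put("enableChecks", "true");
MockTokenizerFactory factory = new MockTokenizerFactory(args);
Tokenizer tokenizer = factory.create(); // a MockTokenizer using the KEYWORD automaton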

From source file: org.apache.solr.spelling.TestSuggestSpellingConverter.java

License: Apache License

public void testSimple() throws Exception {
    // lowercases only!
    converter.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.KEYWORD, true));
    assertConvertsTo("This is a test", new String[] { "this is a test" });
}
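
Because MockTokenizer.KEYWORD never splits the input and the final true argument enables lowercasing, the whole phrase comes back as one lowercased token. The same contract can be checked directly with the assertAnalyzesTo helper from BaseTokenStreamTestCase (assumed available in this test hierarchy):

Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);
assertAnalyzesTo(a, "This is a test", new String[] { "this is a test" });
a.close();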

From source file: org.elasticsearch.analysis.hunspell.TestStemming.java

License: Apache License

public void test() throws Exception {
    LineNumberReader reader = new LineNumberReader(IOUtils.getDecodingReader(
            getClass().getResourceAsStream("/stemming-data/" + language + ".txt"), StandardCharsets.UTF_8));
    dictionaryStream = getClass().getResourceAsStream("/" + language + "/" + language + ".dic");
    affixStream = getClass().getResourceAsStream("/" + language + "/" + language + ".aff");
    final Dictionary dictionary = new Dictionary(affixStream, dictionaryStream);
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String field) {
            MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
            HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, false);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    String line = null;
    while ((line = reader.readLine()) != null) {
        int comment = line.indexOf('#');
        if (comment >= 0) {
            line = line.substring(0, comment);
        }
        line = line.trim();
        if (line.isEmpty()) {
            continue;
        }
        String[] elements = line.split("\\s+");
        if (elements.length != 2) {
            throw new RuntimeException("Illegal number of elements in line: " + reader.getLineNumber());
        }
        String input = elements[0];
        String[] outputs = elements[1].split(",");
        compareStems(analyzer, input, outputs, reader.getLineNumber());
    }
    analyzer.close();
    reader.close();
}
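
Since the analyzer tokenizes with MockTokenizer.KEYWORD, each dictionary word reaches HunspellStemFilter intact. A hedged sketch of what the compareStems helper presumably does for a single input (the field name and word are illustrative):

try (TokenStream ts = analyzer.tokenStream("field", "walked")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString()); // one line per stem emitted by the filter
    }
    ts.end();
}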

From source file: org.elasticsearch.test.MockKeywordPlugin.java

License: Apache License

@Override
public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
    return singletonMap("keyword", (indexSettings, environment, name, settings) -> {
        class Factory implements TokenizerFactory {

            @Override
            public Tokenizer create() {
                return new MockTokenizer(MockTokenizer.KEYWORD, false);
            }
        }
        return new Factory();
    });
}
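
A hedged sketch of installing the plugin in an integration test; nodePlugins() is the hook the Elasticsearch test framework (ESIntegTestCase, an assumed superclass here) exposes for test-only plugins:

@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
    return Collections.singletonList(MockKeywordPlugin.class);
}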

From source file: org.tallison.lucene.queryparser.spans.TestAdvancedAnalyzers.java

License: Apache License

@BeforeClass
public static void beforeClass() throws Exception {
    lcMultiTermAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);

    Map<String, String> attrs = new HashMap<>();
    attrs.put("generateWordParts", "1");
    attrs.put("generateNumberParts", "1");
    attrs.put("catenateWords", "1");
    attrs.put("catenateNumbers", "1");
    attrs.put("catenateAll", "1");
    attrs.put("splitOnCaseChange", "1");
    attrs.put("preserveOriginal", "1");
    complexAnalyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(TestAdvancedAnalyzers.class))
            .withTokenizer("whitespace").addTokenFilter("worddelimiter", attrs).addTokenFilter("kstem")
            .addTokenFilter("removeduplicates").build();

    synAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {

            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);

            filter = new MockSynFilter(filter);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockNonWhitespaceFilter(new MockSynFilter(in));
        }

    };

    baseAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockNonWhitespaceFilter(new LowerCaseFilter(in));
        }

    };

    ucVowelAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }
    };

    ucVowelMTAnalyzer = new Analyzer() {
        @Override
        public TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };

    Analyzer tmpUCVowelAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }
    };
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(baseAnalyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
    String[] docs = new String[] { "abc_def", "lmnop", "abc one", "abc two", "qrs one", "qrs two", "tuv one",
            "tuv two", "qrs tuv", "qrs_tuv" };
    for (int i = 0; i < docs.length; i++) {
        Document doc = new Document();
        doc.add(newTextField(FIELD1, docs[i], Field.Store.YES));
        TextField tf = new TextField(FIELD2, docs[i], Field.Store.YES);
        tf.setTokenStream(ucVowelAnalyzer.tokenStream(FIELD2, docs[i]));
        doc.add(tf);
        doc.add(newTextField(FIELD3, docs[i], Field.Store.YES));

        TextField tf4 = new TextField(FIELD4, docs[i], Field.Store.YES);
        tf4.setTokenStream(tmpUCVowelAnalyzer.tokenStream(FIELD4, docs[i]));
        doc.add(tf4);
        writer.addDocument(doc);
    }
    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
}
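
Both lcMultiTermAnalyzer and ucVowelMTAnalyzer tokenize with MockTokenizer.KEYWORD, so multiterm query text reaches their filters as a single token. The tokenizer half of that contract can be verified in isolation; a sketch using assertTokenStreamContents from BaseTokenStreamTestCase (the input is illustrative):

MockTokenizer t = new MockTokenizer(MockTokenizer.KEYWORD, true);
t.setReader(new StringReader("Abc Def"));
assertTokenStreamContents(t, new String[] { "abc def" }); // lowercased, never split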

From source file: org.tallison.lucene.queryparser.spans.TestOverallSpanQueryParser.java

License: Apache License

@BeforeClass
public static void beforeClass() throws Exception {

    ANALYZER = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
    MULTITERM_ANALYZER = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);

    DIRECTORY = newDirectory();

    RandomIndexWriter writer = new RandomIndexWriter(random(), DIRECTORY, newIndexWriterConfig(ANALYZER)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));

    String[] f1Docs = new String[] { "quick brown AND fox", //0
            "quick brown AND dog", //1
            "quick brown dog", //2
            "whan that aprile with its shoures perced", //3
            "its shoures pierced", //4
            "its shoures perced", //5
            "#####", //before asterisk  //6
            "&&&&&", //after asterisk for range query //7
            "ab*de", //8
            "abcde", //9
            "blah disco fever blah", //10
            "blah bieber fever blah", //11
            "blah dengue fever blah", //12
            "blah saturday night fever with john travolta", //13
            "understanding (span query)", //14
            "understanding (sp'an query)", //15
            "understanding something about (span query)", //16
            "0 1 fox 3 4 5 fox 7 8 9 10 fox"//17

    };
    String[] f2Docs = new String[] { "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
            "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen" };
    for (int i = 0; i < f1Docs.length; i++) {
        Document doc = new Document();
        doc.add(newTextField(FIELD1, f1Docs[i], Field.Store.YES));
        doc.add(newTextField(FIELD2, f2Docs[i], Field.Store.YES));
        writer.addDocument(doc);
    }
    READER = writer.getReader();
    SEARCHER = newSearcher(READER);
    writer.close();

    PARSER = new SpanQueryParser(FIELD1, ANALYZER, MULTITERM_ANALYZER);
}
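
The KEYWORD-based MULTITERM_ANALYZER is what keeps wildcard-like query text such as "ab*de" in one piece while still lowercasing it. A hedged illustration via assertAnalyzesTo from BaseTokenStreamTestCase (the input is illustrative):

assertAnalyzesTo(MULTITERM_ANALYZER, "Ab*De", new String[] { "ab*de" });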

From source file: org.tallison.lucene.queryparser.spans.TestQPTestBaseSpanQuery.java

License: Apache License

public CommonQueryParserConfiguration getParserConfig(Analyzer a, Analyzer mtAnalyzer) throws Exception {
    if (a == null) {
        a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
    }
    if (mtAnalyzer == null) {
        mtAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);
    }

    SQPTestingConfig qp = new SQPTestingConfig(getDefaultField(), a, mtAnalyzer);
    qp.setDefaultOperator(QueryParserBase.OR_OPERATOR);
    qp.setAnalyzeRangeTerms(true);
    return qp;
}
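
A hedged usage sketch: passing null for either analyzer falls back to the defaults built above, a SIMPLE tokenizer for ordinary terms and a lowercasing KEYWORD tokenizer for multiterm handling:

// Equivalent to supplying the two MockAnalyzer instances explicitly.
CommonQueryParserConfiguration config = getParserConfig(null, null);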

From source file: org.tallison.lucene.queryparser.spans.TestSpanOnlyQueryParser.java

License: Apache License

@BeforeClass
public static void beforeClass() throws Exception {

    lcMultiTermAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);
    noopMultiTermAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
    noStopAnalyzer = new Analyzer() {
        @Override
        public TokenStream normalize(String fieldName, TokenStream in) {
            return new MockStandardTokenizerFilter(new LowerCaseFilter(in));
        }

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
            TokenFilter filter = new MockStandardTokenizerFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };

    stopAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
            TokenFilter filter = new MockStandardTokenizerFilter(tokenizer);
            filter = new MockTokenFilter(filter, STOP_WORDS);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new LowerCaseFilter(in);
        }
    };

    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(stopAnalyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
    String[] docs = new String[] { "the quick brown fox ",
            "jumped over the lazy brown dog and the brown green cat", "quick green fox", "abcdefghijk",
            "over green lazy",
            // longish doc for recursion test
            "eheu fugaces postume postume labuntur anni nec " + "pietas moram rugis et instanti senectae "
                    + "adferet indomitaeque morti",
            // non-whitespace language
            "\u666E \u6797 \u65AF \u987F \u5927 \u5B66", "reg/exp", "/regex/", "fuzzy~2", "wil*card",
            "wil?card", "prefi*", "single'quote"

    };

    for (int i = 0; i < docs.length; i++) {
        Document doc = new Document();
        doc.add(newTextField(FIELD, docs[i], Field.Store.YES));
        writer.addDocument(doc);
    }
    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
}
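
The two multiterm analyzers built here differ only in lowercasing; both keep the query text as a single token thanks to MockTokenizer.KEYWORD. A sketch of the contrast using assertAnalyzesTo from BaseTokenStreamTestCase (the input is illustrative):

assertAnalyzesTo(lcMultiTermAnalyzer, "Wil*Card", new String[] { "wil*card" });   // lowercased
assertAnalyzesTo(noopMultiTermAnalyzer, "Wil*Card", new String[] { "Wil*Card" }); // case preserved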