Example usage for org.apache.lucene.analysis MockTokenizer MockTokenizer

List of usage examples for org.apache.lucene.analysis MockTokenizer MockTokenizer

Introduction

In this page you can find the example usage for org.apache.lucene.analysis MockTokenizer MockTokenizer.

Prototype

public MockTokenizer() 

Source Link

Document

The no-argument constructor delegates to MockTokenizer(CharacterRunAutomaton, boolean), invoking it as MockTokenizer(WHITESPACE, true) — a whitespace-splitting mock tokenizer with lowercasing enabled.

Usage

From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java

License:Apache License

/**
 * Indexes documents whose eight fields cover the index-option combinations
 * (docs-only, docs+freqs, positions, positions+offsets, and fixed/variable
 * payloads with and without offsets), all forced onto the Rocana Lucene50
 * postings format, then verifies the index both before and after a
 * forceMerge(1).
 */
public void test() throws Exception {
    Directory dir = newDirectory();
    // Per-field analyzer: payload-producing filters are attached only to
    // fields whose names contain "payloadsFixed" / "payloadsVariable".
    Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer();
            if (fieldName.contains("payloadsFixed")) {
                // 1-byte payload per token; fixed seed keeps the stream deterministic
                TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1);
                return new TokenStreamComponents(tokenizer, filter);
            } else if (fieldName.contains("payloadsVariable")) {
                // variable-length payload per token, same fixed seed
                TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer);
                return new TokenStreamComponents(tokenizer, filter);
            } else {
                return new TokenStreamComponents(tokenizer);
            }
        }
    };
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    // TODO we could actually add more fields implemented with different PFs
    // or, just put this test into the usual rotation?
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();
    FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsOnlyType.setStoreTermVectors(true);
    docsOnlyType.setIndexOptions(IndexOptions.DOCS);

    FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsAndFreqsType.setStoreTermVectors(true);
    docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

    FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn these on for a cross-check
    positionsType.setStoreTermVectors(true);
    positionsType.setStoreTermVectorPositions(true);
    positionsType.setStoreTermVectorOffsets(true);
    positionsType.setStoreTermVectorPayloads(true);
    FieldType offsetsType = new FieldType(positionsType);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // Field names double as analyzer selectors (see createComponents above).
    Field field1 = new Field("field1docs", "", docsOnlyType);
    Field field2 = new Field("field2freqs", "", docsAndFreqsType);
    Field field3 = new Field("field3positions", "", positionsType);
    Field field4 = new Field("field4offsets", "", offsetsType);
    Field field5 = new Field("field5payloadsFixed", "", positionsType);
    Field field6 = new Field("field6payloadsVariable", "", positionsType);
    Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
    Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
    doc.add(field1);
    doc.add(field2);
    doc.add(field3);
    doc.add(field4);
    doc.add(field5);
    doc.add(field6);
    doc.add(field7);
    doc.add(field8);
    for (int i = 0; i < MAXDOC; i++) {
        // Every field in a document carries the same text: a unique id, a
        // very common term, the spelled-out number, and a random tail term.
        String stringValue = Integer.toString(i) + " verycommon " + English.intToEnglish(i).replace('-', ' ')
                + " " + TestUtil.randomSimpleString(random());
        field1.setStringValue(stringValue);
        field2.setStringValue(stringValue);
        field3.setStringValue(stringValue);
        field4.setStringValue(stringValue);
        field5.setStringValue(stringValue);
        field6.setStringValue(stringValue);
        field7.setStringValue(stringValue);
        field8.setStringValue(stringValue);
        iw.addDocument(doc);
    }
    iw.close();
    verify(dir);
    TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge
    // Reopen in APPEND mode just to force-merge down to one segment, then
    // verify again against the merged index.
    iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    iwc.setOpenMode(OpenMode.APPEND);
    IndexWriter iw2 = new IndexWriter(dir, iwc);
    iw2.forceMerge(1);
    iw2.close();
    verify(dir);
    dir.close();
}

From source file:org.elasticsearch.analysis.common.CommonAnalysisPluginTests.java

License:Apache License

/**
 * Check that the deprecated filter name "nGram" issues a deprecation warning
 * for indices created on or after version 6.4.0 (the code draws a random
 * created-version from [6.4.0, CURRENT]).
 */
public void testNGramDeprecationWarning() throws IOException {
    Version indexVersion = VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, Version.CURRENT);
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
            .put(IndexMetaData.SETTING_VERSION_CREATED, indexVersion)
            .build();

    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    try (CommonAnalysisPlugin plugin = new CommonAnalysisPlugin()) {
        Map<String, TokenFilterFactory> tokenFilters =
                createTestAnalysis(idxSettings, settings, plugin).tokenFilter;
        TokenFilterFactory deprecatedFactory = tokenFilters.get("nGram");
        Tokenizer source = new MockTokenizer();
        source.setReader(new StringReader("foo bar"));
        assertNotNull(deprecatedFactory.create(source));
        // Creating the filter under the legacy name must emit the warning.
        assertWarnings("The [nGram] token filter name is deprecated and will be removed in a future version. "
                + "Please change the filter name to [ngram] instead.");
    }
}

From source file:org.elasticsearch.analysis.common.CommonAnalysisPluginTests.java

License:Apache License

/**
 * Check that the deprecated filter name "nGram" does NOT issue a deprecation
 * warning for indices created before 6.4.0 (random created-version drawn
 * from [5.0.0, 6.3.0]).
 */
public void testNGramNoDeprecationWarningPre6_4() throws IOException {
    Version indexVersion = VersionUtils.randomVersionBetween(random(), Version.V_5_0_0, Version.V_6_3_0);
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
            .put(IndexMetaData.SETTING_VERSION_CREATED, indexVersion)
            .build();

    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    try (CommonAnalysisPlugin plugin = new CommonAnalysisPlugin()) {
        Map<String, TokenFilterFactory> tokenFilters =
                createTestAnalysis(idxSettings, settings, plugin).tokenFilter;
        TokenFilterFactory legacyFactory = tokenFilters.get("nGram");
        Tokenizer source = new MockTokenizer();
        source.setReader(new StringReader("foo bar"));
        // No assertWarnings here: pre-6.4 indices keep the old name silently.
        assertNotNull(legacyFactory.create(source));
    }
}

From source file:org.elasticsearch.analysis.common.CommonAnalysisPluginTests.java

License:Apache License

/**
 * Check that the deprecated filter name "edgeNGram" issues a deprecation
 * warning for indices created on or after version 6.4.0 (random
 * created-version drawn from [6.4.0, CURRENT]).
 */
public void testEdgeNGramDeprecationWarning() throws IOException {
    Version indexVersion = VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, Version.CURRENT);
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
            .put(IndexMetaData.SETTING_VERSION_CREATED, indexVersion)
            .build();

    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    try (CommonAnalysisPlugin plugin = new CommonAnalysisPlugin()) {
        Map<String, TokenFilterFactory> tokenFilters =
                createTestAnalysis(idxSettings, settings, plugin).tokenFilter;
        TokenFilterFactory deprecatedFactory = tokenFilters.get("edgeNGram");
        Tokenizer source = new MockTokenizer();
        source.setReader(new StringReader("foo bar"));
        assertNotNull(deprecatedFactory.create(source));
        // Creating the filter under the legacy name must emit the warning.
        assertWarnings(
                "The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
                        + "Please change the filter name to [edge_ngram] instead.");
    }
}

From source file:org.elasticsearch.analysis.common.CommonAnalysisPluginTests.java

License:Apache License

/**
 * Check that the deprecated filter name "edgeNGram" does NOT issue a
 * deprecation warning for indices created before 6.4.0 (random
 * created-version drawn from [5.0.0, 6.3.0]).
 */
public void testEdgeNGramNoDeprecationWarningPre6_4() throws IOException {
    Version indexVersion = VersionUtils.randomVersionBetween(random(), Version.V_5_0_0, Version.V_6_3_0);
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
            .put(IndexMetaData.SETTING_VERSION_CREATED, indexVersion)
            .build();

    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
    try (CommonAnalysisPlugin plugin = new CommonAnalysisPlugin()) {
        Map<String, TokenFilterFactory> tokenFilters =
                createTestAnalysis(idxSettings, settings, plugin).tokenFilter;
        TokenFilterFactory legacyFactory = tokenFilters.get("edgeNGram");
        Tokenizer source = new MockTokenizer();
        source.setReader(new StringReader("foo bar"));
        // No assertWarnings here: pre-6.4 indices keep the old name silently.
        assertNotNull(legacyFactory.create(source));
    }
}

From source file:org.elasticsearch.analysis.common.NGramTokenizerFactoryTests.java

License:Apache License

/**
 * Exercises the edge-ngram token filter factory across random index
 * versions, verifying that the legacy "side":"back" setting is realized by
 * wrapping the stream in a ReverseStringFilter, while the normal path yields
 * an EdgeNGramTokenFilter.
 */
public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws Exception {
    int rounds = scaledRandomIntBetween(20, 100);
    for (int round = 0; round < rounds; round++) {
        final Index index = new Index("test", "_na_");
        final String filterName = "ngr";
        Version version = randomVersion(random());
        Builder filterSettings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3);
        boolean backSide = random().nextBoolean();
        if (backSide) {
            filterSettings.put("side", "back");
        }
        Settings settings = filterSettings.build();
        Settings indexSettings = newAnalysisSettingsBuilder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, version.id).build();
        Tokenizer source = new MockTokenizer();
        source.setReader(new StringReader("foo bar"));
        EdgeNGramTokenFilterFactory factory = new EdgeNGramTokenFilterFactory(
                IndexSettingsModule.newIndexSettings(index, indexSettings), null, filterName, settings);
        TokenStream filtered = factory.create(source);
        if (backSide) {
            assertThat(filtered, instanceOf(ReverseStringFilter.class));
        } else {
            assertThat(filtered, instanceOf(EdgeNGramTokenFilter.class));
        }
    }
}

From source file:org.elasticsearch.index.analysis.AnalysisPolishFactoryTests.java

License:Apache License

/**
 * Runs the given token filter over random data via
 * BaseTokenStreamTestCase.checkRandomData, which exercises the analyzer
 * enough to surface thread-safety/reuse problems in the factory's filters.
 */
private void testThreadSafety(TokenFilterFactory factory) throws IOException {
    final Analyzer wrapped = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new MockTokenizer();
            return new TokenStreamComponents(source, factory.create(source));
        }
    };
    BaseTokenStreamTestCase.checkRandomData(random(), wrapped, 100);
}

From source file:org.elasticsearch.index.mapper.core.LegacyTokenCountFieldMapperTests.java

License:Apache License

/**
 * Verifies that countPositions totals each token's position increment plus
 * the stream's final increment: 0 (t1) + 1 (t2) + 2 (t3) + 4 (final) == 7.
 */
public void testCountPositions() throws IOException {
    // We're looking to make sure that we:
    Token t1 = new Token(); // Don't count tokens without an increment
    t1.setPositionIncrement(0);
    Token t2 = new Token();
    t2.setPositionIncrement(1); // Count normal tokens with one increment
    Token t3 = new Token();
    // FIX: this line previously re-set t2's increment (t2.setPositionIncrement(2)),
    // leaving t3 at its default increment. The expected total of 7 was still met
    // (0 + 2 + 1 + 4), which masked the typo; now each token matches its comment.
    t3.setPositionIncrement(2); // Count funny tokens with more than one increment
    int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
    Token[] tokens = new Token[] { t1, t2, t3 };
    // Shuffle to prove the count is order-independent.
    Collections.shuffle(Arrays.asList(tokens), random());
    final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
    // TODO: we have no CannedAnalyzer?
    Analyzer analyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new MockTokenizer(), tokenStream);
        }
    };
    assertThat(LegacyTokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
}

From source file:org.elasticsearch.index.mapper.TokenCountFieldMapperTests.java

License:Apache License

/**
 * Verifies that countPositions totals each token's position increment plus
 * the stream's final increment: 0 (t1) + 1 (t2) + 2 (t3) + 4 (final) == 7.
 */
public void testCountPositions() throws IOException {
    // We're looking to make sure that we:
    Token t1 = new Token(); // Don't count tokens without an increment
    t1.setPositionIncrement(0);
    Token t2 = new Token();
    t2.setPositionIncrement(1); // Count normal tokens with one increment
    Token t3 = new Token();
    // FIX: this line previously re-set t2's increment (t2.setPositionIncrement(2)),
    // leaving t3 at its default increment. The expected total of 7 was still met
    // (0 + 2 + 1 + 4), which masked the typo; now each token matches its comment.
    t3.setPositionIncrement(2); // Count funny tokens with more than one increment
    int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
    Token[] tokens = new Token[] { t1, t2, t3 };
    // Shuffle to prove the count is order-independent.
    Collections.shuffle(Arrays.asList(tokens), random());
    final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
    // TODO: we have no CannedAnalyzer?
    Analyzer analyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new MockTokenizer(), tokenStream);
        }
    };
    assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
}