Java tutorial: testing Luwak's SuffixingNGramTokenFilter (TestSuffixingNGramTokenizer)
package uk.co.flax.luwak.analysis;

import java.io.File;
import java.io.IOException;

import com.google.common.base.Charsets;
import com.google.common.io.Files;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.LeafReader;
import org.junit.Test;

import uk.co.flax.luwak.DocumentBatch;
import uk.co.flax.luwak.InputDocument;

import static uk.co.flax.luwak.assertions.TokenStreamAssert.assertThat;

/*
 * Copyright (c) 2014 Lemur Consulting Ltd.
 * <p/>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
public class TestSuffixingNGramTokenizer {

    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new WhitespaceTokenizer();
            TokenStream sink = new SuffixingNGramTokenFilter(source, "XX", "ANY", 10);
            return new TokenStreamComponents(source, sink);
        }
    };

    @Test
    public void testTokensAreSuffixed() throws IOException {
        TokenStream ts = analyzer.tokenStream("f", "term");
        //TokenStreamUtils.dumpTokenStream(ts);
        assertThat(ts)
                .nextEquals("term").nextEquals("termXX").nextEquals("terXX").nextEquals("teXX")
                .nextEquals("tXX").nextEquals("ermXX").nextEquals("erXX").nextEquals("eXX").nextEquals("rmXX")
                .nextEquals("rXX").nextEquals("mXX").nextEquals("XX")
                .isExhausted();
    }

    @Test
    public void testRepeatedSuffixesAreNotEmitted() throws IOException {
        TokenStream ts = analyzer.tokenStream("f", "arm harm term");
        assertThat(ts)
                .nextEquals("arm").nextEquals("armXX").nextEquals("arXX").nextEquals("aXX").nextEquals("rmXX")
                .nextEquals("rXX").nextEquals("mXX").nextEquals("XX").nextEquals("harm").nextEquals("harmXX")
                .nextEquals("harXX").nextEquals("haXX").nextEquals("hXX").nextEquals("term").nextEquals("termXX")
                .nextEquals("terXX").nextEquals("teXX").nextEquals("tXX").nextEquals("ermXX").nextEquals("erXX")
                .nextEquals("eXX")
                .isExhausted();
    }

    @Test
    public void testRepeatedInfixesAreNotEmitted() throws IOException {
        TokenStream ts = analyzer.tokenStream("f", "alarm alas harm");
        assertThat(ts)
                .nextEquals("alarm").nextEquals("alarmXX").nextEquals("alarXX").nextEquals("alaXX")
                .nextEquals("alXX").nextEquals("aXX").nextEquals("larmXX").nextEquals("larXX").nextEquals("laXX")
                .nextEquals("lXX").nextEquals("armXX").nextEquals("arXX").nextEquals("rmXX").nextEquals("rXX")
                .nextEquals("mXX").nextEquals("XX").nextEquals("alas").nextEquals("alasXX").nextEquals("lasXX")
                .nextEquals("asXX").nextEquals("sXX").nextEquals("harm").nextEquals("harmXX").nextEquals("harXX")
                .nextEquals("haXX").nextEquals("hXX")
                .isExhausted();
    }

    @Test
    public void testLengthyTokensAreNotNgrammed() throws IOException {
        TokenStream ts = analyzer.tokenStream("f", "alongtermthatshouldntbengrammed");
        assertThat(ts)
                .nextEquals("alongtermthatshouldntbengrammed")
                .nextEquals("ANY")
                .isExhausted();
    }

    public static void main(String... args) throws IOException {

        String text = Files.toString(new File("src/test/resources/gutenberg/README"), Charsets.UTF_8);
        DocumentBatch batch = DocumentBatch
                .of(InputDocument.builder("1").addField("f", text, new StandardAnalyzer()).build());

        for (int i = 0; i < 50; i++) {
            long time = System.currentTimeMillis();
            // Cannot use try-with-resources here as we assign to ts in the block.
            LeafReader reader = batch.getIndexReader();
            TokenStream ts = new TermsEnumTokenStream(reader.fields().terms("f").iterator());
            try {
                ts = new SuffixingNGramTokenFilter(ts, "XX", "__WILDCARD__", 20);
                //ts = new DuplicateRemovalTokenFilter(ts);
                int tokencount = 0;
                ts.reset();
                while (ts.incrementToken()) {
                    tokencount++;
                }
                System.out.println(tokencount + " tokens in " + (System.currentTimeMillis() - time) + " ms");
            } finally {
                ts.close();
            }
        }
    }
}
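If you want to see the filter's output directly rather than through the assertion helper, here is a minimal standalone sketch that drives the same analyzer wiring and prints each emitted token, much like the commented-out TokenStreamUtils.dumpTokenStream call above. The class name DumpSuffixedNGrams and the sample input are illustrative only, not part of Luwak; the class is placed in the same package so SuffixingNGramTokenFilter resolves without an extra import, and it otherwise uses only standard Lucene APIs.

package uk.co.flax.luwak.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Illustrative helper (not part of Luwak): prints the tokens the suffixing
// n-gram analyzer produces for a single input string.
public class DumpSuffixedNGrams {

    public static void main(String[] args) throws IOException {
        // Same wiring as the analyzer field in the test above:
        // whitespace tokenization followed by SuffixingNGramTokenFilter.
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer source = new WhitespaceTokenizer();
                TokenStream sink = new SuffixingNGramTokenFilter(source, "XX", "ANY", 10);
                return new TokenStreamComponents(source, sink);
            }
        };

        try (TokenStream ts = analyzer.tokenStream("f", "term")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // Should print the sequence asserted in testTokensAreSuffixed:
                // term, termXX, terXX, teXX, tXX, ermXX, erXX, eXX, rmXX, rXX, mXX, XX
                System.out.println(term.toString());
            }
            ts.end();
        }
    }
}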