Example usage for org.apache.lucene.analysis MockTokenizer WHITESPACE

List of usage examples for org.apache.lucene.analysis MockTokenizer WHITESPACE

Introduction

In this page you can find the example usage for org.apache.lucene.analysis MockTokenizer WHITESPACE.

Prototype

CharacterRunAutomaton WHITESPACE

To view the source code for org.apache.lucene.analysis MockTokenizer WHITESPACE, click the Source Link below.

Click Source Link

Document

Acts similarly to WhitespaceTokenizer.

Usage

From source file:brightsolid.solr.plugins.TestTargetPositionQuerySynonyms.java

License:Apache License

/**
 * Builds an in-memory index of three permutations of "one two three",
 * analyzed by a whitespace MockTokenizer feeding a SynonymFilter whose
 * map is parsed from an inline Solr-format synonym list.
 *
 * NOTE(review): statement order is significant — each random() call
 * consumes from the test framework's shared randomized seed.
 */
@Override
public void setUp() throws Exception {
    super.setUp();

    // Inline Solr synonym rules: each line is one comma-separated synonym group.
    String testFile = "one, uno, un\n" + "two, dos, too\n" + "three, free, tres";

    SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(random()));
    parser.parse(new StringReader(testFile));

    final SynonymMap map = parser.build();
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // Whitespace tokenization; third arg true presumably enables lowercasing — confirm against MockTokenizer docs.
            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true);
            return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
        }
    };

    directory = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, analyzer);
    Document doc = new Document();
    FieldType newType = new FieldType(org.apache.lucene.document.TextField.TYPE_STORED);
    newType.setOmitNorms(true);
    Field field = newField("field", "", newType);
    // NOTE(review): redundant with newType.setOmitNorms(true) above — confirm intent.
    field.fieldType().setOmitNorms(true);

    doc.add(field);

    // The same Document/Field instances are re-used: set the value, then add.
    field.setStringValue("one two three");
    iw.addDocument(doc);
    field.setStringValue("two three one");
    iw.addDocument(doc);
    field.setStringValue("three one two");
    iw.addDocument(doc);

    reader = iw.getReader();
    iw.close();
    searcher = newSearcher(reader);
}

From source file:edu.stanford.lucene.analysis.TestCJKFoldingFilter.java

License:Open Source License

/**
 * Verifies that tokens containing no CJK characters pass through
 * CJKFoldingFilter unchanged, and that the stream is exhausted after them.
 */
@Test
public void testNonCJKtokens() throws Exception {
    TokenStream source = new MockTokenizer(new StringReader("Des mot"), MockTokenizer.WHITESPACE, false);
    CJKFoldingFilter folded = new CJKFoldingFilter(source);
    CharTermAttribute term = folded.getAttribute(CharTermAttribute.class);

    folded.reset();
    assertTermEquals("Des", folded, term);
    assertTermEquals("mot", folded, term);
    assertFalse(folded.incrementToken());
}

From source file:gov.nih.nlm.ncbi.seqr.tokenizer.TestTokenSizeFilter.java

License:Apache License

/**
 * Runs a three-token whitespace input through TokenSizeFilter and checks
 * the resulting stream contents.
 */
public void testTokenSizeFilter() throws IOException {
    TokenStream words = new MockTokenizer(new StringReader("short too long"), MockTokenizer.WHITESPACE, false);
    TokenSizeFilter sized = new TokenSizeFilter(words);
    String[] expected = { "3" };
    assertTokenStreamContents(sized, expected);
}

From source file:info.freelibrary.solr.ISO639ConversionFilterTest.java

License:Apache License

/**
 * Does the work of the test: tokenizes {@code aInput} with a whitespace
 * {@link MockTokenizer}, applies the {@code ISO639ConversionFilter}, and
 * asserts the stream matches {@code aExpected}.
 *
 * @param aInput The ISO-639 two or three digit code
 * @param aExpected The expected results
 * @throws Exception If there is trouble tokenizing or converting
 */
static void assertConvertsTo(String aInput, String[] aExpected) throws Exception {
    Tokenizer source = new MockTokenizer(new StringReader(aInput), MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(new ISO639ConversionFilter(source), aExpected);

    // TODO: Do we want to test with other tokenizers? Using WHITESPACE.
}

From source file:org.apache.solr.analysis.entity.TestEntityFilter.java

License:Apache License

/**
 * With sort encoding disabled, "N" (numeric) entities emit their raw
 * trailing value.
 */
public void testNumericTokensFromFilter() throws Exception {
    String text = "N/2/6/2/10 N/8/57/5/20549 N/5/42/2/13";
    Pattern shape = Pattern.compile("(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)?");
    TokenStream words = new MockTokenizer(new StringReader(text), MockTokenizer.WHITESPACE, false);
    TokenStream filtered = new EntityFilter(words, shape, "N", false);
    assertTokenStreamContents(filtered, new String[] { "10", "20549", "13" });
}

From source file:org.apache.solr.analysis.entity.TestEntityFilter.java

License:Apache License

/**
 * With sort encoding enabled, "N" (numeric) entity values are emitted in
 * NumberUtils' sortable-string form.
 */
public void testNumericTokensFromFilterWithSort() throws Exception {
    String text = "N/2/6/2/10 N/8/57/5/20549 N/5/42/2/13";
    Pattern shape = Pattern.compile("(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)?");
    TokenStream filtered = new EntityFilter(
            new MockTokenizer(new StringReader(text), MockTokenizer.WHITESPACE, false), shape, "N", true);
    String[] expected = { NumberUtils.double2sortableStr(10), NumberUtils.double2sortableStr(20549),
            NumberUtils.double2sortableStr(13) };
    assertTokenStreamContents(filtered, expected);
}

From source file:org.apache.solr.analysis.entity.TestEntityFilter.java

License:Apache License

/**
 * With sort encoding disabled, "M" (money) entities emit their raw
 * trailing value, including decimals and scientific notation.
 */
public void testMoneyTokensFromFilter() throws Exception {
    String text = "M/2/4/2/15 M/8/47/6/25.00 M/10/48/13/9.24E7";
    Pattern shape = Pattern.compile("(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)?");
    TokenStream words = new MockTokenizer(new StringReader(text), MockTokenizer.WHITESPACE, false);
    TokenStream filtered = new EntityFilter(words, shape, "M", false);
    assertTokenStreamContents(filtered, new String[] { "15", "25.00", "9.24E7" });
}

From source file:org.apache.solr.analysis.entity.TestEntityFilter.java

License:Apache License

/**
 * With sort encoding enabled, "P" (percentage) entity values — including
 * negatives — are emitted in NumberUtils' sortable-string form.
 */
public void testPercentageTokensFromFilter() throws Exception {
    String text = "P/4/23/4/100 P/6/19/5/2.96 P/1/1/1/-1";
    Pattern shape = Pattern.compile("(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)?");
    TokenStream filtered = new EntityFilter(
            new MockTokenizer(new StringReader(text), MockTokenizer.WHITESPACE, false), shape, "P", true);
    String[] expected = { NumberUtils.double2sortableStr(100), NumberUtils.double2sortableStr(2.96),
            NumberUtils.double2sortableStr(-1) };
    assertTokenStreamContents(filtered, expected);
}

From source file:org.apache.solr.analysis.entity.TestEntityFilter.java

License:Apache License

/**
 * With sort encoding enabled, "D" (date) entity values are emitted as
 * sortable strings of the parsed epoch millis. Placeholder components
 * ("XXXX"/"XX") resolve to the defaults year 1400 / day 01.
 */
public void testDateTokensFromFilter() throws Exception {
    String text = " D/8/39/4/1934-01-01 D/4/5/17/2013-12 D/4/5/17/2013 D/4/19/8/XXXX-03-01 D/4/19/8/XXXX-03 D/4/19/8/XXXX-XX-01";
    SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
    SimpleDateFormat monthFormat = new SimpleDateFormat("yyyy-MM");
    SimpleDateFormat yearFormat = new SimpleDateFormat("yyyy");
    TokenStream filtered = new EntityFilter(
            new MockTokenizer(new StringReader(text), MockTokenizer.WHITESPACE, false),
            Pattern.compile("(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)?"), "D", true);
    String[] expected = { NumberUtils.long2sortableStr(dayFormat.parse("1934-01-01").getTime()),
            NumberUtils.long2sortableStr(monthFormat.parse("2013-12").getTime()),
            NumberUtils.long2sortableStr(yearFormat.parse("2013").getTime()),
            NumberUtils.long2sortableStr(dayFormat.parse("1400-03-01").getTime()),
            NumberUtils.long2sortableStr(monthFormat.parse("1400-03").getTime()),
            NumberUtils.long2sortableStr(dayFormat.parse("1400-01-01").getTime()) };
    assertTokenStreamContents(filtered, expected);
}

From source file:org.apache.solr.analysis.entity.TestEntityFilter.java

License:Apache License

/**
 * With sort encoding disabled, "R" (ratio) entities emit their raw
 * trailing value unchanged.
 */
public void testRatioTokensFromFilter() throws Exception {
    String text = "R/5/29/5/10-00 R/22/143/5/4.5-1 R/26/123/5/12-01";
    Pattern shape = Pattern.compile("(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)?");
    TokenStream words = new MockTokenizer(new StringReader(text), MockTokenizer.WHITESPACE, false);
    TokenStream filtered = new EntityFilter(words, shape, "R", false);
    assertTokenStreamContents(filtered, new String[] { "10-00", "4.5-1", "12-01" });
}