Example usage for org.apache.lucene.analysis Tokenizer setReader

List of usage examples for org.apache.lucene.analysis.Tokenizer#setReader

Introduction

This page collects usage examples of the setReader method of org.apache.lucene.analysis.Tokenizer.

Prototype

public final void setReader(Reader input) 

Source Link

Document

Expert: Set a new reader on the Tokenizer.

Usage

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

@Test
public void testShortSentence() throws Exception {
    Tokenizer tokenizer = createTokenizer(new StringReader(" ? ?"), 2);
    assertEquals(/* www.j  av a 2s .c om*/
            ":N:NNG:null:1:1:0:1,:N:NNG:null:1:1:1:3,"
                    + ":N:NNG:null:1:1:4:5,?:COMPOUND:Compound:null:0:2:4:7,"
                    + "?:N:NNG:null:1:1:5:7,?:N:NNG:null:1:1:8:12,",
            tokenizerToString(tokenizer));

    tokenizer.reset();
    tokenizer.setReader(new StringReader(" ?? ."));
    assertEquals(":N:NNG:null:1:1:0:2,?:N:NNG:null:1:1:3:5,"
            + "?:COMPOUND:Compound:null:0:2:3:6,:N:NNG:null:1:1:5:6,"
            + "?:EOJEOL:NNG+JKS:null:1:1:6:8,:N:NNG:null:0:1:6:7,"
            + ":EOJEOL:VV+EP+EF:null:1:1:9:14,", tokenizerToString(tokenizer));
    tokenizer.close();
}

From source file:com.grantingersoll.opengrok.analysis.TestSymbolTokenizerFactories.java

License:Open Source License

private void assertTokenization(SymbolTokenizerFactory factory, String input, String[] output)
        throws Exception {
    Reader reader = new StringReader(input);
    Tokenizer stream = factory.create(newAttributeFactory());
    stream.setReader(reader);
    assertTokenStreamContents(stream, output);
}

From source file:com.shaie.annots.AnnotatingTokenStreamExample.java

License:Apache License

public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
    TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);/*from  ww  w .  j av  a  2s.c om*/

    System.out.println("Text tokens:\n");

    // consume all the tokens from the original stream. this also populates the
    // Sink (colors) with its color-matching tokens
    teeSink.reset();
    CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
    int termsPos = -1;
    while (teeSink.incrementToken()) {
        termsPos += termPosAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + termsPos);
    }
    teeSink.end();
    tokenizer.end();

    System.out.println("\nAnnotation tokens:\n");

    // now consume the color annotation tokens from the colors stream
    CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
    ByteArrayDataInput in = new ByteArrayDataInput();
    colors.reset();
    while (colors.incrementToken()) {
        BytesRef bytes = payloadAtt.getPayload();
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    colors.end();
    colors.close();

    teeSink.close();
    tokenizer.close();
}

From source file:com.shaie.annots.AnnotationSearchExample.java

License:Apache License

public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());
    IndexWriter writer = new IndexWriter(dir, conf);

    // we need to add the annotation as a TokenStream field, therefore cannot use an Analyzer passed in the
    // IndexWriterConfig.
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken"));
    TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    TokenStream colorAnnotationStream = new AnnotatingTokenFilter(
            textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);

    Document doc = new Document();
    doc.add(new TextField("text", textStream));
    doc.add(new TextField("annot", colorAnnotationStream));
    writer.addDocument(doc);//  ww  w.  ja  v  a2s .  c  o m

    writer.close();

    DirectoryReader reader = DirectoryReader.open(dir);
    LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment
    printFieldTerms(ar, "text");
    System.out.println();

    final ByteArrayDataInput in = new ByteArrayDataInput();
    PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM));
    int docID = dape.nextDoc();
    int freq = dape.freq();
    System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq);
    for (int i = 0; i < freq; i++) {
        dape.nextPosition();
        BytesRef payload = dape.getPayload();
        in.reset(payload.bytes, payload.offset, payload.length);
        System.out.println("  start=" + in.readVInt() + ", length=" + in.readVInt());
    }

    IndexSearcher searcher = new IndexSearcher(reader);

    System.out.println("\nsearching for 'red WITHIN color':");
    Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
            new SpanInclusivePositionTermQuery(new Term("text", "red")));
    TopDocs td = searcher.search(q, 10);
    System.out.println("  num results: " + td.scoreDocs.length);

    System.out.println("\nsearching for 'ate WITHIN color':");
    q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
            new SpanInclusivePositionTermQuery(new Term("text", "ate")));
    td = searcher.search(q, 10);
    System.out.println("  num results: " + td.scoreDocs.length);

    reader.close();
    dir.close();
}

From source file:com.shaie.annots.example.AnnotatorAnyExample.java

License:Apache License

@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    final TokenStream colorsStream = new AnyAnnotationTokenFilter(
            new AnnotatorTokenFilter(textStream.newSinkTokenStream(), ColorAnnotator.withDefaultColors()));
    final TokenStream animalsStream = new AnyAnnotationTokenFilter(
            new AnnotatorTokenFilter(textStream.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals()));

    final Document doc = new Document();
    doc.add(new StoredField(TEXT_FIELD, text));
    doc.add(new TextField(TEXT_FIELD, textStream));
    doc.add(new TextField(COLOR_FIELD, colorsStream));
    doc.add(new TextField(ANIMAL_FIELD, animalsStream));
    writer.addDocument(doc);// ww w  . ja  v  a  2  s  . co m
}

From source file:com.shaie.annots.example.AnnotatorTeeSinkFilterExample.java

License:Apache License

@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    final TokenStream colorsStream = new AnnotatorTokenFilter(textStream.newSinkTokenStream(),
            ColorAnnotator.withDefaultColors());
    final TokenStream animalsStream = new AnnotatorTokenFilter(textStream.newSinkTokenStream(),
            AnimalAnnotator.withDefaultAnimals());

    final Document doc = new Document();
    doc.add(new StoredField(TEXT_FIELD, text));
    doc.add(new TextField(TEXT_FIELD, textStream));
    doc.add(new TextField(COLOR_FIELD, colorsStream));
    doc.add(new TextField(ANIMAL_FIELD, animalsStream));
    writer.addDocument(doc);//from ww  w .ja  v a2s.  com
}

From source file:com.shaie.annots.example.PreAnnotatedTokenFilterExample.java

License:Apache License

@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text, int... colorAnnotations) throws IOException {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    final TokenStream colorsStream = new PreAnnotatedTokenFilter(textStream.newSinkTokenStream(),
            colorAnnotations);//from  w ww .j a v a2  s  .c  o  m

    final Document doc = new Document();
    doc.add(new StoredField(TEXT_FIELD, text));
    doc.add(new TextField(TEXT_FIELD, textStream));
    doc.add(new TextField(COLOR_FIELD, colorsStream));
    writer.addDocument(doc);
}

From source file:com.shaie.annots.example.SimplePreAnnotatedTokenFilterExample.java

License:Apache License

@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text, int... colorAnnotations) throws IOException {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    final TokenStream colorsStream = new AnyAnnotationTokenFilter(
            new SimplePreAnnotatedTokenFilter(textStream.newSinkTokenStream(), colorAnnotations));

    final Document doc = new Document();
    doc.add(new StoredField(TEXT_FIELD, text));
    doc.add(new TextField(TEXT_FIELD, textStream));
    doc.add(new TextField(COLOR_FIELD, colorsStream));
    writer.addDocument(doc);/* w w w . j a  v a 2  s  . com*/
}

From source file:com.shaie.SynonymFilterExample.java

License:Apache License

@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("dark sea green sea green"));

    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);
    final SynonymMap synMap = builder.build();
    final TokenStream ts = new SynonymGraphFilter(tok, synMap, true);

    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);

    ts.reset();/*from  ww  w  . j  av  a 2  s  .  c  o m*/
    int pos = -1;
    while (ts.incrementToken()) {
        pos += posIncrAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength());
    }
    ts.end();
    ts.close();
}

From source file:com.sindicetech.siren.analysis.filter.TestMailtoFilter.java

License:Open Source License

private void assertURLDecodedTo(final Tokenizer t, final String uri, final String[] expectedStems,
        final String[] expectedTypes, final int[] expectedPosIncr) throws IOException {
    assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

    assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
    final TypeAttribute typeAtt = t.getAttribute(TypeAttribute.class);

    assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
    final PositionIncrementAttribute posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);

    t.setReader(new StringReader(uri));
    t.reset();//from   w  w w .j  ava2 s  .  c  o  m

    final TokenFilter filter = new MailtoFilter(t);
    for (int i = 0; i < expectedStems.length; i++) {
        assertTrue("token " + i + " exists", filter.incrementToken());
        assertEquals(expectedStems[i], termAtt.toString());
        if (expectedTypes == null)
            assertEquals(uritype, typeAtt.type());
        else
            assertEquals(expectedTypes[i], typeAtt.type());
        if (expectedPosIncr != null)
            assertEquals(expectedPosIncr[i], posIncrAtt.getPositionIncrement());
    }
    filter.end();
    filter.close();
}