Example usage for the org.apache.lucene.analysis.core.WhitespaceTokenizer constructor WhitespaceTokenizer()

List of usage examples for the org.apache.lucene.analysis.core.WhitespaceTokenizer constructor WhitespaceTokenizer()

Introduction

This page collects usage examples for the no-argument constructor WhitespaceTokenizer() of org.apache.lucene.analysis.core.WhitespaceTokenizer.

Prototype

public WhitespaceTokenizer() 

Source Link

Document

Construct a new WhitespaceTokenizer.

Usage

From source file:com.shaie.annots.AnnotatingTokenStreamExample.java

License:Apache License

/**
 * Tokenizes a fixed sentence on whitespace, prints every token with its position, and then
 * prints the tokens of the color annotation stream together with the start and length values
 * decoded from each token's payload.
 *
 * <p>The tee stream must be fully consumed before the {@code colors} stream is read, because
 * consuming the tee is what populates the sink that {@code colors} wraps.
 *
 * @param args ignored
 * @throws Exception if reading from or closing any of the token streams fails
 */
public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";

    // try-with-resources closes the streams in reverse declaration order (colors, teeSink,
    // tokenizer), which is the order the explicit close() calls used, and additionally
    // releases them when an exception is thrown while consuming tokens.
    try (Tokenizer tokenizer = new WhitespaceTokenizer();
            TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
            TokenStream colors = new AnnotatingTokenFilter(
                    teeSink.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM)) {
        tokenizer.setReader(new StringReader(text));

        System.out.println("Text tokens:\n");

        // consume all the tokens from the original stream. this also populates the
        // Sink (colors) with its color-matching tokens
        teeSink.reset();
        CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
        // start at -1 so that the first position increment yields position 0
        int termsPos = -1;
        while (teeSink.incrementToken()) {
            termsPos += termPosAtt.getPositionIncrement();
            System.out.println("term=" + termAtt + ", pos=" + termsPos);
        }
        teeSink.end();
        tokenizer.end();

        System.out.println("\nAnnotation tokens:\n");

        // now consume the color annotation tokens from the colors stream
        CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
        ByteArrayDataInput in = new ByteArrayDataInput();
        colors.reset();
        while (colors.incrementToken()) {
            // the payload is read as two consecutive VInts: start, then length
            BytesRef bytes = payloadAtt.getPayload();
            in.reset(bytes.bytes, bytes.offset, bytes.length);
            System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
        }
        colors.end();
    }
}

From source file:com.shaie.annots.AnnotationSearchExample.java

License:Apache License

/**
 * Indexes one document into an in-memory directory with a {@code "text"} field (the whitespace
 * tokens) and an {@code "annot"} field (the color annotation stream), then prints the terms of
 * the text field, the annotation spans stored as payloads, and the number of hits for two
 * "WITHIN color" span queries.
 *
 * @param args ignored
 * @throws Exception if indexing, reading or searching the index fails
 */
public static void main(String[] args) throws Exception {
    // try-with-resources releases the directory, writer and reader even when indexing or
    // searching throws; the close order (writer, reader, directory) matches the explicit calls.
    try (Directory dir = new RAMDirectory()) {
        IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());

        // the writer is closed at the end of this block, before the reader is opened
        try (IndexWriter writer = new IndexWriter(dir, conf)) {
            // we need to add the annotation as a TokenStream field, therefore cannot use an Analyzer passed in the
            // IndexWriterConfig.
            Tokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken"));
            TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
            TokenStream colorAnnotationStream = new AnnotatingTokenFilter(
                    textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);

            // "text" is added before "annot"; the annotation stream is fed by a sink of textStream
            Document doc = new Document();
            doc.add(new TextField("text", textStream));
            doc.add(new TextField("annot", colorAnnotationStream));
            writer.addDocument(doc);
        }

        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment
            printFieldTerms(ar, "text");
            System.out.println();

            // each position of the annotation term carries a payload read as two VInts: start, length
            final ByteArrayDataInput in = new ByteArrayDataInput();
            PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM));
            int docID = dape.nextDoc();
            int freq = dape.freq();
            System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq);
            for (int i = 0; i < freq; i++) {
                dape.nextPosition();
                BytesRef payload = dape.getPayload();
                in.reset(payload.bytes, payload.offset, payload.length);
                System.out.println("  start=" + in.readVInt() + ", length=" + in.readVInt());
            }

            IndexSearcher searcher = new IndexSearcher(reader);

            System.out.println("\nsearching for 'red WITHIN color':");
            Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
                    new SpanInclusivePositionTermQuery(new Term("text", "red")));
            TopDocs td = searcher.search(q, 10);
            System.out.println("  num results: " + td.scoreDocs.length);

            System.out.println("\nsearching for 'ate WITHIN color':");
            q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
                    new SpanInclusivePositionTermQuery(new Term("text", "ate")));
            td = searcher.search(q, 10);
            System.out.println("  num results: " + td.scoreDocs.length);
        }
    }
}

From source file:com.shaie.annots.example.AnnotatorAnyExample.java

License:Apache License

/**
 * Adds a single document for {@code text} to {@code writer}. The text is tokenized on
 * whitespace and indexed under {@code TEXT_FIELD}; two sinks of the same token stream are run
 * through the color and animal annotators, each wrapped in an {@code AnyAnnotationTokenFilter},
 * and indexed under {@code COLOR_FIELD} and {@code ANIMAL_FIELD}. The raw text is also stored.
 *
 * @param writer the index writer that receives the document
 * @param text the text to tokenize, annotate and store
 * @throws IOException if the writer fails to add the document
 */
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader(text));
    final TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);

    // color sink is created first, then the animal sink
    final AnnotatorTokenFilter colorAnnotations =
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), ColorAnnotator.withDefaultColors());
    final TokenStream anyColor = new AnyAnnotationTokenFilter(colorAnnotations);

    final AnnotatorTokenFilter animalAnnotations =
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals());
    final TokenStream anyAnimal = new AnyAnnotationTokenFilter(animalAnnotations);

    final Document document = new Document();
    document.add(new StoredField(TEXT_FIELD, text));
    document.add(new TextField(TEXT_FIELD, tee));
    document.add(new TextField(COLOR_FIELD, anyColor));
    document.add(new TextField(ANIMAL_FIELD, anyAnimal));

    writer.addDocument(document);
}

From source file:com.shaie.annots.example.AnnotatorTeeSinkFilterExample.java

License:Apache License

/**
 * Adds a single document for {@code text} to {@code writer}. The text is tokenized on
 * whitespace and indexed under {@code TEXT_FIELD}; two sinks of the same token stream are run
 * through the color and animal annotators and indexed under {@code COLOR_FIELD} and
 * {@code ANIMAL_FIELD}. The raw text is also stored under {@code TEXT_FIELD}.
 *
 * @param writer the index writer that receives the document
 * @param text the text to tokenize, annotate and store
 * @throws IOException if the writer fails to add the document
 */
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader(text));
    final TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);

    // color sink is created first, then the animal sink
    final TokenStream colorAnnotations =
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), ColorAnnotator.withDefaultColors());
    final TokenStream animalAnnotations =
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals());

    final Document document = new Document();
    document.add(new StoredField(TEXT_FIELD, text));
    document.add(new TextField(TEXT_FIELD, tee));
    document.add(new TextField(COLOR_FIELD, colorAnnotations));
    document.add(new TextField(ANIMAL_FIELD, animalAnnotations));

    writer.addDocument(document);
}

From source file:com.shaie.annots.example.PreAnnotatedTokenFilterExample.java

License:Apache License

/**
 * Adds a single document for {@code text} to {@code writer}. The text is tokenized on
 * whitespace and indexed under {@code TEXT_FIELD}; a sink of the same token stream is wrapped
 * in a {@code PreAnnotatedTokenFilter} built from {@code colorAnnotations} and indexed under
 * {@code COLOR_FIELD}. The raw text is also stored under {@code TEXT_FIELD}.
 *
 * @param writer the index writer that receives the document
 * @param text the text to tokenize and store
 * @param colorAnnotations values handed unchanged to {@code PreAnnotatedTokenFilter}
 * @throws IOException if the writer fails to add the document
 */
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text, int... colorAnnotations) throws IOException {
    final Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader(text));
    final TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);

    final TokenStream preAnnotatedColors =
            new PreAnnotatedTokenFilter(tee.newSinkTokenStream(), colorAnnotations);

    final Document document = new Document();
    document.add(new StoredField(TEXT_FIELD, text));
    document.add(new TextField(TEXT_FIELD, tee));
    document.add(new TextField(COLOR_FIELD, preAnnotatedColors));

    writer.addDocument(document);
}

From source file:com.shaie.annots.example.SimplePreAnnotatedTokenFilterExample.java

License:Apache License

/**
 * Adds a single document for {@code text} to {@code writer}. The text is tokenized on
 * whitespace and indexed under {@code TEXT_FIELD}; a sink of the same token stream is wrapped
 * in a {@code SimplePreAnnotatedTokenFilter} built from {@code colorAnnotations}, then in an
 * {@code AnyAnnotationTokenFilter}, and indexed under {@code COLOR_FIELD}. The raw text is also
 * stored under {@code TEXT_FIELD}.
 *
 * @param writer the index writer that receives the document
 * @param text the text to tokenize and store
 * @param colorAnnotations values handed unchanged to {@code SimplePreAnnotatedTokenFilter}
 * @throws IOException if the writer fails to add the document
 */
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text, int... colorAnnotations) throws IOException {
    final Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader(text));
    final TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);

    final SimplePreAnnotatedTokenFilter preAnnotatedColors =
            new SimplePreAnnotatedTokenFilter(tee.newSinkTokenStream(), colorAnnotations);
    final TokenStream anyColor = new AnyAnnotationTokenFilter(preAnnotatedColors);

    final Document document = new Document();
    document.add(new StoredField(TEXT_FIELD, text));
    document.add(new TextField(TEXT_FIELD, tee));
    document.add(new TextField(COLOR_FIELD, anyColor));

    writer.addDocument(document);
}

From source file:com.shaie.annots.filter.AnnotatorTokenFilterTest.java

License:Apache License

/**
 * Feeds {@code ONE} through the filter without stubbing the annotator and checks the filter
 * against an empty list of expected token infos.
 */
@Test
public void does_not_return_any_token_if_no_accepted_tokens() throws IOException {
    try (Tokenizer tokenizer = new WhitespaceTokenizer();
            TokenFilter filter = new AnnotatorTokenFilter(tokenizer, annotator)) {
        final StringReader input = new StringReader(ONE);
        tokenizer.setReader(input);

        // no expected TokenInfo arguments are passed
        assertTokenInfos(filter);
    }
}

From source file:com.shaie.annots.filter.AnnotatorTokenFilterTest.java

License:Apache License

/**
 * Stubs the annotator with {@code ONE}, feeds {@code ONE} through the filter, and expects a
 * single {@code TokenInfo(ONE, 0)}.
 */
@Test
public void returns_accepted_token() throws IOException {
    try (Tokenizer tokenizer = new WhitespaceTokenizer();
            TokenFilter filter = new AnnotatorTokenFilter(tokenizer, annotator)) {
        stubAnnotator(ONE);

        final StringReader input = new StringReader(ONE);
        tokenizer.setReader(input);

        final TokenInfo expected = new TokenInfo(ONE, 0);
        assertTokenInfos(filter, expected);
    }
}

From source file:com.shaie.annots.filter.AnnotatorTokenFilterTest.java

License:Apache License

/**
 * Stubs the annotator with {@code ONE} and {@code THREE}, feeds {@code ONE_TWO_THREE} through
 * the filter, and expects {@code TokenInfo(ONE, 0)} followed by {@code TokenInfo(THREE, 2)}.
 */
@Test
public void returns_all_accepted_tokens() throws IOException {
    try (Tokenizer tokenizer = new WhitespaceTokenizer();
            TokenFilter filter = new AnnotatorTokenFilter(tokenizer, annotator)) {
        stubAnnotator(ONE, THREE);

        final StringReader input = new StringReader(ONE_TWO_THREE);
        tokenizer.setReader(input);

        final TokenInfo first = new TokenInfo(ONE, 0);
        final TokenInfo second = new TokenInfo(THREE, 2);
        assertTokenInfos(filter, first, second);
    }
}

From source file:com.shaie.annots.filter.AnnotatorTokenFilterTest.java

License:Apache License

/**
 * Stubs the annotator with {@code ONE} and {@code TWO}, feeds {@code ONE_TWO} through the
 * filter, and expects {@code TokenInfo(ONE, 0)} followed by {@code TokenInfo(TWO, 1)}.
 */
@Test
public void returns_tokens_when_only_accepted_tokens() throws IOException {
    try (Tokenizer tokenizer = new WhitespaceTokenizer();
            TokenFilter filter = new AnnotatorTokenFilter(tokenizer, annotator)) {
        stubAnnotator(ONE, TWO);

        final StringReader input = new StringReader(ONE_TWO);
        tokenizer.setReader(input);

        final TokenInfo first = new TokenInfo(ONE, 0);
        final TokenInfo second = new TokenInfo(TWO, 1);
        assertTokenInfos(filter, first, second);
    }
}