Usage examples for the constructor of org.apache.lucene.analysis.sinks.TeeSinkTokenFilter:
public TeeSinkTokenFilter(TokenStream input)
From source file:com.shaie.annots.AnnotatingTokenStreamExample.java
License:Apache License
public static void main(String[] args) throws Exception { String text = "quick brown fox ate the blue red chicken"; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer); TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);//from w w w. j a v a 2 s . c om System.out.println("Text tokens:\n"); // consume all the tokens from the original stream. this also populates the // Sink (colors) with its color-matching tokens teeSink.reset(); CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class); PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class); int termsPos = -1; while (teeSink.incrementToken()) { termsPos += termPosAtt.getPositionIncrement(); System.out.println("term=" + termAtt + ", pos=" + termsPos); } teeSink.end(); tokenizer.end(); System.out.println("\nAnnotation tokens:\n"); // now consume the color annotation tokens from the colors stream CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class); PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class); ByteArrayDataInput in = new ByteArrayDataInput(); colors.reset(); while (colors.incrementToken()) { BytesRef bytes = payloadAtt.getPayload(); in.reset(bytes.bytes, bytes.offset, bytes.length); System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt()); } colors.end(); colors.close(); teeSink.close(); tokenizer.close(); }
From source file:com.shaie.annots.AnnotationSearchExample.java
License:Apache License
public static void main(String[] args) throws Exception { Directory dir = new RAMDirectory(); IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer()); IndexWriter writer = new IndexWriter(dir, conf); // we need to add the annotation as a TokenStream field, therefore cannot use an Analyzer passed in the // IndexWriterConfig. Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken")); TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); TokenStream colorAnnotationStream = new AnnotatingTokenFilter( textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM); Document doc = new Document(); doc.add(new TextField("text", textStream)); doc.add(new TextField("annot", colorAnnotationStream)); writer.addDocument(doc);//w w w.j a v a2 s . c o m writer.close(); DirectoryReader reader = DirectoryReader.open(dir); LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment printFieldTerms(ar, "text"); System.out.println(); final ByteArrayDataInput in = new ByteArrayDataInput(); PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM)); int docID = dape.nextDoc(); int freq = dape.freq(); System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq); for (int i = 0; i < freq; i++) { dape.nextPosition(); BytesRef payload = dape.getPayload(); in.reset(payload.bytes, payload.offset, payload.length); System.out.println(" start=" + in.readVInt() + ", length=" + in.readVInt()); } IndexSearcher searcher = new IndexSearcher(reader); System.out.println("\nsearching for 'red WITHIN color':"); Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)), new SpanInclusivePositionTermQuery(new Term("text", "red"))); TopDocs td = searcher.search(q, 10); System.out.println(" num results: " + td.scoreDocs.length); System.out.println("\nsearching for 'ate WITHIN color':"); q = new SpanWithinQuery(new 
SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)), new SpanInclusivePositionTermQuery(new Term("text", "ate"))); td = searcher.search(q, 10); System.out.println(" num results: " + td.scoreDocs.length); reader.close(); dir.close(); }
From source file:com.shaie.annots.example.AnnotatorAnyExample.java
License:Apache License
@SuppressWarnings("resource") private static void addDocument(IndexWriter writer, String text) throws IOException { final Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); final TokenStream colorsStream = new AnyAnnotationTokenFilter( new AnnotatorTokenFilter(textStream.newSinkTokenStream(), ColorAnnotator.withDefaultColors())); final TokenStream animalsStream = new AnyAnnotationTokenFilter( new AnnotatorTokenFilter(textStream.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals())); final Document doc = new Document(); doc.add(new StoredField(TEXT_FIELD, text)); doc.add(new TextField(TEXT_FIELD, textStream)); doc.add(new TextField(COLOR_FIELD, colorsStream)); doc.add(new TextField(ANIMAL_FIELD, animalsStream)); writer.addDocument(doc);//from w w w . j a v a2 s . c o m }
From source file:com.shaie.annots.example.AnnotatorTeeSinkFilterExample.java
License:Apache License
/**
 * Adds one document to {@code writer} built from {@code text}: the raw text as a stored field,
 * the whitespace-tokenized text as an indexed field, and two further indexed fields fed by
 * sinks of the same token stream, one passed through the color annotator and one through the
 * animal annotator.
 *
 * @param writer index writer that receives the document
 * @param text   text to tokenize and index
 * @throws IOException if adding the document fails
 */
@SuppressWarnings("resource") // none of the streams created here are closed by this method
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer whitespace = new WhitespaceTokenizer();
    whitespace.setReader(new StringReader(text));

    final TeeSinkTokenFilter tee = new TeeSinkTokenFilter(whitespace);
    final TokenStream colorTokens =
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), ColorAnnotator.withDefaultColors());
    final TokenStream animalTokens =
            new AnnotatorTokenFilter(tee.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals());

    // field order is kept as-is: the tee field comes before the fields backed by its sinks
    final Document document = new Document();
    document.add(new StoredField(TEXT_FIELD, text));
    document.add(new TextField(TEXT_FIELD, tee));
    document.add(new TextField(COLOR_FIELD, colorTokens));
    document.add(new TextField(ANIMAL_FIELD, animalTokens));

    writer.addDocument(document);
}
From source file:com.shaie.annots.example.PreAnnotatedTokenFilterExample.java
License:Apache License
@SuppressWarnings("resource") private static void addDocument(IndexWriter writer, String text, int... colorAnnotations) throws IOException { final Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); final TokenStream colorsStream = new PreAnnotatedTokenFilter(textStream.newSinkTokenStream(), colorAnnotations);//from w ww. j a v a2 s. c o m final Document doc = new Document(); doc.add(new StoredField(TEXT_FIELD, text)); doc.add(new TextField(TEXT_FIELD, textStream)); doc.add(new TextField(COLOR_FIELD, colorsStream)); writer.addDocument(doc); }
From source file:com.shaie.annots.example.SimplePreAnnotatedTokenFilterExample.java
License:Apache License
@SuppressWarnings("resource") private static void addDocument(IndexWriter writer, String text, int... colorAnnotations) throws IOException { final Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); final TokenStream colorsStream = new AnyAnnotationTokenFilter( new SimplePreAnnotatedTokenFilter(textStream.newSinkTokenStream(), colorAnnotations)); final Document doc = new Document(); doc.add(new StoredField(TEXT_FIELD, text)); doc.add(new TextField(TEXT_FIELD, textStream)); doc.add(new TextField(COLOR_FIELD, colorsStream)); writer.addDocument(doc);// w ww . j a v a2s .c o m }