List of usage examples for org.apache.lucene.analysis.Tokenizer#setReader
public final void setReader(Reader input)
From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License:Apache License
/**
 * Runs one tokenizer over two inputs in sequence (the second supplied through
 * {@code setReader}) and compares the string returned by {@code tokenizerToString}
 * with the expected value for each input.
 *
 * <p>NOTE(review): the input sentences and the term text in the expected strings look like
 * they lost their non-ASCII characters (bare {@code ?} and empty terms) — confirm against the
 * upstream test before relying on these literals.
 */
@Test
public void testShortSentence() throws Exception {
    final String expectedFirst = ":N:NNG:null:1:1:0:1,:N:NNG:null:1:1:1:3,"
            + ":N:NNG:null:1:1:4:5,?:COMPOUND:Compound:null:0:2:4:7,"
            + "?:N:NNG:null:1:1:5:7,?:N:NNG:null:1:1:8:12,";
    final String expectedSecond = ":N:NNG:null:1:1:0:2,?:N:NNG:null:1:1:3:5,"
            + "?:COMPOUND:Compound:null:0:2:3:6,:N:NNG:null:1:1:5:6,"
            + "?:EOJEOL:NNG+JKS:null:1:1:6:8,:N:NNG:null:0:1:6:7,"
            + ":EOJEOL:VV+EP+EF:null:1:1:9:14,";

    // First pass: tokenizer built by the test helper with the argument 2.
    Tokenizer tokenizer = createTokenizer(new StringReader(" ? ?"), 2);
    assertEquals(expectedFirst, tokenizerToString(tokenizer));

    // Second pass: reuse the same tokenizer instance with a new reader.
    tokenizer.reset();
    tokenizer.setReader(new StringReader(" ?? ."));
    assertEquals(expectedSecond, tokenizerToString(tokenizer));

    tokenizer.close();
}
From source file:com.grantingersoll.opengrok.analysis.TestSymbolTokenizerFactories.java
License:Open Source License
/**
 * Builds a tokenizer from {@code factory}, points it at {@code input}, and asserts that the
 * token stream it produces matches {@code output}.
 *
 * @param factory factory used to create the tokenizer under test
 * @param input   text handed to the tokenizer through {@code setReader}
 * @param output  expected contents passed on to {@code assertTokenStreamContents}
 */
private void assertTokenization(SymbolTokenizerFactory factory, String input, String[] output)
        throws Exception {
    final Reader source = new StringReader(input);
    final Tokenizer tokenizer = factory.create(newAttributeFactory());
    tokenizer.setReader(source);
    assertTokenStreamContents(tokenizer, output);
}
From source file:com.shaie.annots.AnnotatingTokenStreamExample.java
License:Apache License
public static void main(String[] args) throws Exception { String text = "quick brown fox ate the blue red chicken"; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer); TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);/*from ww w . j av a 2s.c om*/ System.out.println("Text tokens:\n"); // consume all the tokens from the original stream. this also populates the // Sink (colors) with its color-matching tokens teeSink.reset(); CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class); PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class); int termsPos = -1; while (teeSink.incrementToken()) { termsPos += termPosAtt.getPositionIncrement(); System.out.println("term=" + termAtt + ", pos=" + termsPos); } teeSink.end(); tokenizer.end(); System.out.println("\nAnnotation tokens:\n"); // now consume the color annotation tokens from the colors stream CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class); PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class); ByteArrayDataInput in = new ByteArrayDataInput(); colors.reset(); while (colors.incrementToken()) { BytesRef bytes = payloadAtt.getPayload(); in.reset(bytes.bytes, bytes.offset, bytes.length); System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt()); } colors.end(); colors.close(); teeSink.close(); tokenizer.close(); }
From source file:com.shaie.annots.AnnotationSearchExample.java
License:Apache License
public static void main(String[] args) throws Exception { Directory dir = new RAMDirectory(); IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer()); IndexWriter writer = new IndexWriter(dir, conf); // we need to add the annotation as a TokenStream field, therefore cannot use an Analyzer passed in the // IndexWriterConfig. Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken")); TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); TokenStream colorAnnotationStream = new AnnotatingTokenFilter( textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM); Document doc = new Document(); doc.add(new TextField("text", textStream)); doc.add(new TextField("annot", colorAnnotationStream)); writer.addDocument(doc);// ww w. ja v a2s . c o m writer.close(); DirectoryReader reader = DirectoryReader.open(dir); LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment printFieldTerms(ar, "text"); System.out.println(); final ByteArrayDataInput in = new ByteArrayDataInput(); PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM)); int docID = dape.nextDoc(); int freq = dape.freq(); System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq); for (int i = 0; i < freq; i++) { dape.nextPosition(); BytesRef payload = dape.getPayload(); in.reset(payload.bytes, payload.offset, payload.length); System.out.println(" start=" + in.readVInt() + ", length=" + in.readVInt()); } IndexSearcher searcher = new IndexSearcher(reader); System.out.println("\nsearching for 'red WITHIN color':"); Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)), new SpanInclusivePositionTermQuery(new Term("text", "red"))); TopDocs td = searcher.search(q, 10); System.out.println(" num results: " + td.scoreDocs.length); System.out.println("\nsearching for 'ate WITHIN color':"); q = new SpanWithinQuery(new 
SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)), new SpanInclusivePositionTermQuery(new Term("text", "ate"))); td = searcher.search(q, 10); System.out.println(" num results: " + td.scoreDocs.length); reader.close(); dir.close(); }
From source file:com.shaie.annots.example.AnnotatorAnyExample.java
License:Apache License
@SuppressWarnings("resource") private static void addDocument(IndexWriter writer, String text) throws IOException { final Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); final TokenStream colorsStream = new AnyAnnotationTokenFilter( new AnnotatorTokenFilter(textStream.newSinkTokenStream(), ColorAnnotator.withDefaultColors())); final TokenStream animalsStream = new AnyAnnotationTokenFilter( new AnnotatorTokenFilter(textStream.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals())); final Document doc = new Document(); doc.add(new StoredField(TEXT_FIELD, text)); doc.add(new TextField(TEXT_FIELD, textStream)); doc.add(new TextField(COLOR_FIELD, colorsStream)); doc.add(new TextField(ANIMAL_FIELD, animalsStream)); writer.addDocument(doc);// ww w . ja v a 2 s . co m }
From source file:com.shaie.annots.example.AnnotatorTeeSinkFilterExample.java
License:Apache License
@SuppressWarnings("resource") private static void addDocument(IndexWriter writer, String text) throws IOException { final Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); final TokenStream colorsStream = new AnnotatorTokenFilter(textStream.newSinkTokenStream(), ColorAnnotator.withDefaultColors()); final TokenStream animalsStream = new AnnotatorTokenFilter(textStream.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals()); final Document doc = new Document(); doc.add(new StoredField(TEXT_FIELD, text)); doc.add(new TextField(TEXT_FIELD, textStream)); doc.add(new TextField(COLOR_FIELD, colorsStream)); doc.add(new TextField(ANIMAL_FIELD, animalsStream)); writer.addDocument(doc);//from ww w .ja v a2s. com }
From source file:com.shaie.annots.example.PreAnnotatedTokenFilterExample.java
License:Apache License
@SuppressWarnings("resource") private static void addDocument(IndexWriter writer, String text, int... colorAnnotations) throws IOException { final Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer); final TokenStream colorsStream = new PreAnnotatedTokenFilter(textStream.newSinkTokenStream(), colorAnnotations);//from w ww .j a v a2 s .c o m final Document doc = new Document(); doc.add(new StoredField(TEXT_FIELD, text)); doc.add(new TextField(TEXT_FIELD, textStream)); doc.add(new TextField(COLOR_FIELD, colorsStream)); writer.addDocument(doc); }
From source file:com.shaie.annots.example.SimplePreAnnotatedTokenFilterExample.java
License:Apache License
/**
 * Adds one document to {@code writer}: the raw text as a stored field, the whitespace-tokenized
 * text as an indexed field, and a color field fed from a sink stream of the same tokenizer
 * wrapped first in a {@code SimplePreAnnotatedTokenFilter} (configured with
 * {@code colorAnnotations}) and then in an {@code AnyAnnotationTokenFilter}.
 *
 * <p>The streams are intentionally left open here (hence the suppressed resource warning);
 * they are handed to the writer through the document's fields.
 *
 * @param writer           index writer receiving the document
 * @param text             text to tokenize and store
 * @param colorAnnotations values forwarded unchanged to {@code SimplePreAnnotatedTokenFilter}
 * @throws IOException if the writer fails to add the document
 */
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text, int... colorAnnotations)
        throws IOException {
    Tokenizer whitespace = new WhitespaceTokenizer();
    whitespace.setReader(new StringReader(text));
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(whitespace);

    SimplePreAnnotatedTokenFilter preAnnotated =
            new SimplePreAnnotatedTokenFilter(tee.newSinkTokenStream(), colorAnnotations);
    TokenStream colors = new AnyAnnotationTokenFilter(preAnnotated);

    Document document = new Document();
    document.add(new StoredField(TEXT_FIELD, text));
    document.add(new TextField(TEXT_FIELD, tee));
    document.add(new TextField(COLOR_FIELD, colors));
    writer.addDocument(document);
}
From source file:com.shaie.SynonymFilterExample.java
License:Apache License
/**
 * Registers four phrases as synonyms of "color", runs a sample sentence through a whitespace
 * tokenizer followed by a {@code SynonymGraphFilter}, and prints each emitted term with its
 * accumulated position and its position length.
 *
 * @param args ignored
 * @throws Exception if building the synonym map or consuming the stream fails
 */
@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("dark sea green sea green"));

    // Every phrase below maps to the same output term; registration order is kept as listed.
    final String[] phrases = {"dark sea green", "green", "dark sea", "sea green"};
    final SynonymMap.Builder mapBuilder = new SynonymMap.Builder(true);
    for (String phrase : phrases) {
        addSynonym(phrase, "color", mapBuilder);
    }
    final SynonymMap synonyms = mapBuilder.build();

    final TokenStream stream = new SynonymGraphFilter(tokenizer, synonyms, true);
    final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute length = stream.addAttribute(PositionLengthAttribute.class);

    stream.reset();
    int position = -1;
    while (stream.incrementToken()) {
        position += increment.getPositionIncrement();
        final String line = "term=" + term + ", pos=" + position + ", posLen=" + length.getPositionLength();
        System.out.println(line);
    }
    stream.end();
    stream.close();
}
From source file:com.sindicetech.siren.analysis.filter.TestMailtoFilter.java
License:Open Source License
private void assertURLDecodedTo(final Tokenizer t, final String uri, final String[] expectedStems, final String[] expectedTypes, final int[] expectedPosIncr) throws IOException { assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class)); final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class)); final TypeAttribute typeAtt = t.getAttribute(TypeAttribute.class); assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class)); final PositionIncrementAttribute posIncrAtt = t.getAttribute(PositionIncrementAttribute.class); t.setReader(new StringReader(uri)); t.reset();//from w w w .j ava2 s . c o m final TokenFilter filter = new MailtoFilter(t); for (int i = 0; i < expectedStems.length; i++) { assertTrue("token " + i + " exists", filter.incrementToken()); assertEquals(expectedStems[i], termAtt.toString()); if (expectedTypes == null) assertEquals(uritype, typeAtt.type()); else assertEquals(expectedTypes[i], typeAtt.type()); if (expectedPosIncr != null) assertEquals(expectedPosIncr[i], posIncrAtt.getPositionIncrement()); } filter.end(); filter.close(); }