List of usage examples for org.apache.lucene.analysis.TokenStream, constructor:
protected TokenStream()
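The examples below span several generations of the Lucene analysis API: the older ones override the Token-returning next() method, the newer ones the attribute-based incrementToken(). As a point of reference before the per-project examples, here is a minimal sketch of a subclass built on the protected TokenStream() constructor using the modern attribute API; the class name and token value are illustrative and not taken from any of the source files below.

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class SingleTokenStream extends TokenStream {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final String value;
    private boolean done;

    public SingleTokenStream(String value) {
        // the implicit super() call here invokes the protected TokenStream()
        this.value = value;
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (done) {
            return false;
        }
        clearAttributes();
        termAtt.setEmpty().append(value);
        done = true;
        return true;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        done = false; // allow the stream to be consumed again after reset()
    }
}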
From source file: axiom.objectmodel.dom.ReferenceAnalyzer.java
License: Open Source License

public TokenStream tokenStream(String fieldName, final Reader reader) {
    return new TokenStream() {
        private boolean done = false;
        private static final String DELIM = LuceneManager.NULL_DELIM;

        public Token next() throws IOException {
            if (!done) {
                done = true;
                final char[] buffer = new char[512];
                StringBuffer sb = new StringBuffer();
                int length = 0;
                while ((sb.indexOf(DELIM) < 0) && (length = reader.read(buffer)) != -1) {
                    sb.append(buffer, 0, length);
                }
                final String value = sb.toString();
                final int index = value.indexOf(DELIM);
                if (index < 0) {
                    return null;
                } else {
                    final String text = value.substring(0, index);
                    return new Token(text, 0, text.length());
                }
            }
            return null;
        }
    };
}
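This example uses the pre-3.0 Token-based API, in which a consumer pulls tokens by calling next() until it returns null. A hedged sketch of how such a stream would be consumed under that same old API (the analyzer instance, field name, and input are placeholders, not from the Axiom source):

Analyzer analyzer = new ReferenceAnalyzer();
TokenStream ts = analyzer.tokenStream("ref", new StringReader(rawValue));
Token tok;
while ((tok = ts.next()) != null) {
    // with this analyzer at most one token arrives: the text before NULL_DELIM
    System.out.println(tok.termText());
}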
From source file: com.duroty.lucene.analysis.KeywordAnalyzer.java
License: Apache License

/**
 * Returns a stream that emits the entire contents of the reader as a single token.
 *
 * @param fieldName the field name (ignored; every field is treated the same way)
 * @param reader the reader holding the field value
 *
 * @return a TokenStream that produces exactly one token and then signals end of stream
 */
public TokenStream tokenStream(String fieldName, final Reader reader) {
    return new TokenStream() {
        private boolean done;
        private final char[] buffer = new char[1024];

        public Token next() throws IOException {
            if (!done) {
                done = true;
                StringBuffer sb = new StringBuffer();
                int length = 0;
                while (true) {
                    length = reader.read(this.buffer);
                    if (length == -1) {
                        break;
                    }
                    sb.append(this.buffer, 0, length);
                }
                String text = sb.toString();
                return new Token(text, 0, text.length());
            }
            return null;
        }
    };
}
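Modern Lucene ships an equivalent single-token analyzer out of the box. For comparison, a sketch using org.apache.lucene.analysis.core.KeywordAnalyzer with the attribute-based consumption idiom (this assumes a 4.x+ classpath with lucene-analyzers-common; the field name and input string are illustrative):

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

try (KeywordAnalyzer analyzer = new KeywordAnalyzer();
        TokenStream ts = analyzer.tokenStream("id", "ABC-123 rev 7")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString()); // the whole input arrives as one token
    }
    ts.end();
}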
From source file: com.shaie.PhraseVsSpanQuery.java
License: Apache License

@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Directory dir = new RAMDirectory();
    final IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());
    final IndexWriter writer = new IndexWriter(dir, conf);

    // Index one document whose field "f" holds two tokens at the same position:
    // "a" at position 0, then "b" with a position increment of 0.
    final Document doc = new Document();
    doc.add(new TextField("f", new TokenStream() {
        final PositionIncrementAttribute pos = addAttribute(PositionIncrementAttribute.class);
        final CharTermAttribute term = addAttribute(CharTermAttribute.class);
        boolean first = true, done = false;

        @Override
        public boolean incrementToken() throws IOException {
            if (done) {
                return false;
            }
            if (first) {
                term.setEmpty().append("a");
                pos.setPositionIncrement(1);
                first = false;
            } else {
                term.setEmpty().append("b");
                pos.setPositionIncrement(0);
                done = true;
            }
            return true;
        }
    }));
    writer.addDocument(doc);
    writer.close();

    final DirectoryReader reader = DirectoryReader.open(dir);
    final IndexSearcher searcher = new IndexSearcher(reader);
    final LeafReader ar = reader.leaves().get(0).reader();
    final TermsEnum te = ar.terms("f").iterator();
    BytesRef scratch = new BytesRef();
    while ((scratch = te.next()) != null) {
        System.out.println(scratch.utf8ToString());
        final PostingsEnum dape = ar.postings(new Term("f", scratch.utf8ToString()));
        System.out.println(" doc=" + dape.nextDoc() + ", pos=" + dape.nextPosition());
    }
    System.out.println();

    // try a phrase query with a slop
    final PhraseQuery pqNoSlop = buildPhraseQuery(0);
    System.out.println("searching for \"a b\"; num results = " + searcher.search(pqNoSlop, 10).totalHits);

    final PhraseQuery pqSlop1 = buildPhraseQuery(1);
    System.out.println("searching for \"a b\"~1; num results = " + searcher.search(pqSlop1, 10).totalHits);

    final PhraseQuery pqSlop3 = buildPhraseQuery(3);
    System.out.println("searching for \"a b\"~3; num results = " + searcher.search(pqSlop3, 10).totalHits);

    final SpanNearQuery snqUnOrdered = new SpanNearQuery(
            new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) },
            1, false);
    System.out.println("searching for SpanNearUnordered('a', 'b'), slop=1; num results = "
            + searcher.search(snqUnOrdered, 10).totalHits);

    final SpanNearQuery snqOrdered = new SpanNearQuery(
            new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) },
            1, true);
    System.out.println("searching for SpanNearOrdered('a', 'b'), slop=1; num results = "
            + searcher.search(snqOrdered, 10).totalHits);

    reader.close();
}
From source file: org.apache.solr.analysis.TestRemoveDuplicatesTokenFilter.java
License: Apache License

public void testDups(final String expected, final Token... tokens) throws Exception {
    final Iterator<Token> toks = Arrays.asList(tokens).iterator();
    final TokenStream ts = new RemoveDuplicatesTokenFilter(new TokenStream() {
        public Token next() {
            return toks.hasNext() ? toks.next() : null;
        }
    });
    final String actual = TestBufferedTokenStream.tsToString(ts);
    assertEquals(expected + " != " + actual, expected, actual);
}
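The tsToString(...) helper belongs to TestBufferedTokenStream and is not shown here. Under the same Token-based API it would plausibly join the term texts with spaces, along these lines (a hypothetical reconstruction, not the actual Solr source):

static String tsToString(TokenStream ts) throws IOException {
    StringBuilder sb = new StringBuilder();
    Token tok;
    while ((tok = ts.next()) != null) {
        if (sb.length() > 0) {
            sb.append(' ');
        }
        sb.append(tok.termText());
    }
    return sb.toString();
}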
From source file: org.apache.solr.analysis.TestRemoveDuplicatesTokenFilterFactory.java
License: Apache License

public void testDups(final String expected, final Token... tokens) throws Exception {
    final Iterator<Token> toks = Arrays.asList(tokens).iterator();
    RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory();
    final TokenStream ts = factory.create(new TokenStream() {
        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

        public boolean incrementToken() {
            if (toks.hasNext()) {
                clearAttributes();
                Token tok = toks.next();
                termAtt.setEmpty().append(tok);
                offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
                posIncAtt.setPositionIncrement(tok.getPositionIncrement());
                return true;
            } else {
                return false;
            }
        }
    });
    assertTokenStreamContents(ts, expected.split("\\s"));
}
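A hedged example of invoking this testDups(...): RemoveDuplicatesTokenFilter drops a token whose term repeats at the same position, which is simulated here by giving a second "time" a position increment of 0 (the concrete tokens are illustrative, not taken from the Solr test):

Token dup = new Token("time", 11, 15);
dup.setPositionIncrement(0); // stacked on top of the previous "time"
testDups("now is the time",
        new Token("now", 0, 3),
        new Token("is", 4, 6),
        new Token("the", 7, 10),
        new Token("time", 11, 15),
        dup);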
From source file: org.apache.solr.analysis.TestSynonymFilter.java
License: Apache License

public List<Token> getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
    ArrayList<Token> lst = new ArrayList<Token>();
    final List toks = tokens(input);
    TokenStream ts = new TokenStream() {
        Iterator iter = toks.iterator();

        @Override
        public Token next() throws IOException {
            return iter.hasNext() ? (Token) iter.next() : null;
        }
    };
    SynonymFilter sf = new SynonymFilter(ts, dict);

    // test with token reuse
    Token target = new Token();
    while (true) {
        Token t = sf.next(target);
        if (t == null)
            return lst;
        lst.add((Token) t.clone());
    }
}
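The tokens(input) helper is defined elsewhere in this test class. One plausible minimal stand-in splits on whitespace and assigns running offsets; the real Solr helper may support a richer token syntax, so treat this purely as a sketch that would live in the same test class as getTokList:

private List<Token> tokens(String input) {
    List<Token> result = new ArrayList<Token>();
    int offset = 0;
    for (String word : input.split("\\s+")) {
        if (word.isEmpty()) {
            continue; // skip the empty fragment produced by leading whitespace
        }
        int start = input.indexOf(word, offset);
        result.add(new Token(word, start, start + word.length()));
        offset = start + word.length();
    }
    return result;
}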
From source file: org.apache.solr.analysis.TestWordDelimiterFilter.java
License: Apache License

/***
public void testPerformance() throws IOException {
    String s = "now is the time-for all good men to come to-the aid of their country.";
    Token tok = new Token();
    long start = System.currentTimeMillis();
    int ret = 0;
    for (int i = 0; i < 1000000; i++) {
        StringReader r = new StringReader(s);
        TokenStream ts = new WhitespaceTokenizer(r);
        ts = new WordDelimiterFilter(ts, 1, 1, 1, 1, 0);
        while (ts.next(tok) != null) ret++;
    }
    System.out.println("ret=" + ret + " time=" + (System.currentTimeMillis() - start));
}
***/

public void testOffsets() throws IOException {
    // test that subwords and catenated subwords have
    // the correct offsets.
    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
        Token t;

        public Token next() throws IOException {
            if (t != null)
                return null;
            t = new Token("foo-bar", 5, 12); // actual
            return t;
        }
    }, 1, 1, 0, 0, 1, 1, 0);

    int i = 0;
    for (Token t; (t = wdf.next()) != null;) {
        String termText = new String(t.termBuffer(), 0, t.termLength());
        if (termText.equals("foo")) {
            assertEquals(5, t.startOffset());
            assertEquals(8, t.endOffset());
            i++;
        }
        if (termText.equals("bar")) {
            assertEquals(9, t.startOffset());
            assertEquals(12, t.endOffset());
            i++;
        }
        if (termText.equals("foobar")) {
            assertEquals(5, t.startOffset());
            assertEquals(12, t.endOffset());
            i++;
        }
    }
    assertEquals(3, i); // make sure all 3 tokens were generated

    // test that if splitting or catenating a synonym, that the offsets
    // are not altered (they would be incorrect).
    wdf = new WordDelimiterFilter(new TokenStream() {
        Token t;

        public Token next() throws IOException {
            if (t != null)
                return null;
            t = new Token("foo-bar", 5, 6); // a synonym
            return t;
        }
    }, 1, 1, 0, 0, 1, 1, 0);
    for (Token t; (t = wdf.next()) != null;) {
        assertEquals(5, t.startOffset());
        assertEquals(6, t.endOffset());
    }
}
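The seven trailing int arguments to this era's WordDelimiterFilter constructor are on/off flags. Mapping them to names from the Solr source of that period (a hedged reading; check the matching Solr release if the exact order matters):

// new WordDelimiterFilter(input,
//         1,  // generateWordParts:   emit "foo" and "bar" from "foo-bar"
//         1,  // generateNumberParts
//         0,  // catenateWords
//         0,  // catenateNumbers
//         1,  // catenateAll:         also emit "foobar"
//         1,  // splitOnCaseChange
//         0); // preserveOriginal:    do not also emit "foo-bar" itself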
From source file: org.apache.solr.analysis.TestWordDelimiterFilter.java
License: Apache License

public void testOffsetChange() throws Exception {
    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
        Token t;

        public Token next() {
            if (t != null)
                return null;
            t = new Token("übelkeit)", 7, 16);
            return t;
        }
    }, 1, 1, 0, 0, 1, 1, 0);

    Token t = wdf.next();
    assertNotNull(t);
    assertEquals("übelkeit", t.term());
    assertEquals(7, t.startOffset());
    assertEquals(15, t.endOffset());
}
From source file: org.apache.solr.analysis.TestWordDelimiterFilter.java
License: Apache License

public void testOffsetChange2() throws Exception {
    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
        Token t;

        public Token next() {
            if (t != null)
                return null;
            t = new Token("(übelkeit", 7, 17);
            return t;
        }
    }, 1, 1, 0, 0, 1, 1, 0);

    Token t = wdf.next();
    assertNotNull(t);
    assertEquals("übelkeit", t.term());
    assertEquals(8, t.startOffset());
    assertEquals(17, t.endOffset());
}
From source file: org.apache.solr.analysis.TestWordDelimiterFilter.java
License: Apache License

public void testOffsetChange3() throws Exception {
    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
        Token t;

        public Token next() {
            if (t != null)
                return null;
            t = new Token("(übelkeit", 7, 16);
            return t;
        }
    }, 1, 1, 0, 0, 1, 1, 0);

    Token t = wdf.next();
    assertNotNull(t);
    assertEquals("übelkeit", t.term());
    assertEquals(8, t.startOffset());
    assertEquals(16, t.endOffset());
}
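These three tests exercise one behavior: when WordDelimiterFilter strips a leading or trailing delimiter character, it moves the corresponding offset inward by the number of characters stripped, leaving the other boundary untouched (in testOffsetChange only the end moves, in testOffsetChange2 and testOffsetChange3 only the start). With the modern flag-based API, the same check could be written roughly as follows; this is a sketch assuming lucene-test-framework (CannedTokenStream, assertTokenStreamContents) and the org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter of Lucene 4.x/5.x:

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;

TokenStream canned = new CannedTokenStream(new Token("(übelkeit", 7, 16));
TokenStream wdf = new WordDelimiterFilter(canned,
        WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS, null);
assertTokenStreamContents(wdf,
        new String[] { "übelkeit" },
        new int[] { 8 },    // startOffset shifts right past the stripped '('
        new int[] { 16 });  // endOffset unchanged: nothing was stripped from the end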