List of usage examples for the reset() method of org.apache.lucene.analysis.TokenStream
public void reset() throws IOException
From source file:com.romeikat.datamessie.core.base.util.ParseUtil.java
License:Open Source License
public List<String> parseTerms(final String text, final Analyzer analyzer) { final List<String> terms = new LinkedList<String>(); try {// ww w . ja v a2 s .c om final TokenStream tokenStream = analyzer.tokenStream(null, text); tokenStream.reset(); final Attribute attribute = tokenStream.getAttribute(CharTermAttribute.class); while (tokenStream.incrementToken()) { final String term = attribute.toString(); terms.add(term); } tokenStream.end(); tokenStream.close(); } catch (final IOException e) { // Cannot be thrown due to usage of a StringReader } return terms; }
From source file:com.scaleunlimited.classify.analyzer.LuceneAnalyzer.java
License:Apache License
/** * @param contentText input text to be parsed into terms * @return salient terms in order of appearance * (or null if this content should be ignored) *//*from w w w. ja va2 s .c om*/ public List<String> getTermList(String contentText) { init(); List<String> result = new ArrayList<String>(contentText.length() / 10); try { TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText)); CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { if (termAtt.length() > 0) { String term = termAtt.toString(); // Here we skip runs of position increment markers created // by the ShingleFilter for stop words because they skew // the clustering/liblinear analysis. if (!term.matches("(_ )*_")) { result.add(term); } } } stream.end(); stream.close(); } catch (IOException e) { throw new RuntimeException("Impossible error", e); } return result; }
From source file:com.searchbox.SuggeterDataStructureBuilder.java
License:Apache License
private String[] getTokens(String fulltext) { LinkedList<String> tokens = new LinkedList<String>(); try {// w w w . j av a 2s . c om TokenStream tokenStream = analyzer.tokenStream(fields[0], new StringReader(fulltext)); tokenStream.reset(); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); while (tokenStream.incrementToken()) { String token = charTermAttribute.toString(); tokens.add(token); } } catch (IOException ex) { LOGGER.error("Failure reading tokens from stream", ex); } return tokens.toArray(new String[0]); }
From source file:com.searchcode.app.util.CodeAnalyzer.java
License:Open Source License
public static void main(String[] args) throws IOException { // text to tokenize final String text = "This is a demo of the TokenStream API"; CodeAnalyzer analyzer = new CodeAnalyzer(); TokenStream stream = analyzer.tokenStream("field", new StringReader(text)); // get the CharTermAttribute from the TokenStream CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); try {//ww w . ja v a 2s. c o m stream.reset(); // print all tokens until stream is exhausted while (stream.incrementToken()) { System.out.println(termAtt.toString()); } stream.end(); } finally { stream.close(); } }
From source file:com.shaie.annots.AnnotatingTokenStreamExample.java
License:Apache License
public static void main(String[] args) throws Exception { String text = "quick brown fox ate the blue red chicken"; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer); TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);//ww w .j a v a2s .c o m System.out.println("Text tokens:\n"); // consume all the tokens from the original stream. this also populates the // Sink (colors) with its color-matching tokens teeSink.reset(); CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class); PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class); int termsPos = -1; while (teeSink.incrementToken()) { termsPos += termPosAtt.getPositionIncrement(); System.out.println("term=" + termAtt + ", pos=" + termsPos); } teeSink.end(); tokenizer.end(); System.out.println("\nAnnotation tokens:\n"); // now consume the color annotation tokens from the colors stream CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class); PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class); ByteArrayDataInput in = new ByteArrayDataInput(); colors.reset(); while (colors.incrementToken()) { BytesRef bytes = payloadAtt.getPayload(); in.reset(bytes.bytes, bytes.offset, bytes.length); System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt()); } colors.end(); colors.close(); teeSink.close(); tokenizer.close(); }
From source file:com.shaie.annots.filter.AnnotatorTokenFilterTest.java
License:Apache License
/**
 * Asserts that the stream yields exactly the expected tokens, comparing each token's
 * term text and accumulated position, and that no token follows the last expected one.
 *
 * @param ts the stream to consume; it is reset here but neither ended nor closed
 * @param infos the expected tokens, in order
 * @throws IOException if the stream fails while being consumed
 */
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute incrementAtt = ts.addAttribute(PositionIncrementAttribute.class);

    // Positions accumulate from -1 so that a first increment of 1 yields position 0.
    int position = -1;
    for (int i = 0; i < infos.length; i++) {
        assertThat(ts.incrementToken()).isTrue();
        position += incrementAtt.getPositionIncrement();
        final TokenInfo actual = new TokenInfo(termAtt.toString(), position);
        assertThat(actual).isEqualTo(infos[i]);
    }

    assertThat(ts.incrementToken()).isFalse();
}
From source file:com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java
License:Apache License
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException { ts.reset(); final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class); final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); final ByteArrayDataInput in = new ByteArrayDataInput(); int pos = -1; for (final TokenInfo info : infos) { assertThat(ts.incrementToken()).isTrue(); pos += posIncrAtt.getPositionIncrement(); int len = -1; final BytesRef payload = payloadAtt.getPayload(); if (info.len != -1) { assertThat(payload).isNotNull(); in.reset(payload.bytes);// w w w .ja va2 s . com len = in.readVInt(); } else { assertThat(payload).isNull(); } assertThat(new TokenInfo(term.toString(), pos, len)).isEqualTo(info); } assertThat(ts.incrementToken()).isFalse(); }
From source file:com.shaie.LemmatizingTokenizerDemo.java
License:Apache License
private static void printTokens(TokenStream tokenStream) throws IOException { tokenStream.reset(); while (tokenStream.incrementToken()) { System.out.println(tokenStream); }// ww w .j a va 2 s . com }
From source file:com.shaie.SynonymFilterExample.java
License:Apache License
/**
 * Demo entry point: builds a synonym map that maps several color phrases to
 * "color", runs a fixed whitespace-tokenized sentence through a
 * {@code SynonymGraphFilter}, and prints each token's term, position and position length.
 *
 * @param args unused
 * @throws Exception if building the synonym map or consuming the stream fails
 */
@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("dark sea green sea green"));

    // Register the phrases that should all map to "color".
    final SynonymMap.Builder synonyms = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", synonyms);
    addSynonym("green", "color", synonyms);
    addSynonym("dark sea", "color", synonyms);
    addSynonym("sea green", "color", synonyms);
    final SynonymMap synonymMap = synonyms.build();

    final TokenStream stream = new SynonymGraphFilter(tokenizer, synonymMap, true);
    final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute positionIncrement = stream.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute positionLength = stream.addAttribute(PositionLengthAttribute.class);

    stream.reset();
    int position = -1;
    while (stream.incrementToken()) {
        position += positionIncrement.getPositionIncrement();
        final int length = positionLength.getPositionLength();
        System.out.println("term=" + term + ", pos=" + position + ", posLen=" + length);
    }
    stream.end();
    stream.close();
}
From source file:com.sindicetech.siren.analysis.NodeAnalyzerTestCase.java
License:Open Source License
public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages, final String[] expectedTypes, final int[] expectedPosIncrs, final IntsRef[] expectedNode, final int[] expectedPos) throws Exception { final TokenStream t = a.tokenStream("", new StringReader(input)); t.reset(); assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class)); final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); TypeAttribute typeAtt = null;//w w w .j a v a2 s.com if (expectedTypes != null) { assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class)); typeAtt = t.getAttribute(TypeAttribute.class); } PositionIncrementAttribute posIncrAtt = null; if (expectedPosIncrs != null) { assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class)); posIncrAtt = t.getAttribute(PositionIncrementAttribute.class); } NodeAttribute nodeAtt = null; if (expectedNode != null) { assertTrue("has NodeAttribute", t.hasAttribute(NodeAttribute.class)); nodeAtt = t.getAttribute(NodeAttribute.class); } PositionAttribute posAtt = null; if (expectedPos != null) { assertTrue("has PositionAttribute", t.hasAttribute(PositionAttribute.class)); posAtt = t.getAttribute(PositionAttribute.class); } for (int i = 0; i < expectedImages.length; i++) { assertTrue("token " + i + " exists", t.incrementToken()); assertEquals("i=" + i, expectedImages[i], termAtt.toString()); if (expectedTypes != null) { assertEquals(expectedTypes[i], typeAtt.type()); } if (expectedPosIncrs != null) { assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement()); } if (expectedNode != null) { assertEquals(expectedNode[i], nodeAtt.node()); } if (expectedPos != null) { assertEquals(expectedPos[i], posAtt.position()); } } assertFalse("end of stream, received token " + termAtt.toString(), t.incrementToken()); t.end(); t.close(); }