List of usage examples for org.apache.lucene.analysis TokenStream end
Method signature: public void end() throws IOException
Called by the consumer after the last token has been produced, i.e. after
incrementToken() has returned false (using the new TokenStream API).
API). From source file:com.romeikat.datamessie.core.base.util.ParseUtil.java
License:Open Source License
public List<String> parseTerms(final String text, final Analyzer analyzer) { final List<String> terms = new LinkedList<String>(); try {// ww w. j av a 2s . com final TokenStream tokenStream = analyzer.tokenStream(null, text); tokenStream.reset(); final Attribute attribute = tokenStream.getAttribute(CharTermAttribute.class); while (tokenStream.incrementToken()) { final String term = attribute.toString(); terms.add(term); } tokenStream.end(); tokenStream.close(); } catch (final IOException e) { // Cannot be thrown due to usage of a StringReader } return terms; }
From source file:com.scaleunlimited.classify.analyzer.LuceneAnalyzer.java
License:Apache License
/** * @param contentText input text to be parsed into terms * @return salient terms in order of appearance * (or null if this content should be ignored) *//*from w w w . j a va2s . com*/ public List<String> getTermList(String contentText) { init(); List<String> result = new ArrayList<String>(contentText.length() / 10); try { TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText)); CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { if (termAtt.length() > 0) { String term = termAtt.toString(); // Here we skip runs of position increment markers created // by the ShingleFilter for stop words because they skew // the clustering/liblinear analysis. if (!term.matches("(_ )*_")) { result.add(term); } } } stream.end(); stream.close(); } catch (IOException e) { throw new RuntimeException("Impossible error", e); } return result; }
From source file:com.searchcode.app.util.CodeAnalyzer.java
License:Open Source License
public static void main(String[] args) throws IOException { // text to tokenize final String text = "This is a demo of the TokenStream API"; CodeAnalyzer analyzer = new CodeAnalyzer(); TokenStream stream = analyzer.tokenStream("field", new StringReader(text)); // get the CharTermAttribute from the TokenStream CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); try {// w w w. ja v a2 s .c o m stream.reset(); // print all tokens until stream is exhausted while (stream.incrementToken()) { System.out.println(termAtt.toString()); } stream.end(); } finally { stream.close(); } }
From source file:com.shaie.annots.AnnotatingTokenStreamExample.java
License:Apache License
/**
 * Demo entry point: tokenizes a fixed sentence on whitespace, tees the token
 * stream into a secondary "colors" sink, then prints first the text tokens
 * (with positions) and afterwards the color annotation tokens (with the
 * start/length pairs decoded from their payloads).
 */
public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    // TeeSinkTokenFilter lets a single pass over the tokenizer also feed a
    // secondary sink stream; here the sink collects color-matching tokens
    TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
    TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);
    System.out.println("Text tokens:\n");
    // consume all the tokens from the original stream. this also populates the
    // Sink (colors) with its color-matching tokens
    teeSink.reset();
    CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
    // positions start at -1 so the first increment lands on position 0
    int termsPos = -1;
    while (teeSink.incrementToken()) {
        termsPos += termPosAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + termsPos);
    }
    teeSink.end();
    tokenizer.end();
    System.out.println("\nAnnotation tokens:\n");
    // now consume the color annotation tokens from the colors stream;
    // the tee stream must be fully consumed first (done above) or the sink is empty
    CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
    ByteArrayDataInput in = new ByteArrayDataInput();
    colors.reset();
    while (colors.incrementToken()) {
        // each payload encodes the annotation's start and length as two VInts
        BytesRef bytes = payloadAtt.getPayload();
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    colors.end();
    colors.close();
    teeSink.close();
    tokenizer.close();
}
From source file:com.shaie.SynonymFilterExample.java
License:Apache License
@SuppressWarnings("resource") public static void main(String[] args) throws Exception { final Tokenizer tok = new WhitespaceTokenizer(); tok.setReader(new StringReader("dark sea green sea green")); final SynonymMap.Builder builder = new SynonymMap.Builder(true); addSynonym("dark sea green", "color", builder); addSynonym("green", "color", builder); addSynonym("dark sea", "color", builder); addSynonym("sea green", "color", builder); final SynonymMap synMap = builder.build(); final TokenStream ts = new SynonymGraphFilter(tok, synMap, true); final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class); ts.reset();// www.j a va 2 s .com int pos = -1; while (ts.incrementToken()) { pos += posIncrAtt.getPositionIncrement(); System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength()); } ts.end(); ts.close(); }
From source file:com.sindicetech.siren.analysis.filter.TestURINormalisationFilter.java
License:Open Source License
/**
 * Feeds {@code input} through the given tokenizer wrapped in a
 * {@code URINormalisationFilter} and asserts that the produced terms — and,
 * when {@code expectedTypes} is non-null, their token types — match the
 * expected values, followed by end-of-stream.
 *
 * @param t              the tokenizer under test; must expose a CharTermAttribute
 * @param input          the raw text to tokenize
 * @param expectedImages the expected term texts, in order
 * @param expectedTypes  the expected token types, or null to skip type checks
 */
public void assertNormalisesTo(final Tokenizer t, final String input, final String[] expectedImages,
        final String[] expectedTypes) throws Exception {
    assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }
    t.setReader(new StringReader(input));
    t.reset();
    // NOTE(review): reset() is called on the inner tokenizer and the filter is
    // constructed afterwards, so the filter itself is never reset — this works
    // only if URINormalisationFilter carries no per-stream state needing reset;
    // confirm against the filter implementation.
    final TokenStream filter = new URINormalisationFilter(t);
    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", filter.incrementToken());
        assertEquals(expectedImages[i], termAtt.toString());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
    }
    assertFalse("end of stream", filter.incrementToken());
    filter.end();
    filter.close();
}
From source file:com.sindicetech.siren.analysis.NodeAnalyzerTestCase.java
License:Open Source License
/**
 * Runs {@code input} through the analyzer and asserts that every produced
 * token matches the expected term text and, for each non-null expectation
 * array, the expected type, position increment, node path, and position.
 * Finally asserts end-of-stream and releases the token stream.
 *
 * @param a                the analyzer under test
 * @param input            the raw text to analyze
 * @param expectedImages   the expected term texts, in order
 * @param expectedTypes    expected token types, or null to skip
 * @param expectedPosIncrs expected position increments, or null to skip
 * @param expectedNode     expected node paths (SIREn NodeAttribute), or null to skip
 * @param expectedPos      expected positions (SIREn PositionAttribute), or null to skip
 */
public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final IntsRef[] expectedNode,
        final int[] expectedPos) throws Exception {
    final TokenStream t = a.tokenStream("", new StringReader(input));
    t.reset();
    assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    // each optional attribute is looked up only when its expectation array is
    // supplied, so analyzers lacking that attribute still pass the other checks
    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }
    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }
    NodeAttribute nodeAtt = null;
    if (expectedNode != null) {
        assertTrue("has NodeAttribute", t.hasAttribute(NodeAttribute.class));
        nodeAtt = t.getAttribute(NodeAttribute.class);
    }
    PositionAttribute posAtt = null;
    if (expectedPos != null) {
        assertTrue("has PositionAttribute", t.hasAttribute(PositionAttribute.class));
        posAtt = t.getAttribute(PositionAttribute.class);
    }
    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", t.incrementToken());
        assertEquals("i=" + i, expectedImages[i], termAtt.toString());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }
        if (expectedNode != null) {
            assertEquals(expectedNode[i], nodeAtt.node());
        }
        if (expectedPos != null) {
            assertEquals(expectedPos[i], posAtt.position());
        }
    }
    assertFalse("end of stream, received token " + termAtt.toString(), t.incrementToken());
    t.end();
    t.close();
}
From source file:com.sindicetech.siren.analysis.TestConciseJsonAnalyzer.java
License:Open Source License
@Test public void testNumeric() throws Exception { _a.registerDatatype(XSDDatatype.XSD_LONG.toCharArray(), new LongNumericAnalyzer(64)); final TokenStream t = _a.tokenStream("", new StringReader("{ \"a\" : 12 }")); final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); t.reset();// w ww . ja v a 2 s . c om assertTrue(t.incrementToken()); assertTrue(termAtt.toString().startsWith("a:")); t.end(); t.close(); }
From source file:com.sindicetech.siren.solr.analysis.BaseSirenStreamTestCase.java
License:Open Source License
public void assertTokenStreamContents(final TokenStream stream, final String[] expectedImages) throws Exception { assertTrue("has TermAttribute", stream.hasAttribute(CharTermAttribute.class)); final CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class); stream.reset();//from w ww . ja v a 2s. c om for (int i = 0; i < expectedImages.length; i++) { stream.clearAttributes(); assertTrue("token " + i + " does not exists", stream.incrementToken()); assertEquals(expectedImages[i], termAtt.toString()); } assertFalse("end of stream", stream.incrementToken()); stream.end(); stream.close(); }
From source file:com.stratio.cassandra.index.query.Condition.java
License:Apache License
/**
 * Analyzes {@code value} with the column mapper's analyzer and returns the
 * single resulting term as a UTF-8 string.
 *
 * @param field        the field name passed to the analyzer
 * @param value        the raw text to analyze; must yield exactly one term
 * @param columnMapper supplies the analyzer to use
 * @return the analyzed term, or null if the analyzer produced no token
 * @throws IllegalArgumentException if the analyzer produced more than one term
 * @throws RuntimeException         wrapping any IOException from the stream
 */
protected String analyze(String field, String value, ColumnMapper<?> columnMapper) {
    TokenStream source = null;
    try {
        Analyzer analyzer = columnMapper.analyzer();
        source = analyzer.tokenStream(field, value);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        // getBytesRef() returns a reference that fillBytesRef() later populates
        // in place, so it is safe to fetch before the first token is consumed
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken()) {
            // no token at all — the caller treats this as "nothing to match"
            return null;
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        source.end();
        // deep copy: 'bytes' aliases the attribute's internal buffer, which is
        // invalidated once the stream is released
        return BytesRef.deepCopyOf(bytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        // closes quietly; handles 'source' still being null if tokenStream() threw
        IOUtils.closeWhileHandlingException(source);
    }
}