List of usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
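incrementToken() advances the stream to its next token, returning false once the input is exhausted. Consumers never receive token objects directly; instead they register attribute instances (for example CharTermAttribute) with addAttribute(), and each successful call to incrementToken() updates those instances in place. The documented contract also requires reset() before the first call, and end() followed by close() after the last. A minimal sketch of that workflow, assuming a recent Lucene version in which StandardAnalyzer has a no-argument constructor (the field name and sample text are illustrative):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

Analyzer analyzer = new StandardAnalyzer();
try (TokenStream stream = analyzer.tokenStream("field", "some sample text")) {
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();                   // mandatory before the first incrementToken()
    while (stream.incrementToken()) { // false signals the end of the stream
        System.out.println(termAtt.toString());
    }
    stream.end();                     // finalizes end-of-stream state such as the final offset
}

The examples below, collected from real projects, all follow some variant of this loop.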
From source file: com.github.le11.nls.lucene.UIMAPayloadsAnalyzerTest.java
License: Apache License

@Test
public void baseUIMAPayloadsAnalyzerStreamTest() {
    try {
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttribute = ts.addAttribute(PayloadAttribute.class);
        ts.reset(); // required before the first incrementToken() call
        while (ts.incrementToken()) {
            assertNotNull(termAtt);
            assertNotNull(payloadAttribute);
            System.out.println("token '" + termAtt.toString() + "' has payload "
                    + new String(payloadAttribute.getPayload().getData()));
        }
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}
From source file: com.github.le11.nls.lucene.UIMATypeAwareAnalyzerTest.java
License: Apache License

@Test
public void testSimpleUsage() {
    try {
        UIMATypeAwareAnalyzer analyzer = new UIMATypeAwareAnalyzer("/HmmTaggerAggregate.xml",
                "org.apache.uima.TokenAnnotation", "posTag");
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
        PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset(); // required before the first incrementToken() call
        while (ts.incrementToken()) {
            assertNotNull(offsetAtt);
            assertNotNull(termAtt);
            assertNotNull(posAtt);
            assertNotNull(typeAttr);
            System.out.println("token '" + termAtt.toString() + "' has type " + typeAttr.type());
        }
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}
From source file: com.github.mosuka.apache.lucene.example.utils.LuceneExampleUtilTest.java
License: Apache License

public void testCreateAnalyzerWrapper() throws IOException {
    PerFieldAnalyzerWrapper wrapper = LuceneExampleUtil.createAnalyzerWrapper();
    TokenStream tokenStream = null;
    CharTermAttribute charTermAttribute = null;

    List<String> expectedIdTermList = new LinkedList<String>(Arrays.asList("1"));
    List<String> actualIdTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("id", "1");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualIdTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedIdTermList, actualIdTermList);

    List<String> expectedTextTermList = new LinkedList<String>(
            Arrays.asList("lucene", "is", "a", "full", "text", "search", "library"));
    List<String> actualTextTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("text", "Lucene is a Full-text search library.");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualTextTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedTextTermList, actualTextTermList);
}
From source file: com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java
License: Apache License

public List<String> tokenize(String text) {
    List<String> words = new ArrayList<String>();
    if (text != null && !text.isEmpty()) {
        // createTokenStream(...) is assumed to return a stream that has already been reset()
        TokenStream tokenStream = this.createTokenStream(text);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                words.add(term);
            }
        } catch (IOException ioe) {
            LOGGER.error("Unable to analyze text. Cause: " + ioe.getMessage(), ioe);
        } finally {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (IOException e) {
                // Nothing more we can do at this point.
                LOGGER.error("Unable to close token stream: " + e.getMessage());
            }
        }
    }
    return words;
}
From source file: com.github.riccardove.easyjasub.lucene.LuceneParser.java
License: Apache License

private List<LuceneToken> readTokens(TokenStream tokenStream) throws IOException {
    ArrayList<LuceneToken> tokens = new ArrayList<LuceneToken>();
    HashMap<Integer, LuceneToken> tokensByStartOffset = new HashMap<Integer, LuceneToken>();
    addAttributes(tokenStream);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (tokenStream.hasAttributes()) {
            LuceneToken token = new LuceneToken();
            readOffset(tokenStream, token);
            // Lucene may output multiple tokens for compound words;
            // keep only the shortest token starting at each offset
            LuceneToken tokenWithSameStartOffset = tokensByStartOffset.get(token.getStartOffset());
            if (tokenWithSameStartOffset != null) {
                if (token.getEndOffset() >= tokenWithSameStartOffset.getEndOffset()) {
                    continue;
                } else {
                    tokens.remove(tokenWithSameStartOffset);
                }
            }
            readReading(tokenStream, token);
            readPartOfSpeech(tokenStream, token);
            readInflection(tokenStream, token);
            readBaseForm(tokenStream, token);
            tokensByStartOffset.put(token.getStartOffset(), token);
            tokens.add(token);
        }
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}
From source file: com.github.rnewson.couchdb.lucene.util.AnalyzersTest.java
License: Apache License

private String[] analyze(final String analyzerName, final String text) throws Exception {
    final Analyzer analyzer = Analyzers.getAnalyzer(analyzerName);
    final TokenStream stream = analyzer.tokenStream("default", new StringReader(text));
    stream.reset();
    final List<String> result = new ArrayList<String>();
    while (stream.incrementToken()) {
        final CharTermAttribute c = stream.getAttribute(CharTermAttribute.class);
        result.add(c.toString());
    }
    return result.toArray(new String[0]);
}
From source file: com.github.tteofili.looseen.MinHashClassifier.java
License: Apache License

private ArrayList<String> getTokens(Analyzer analyzer, String field, String value) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(field, value);
    ts.reset();
    while (ts.incrementToken()) {
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}
From source file: com.globalsight.ling.tm2.lucene.LuceneUtil.java
License: Apache License

public static org.apache.lucene.analysis.Token getNextToken(TokenStream input) throws IOException {
    org.apache.lucene.analysis.Token token = null;
    if (input.incrementToken()) {
        CharTermAttribute ccc = input.addAttribute(CharTermAttribute.class);
        Iterator<AttributeImpl> attIt = input.getAttributeImplsIterator();
        if (attIt == null || !attIt.hasNext()) {
            return null;
        }
        // Prefer the GlobalSight-specific token if the stream exposes one...
        AttributeImpl att = attIt.next();
        if (att instanceof GSAttributeImpl) {
            token = ((GSAttributeImpl) att).getToken();
        }
        // ...otherwise fall back to building a Token from the term text.
        if (token == null && ccc != null && ccc.length() > 0) {
            String ttt = ccc.toString();
            token = new org.apache.lucene.analysis.Token(ttt, 0, ttt.length());
        }
    }
    return token;
}
From source file: com.globalsight.ling.tm2.lucene.LuceneUtil.java
License: Apache License

/**
 * Create GlobalSight TM tokens from a provided segment string
 * using GsAnalyzer.
 *
 * @param p_text fuzzy match format string
 * @return List of c.g.l.tm2.index.Tokens
 */
public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();

    return buildTokenList(tokens);
}
From source file: com.globalsight.ling.tm2.lucene.LuceneUtil.java
License: Apache License

/**
 * Create GlobalSight TM tokens from a provided segment string
 * using GsAnalyzer. This method is suitable for use with TM3
 * fuzzy indices, and does two things differently than createGsTokens():
 * 1) It returns tokens in the order in which they appear
 * 2) It does not collapse duplicate tokens (and correspondingly does
 *    not return count information)
 *
 * @param p_text fuzzy match format string
 * @return List of Strings, each representing one token
 */
public static List<String> createTm3Tokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();
    return tokens;
}