Usage examples for org.apache.lucene.analysis.TokenStream#addAttribute.
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:com.flaptor.indextank.query.IndexEngineParser.java
License:Apache License
public Iterator<AToken> parseDocumentField(String fieldName, String content) { final TokenStream tkstream = analyzer.tokenStream(fieldName, new StringReader(content)); final TermAttribute termAtt = tkstream.addAttribute(TermAttribute.class); final PositionIncrementAttribute posIncrAttribute = tkstream.addAttribute(PositionIncrementAttribute.class); final OffsetAttribute offsetAtt = tkstream.addAttribute(OffsetAttribute.class); return new AbstractIterator<AToken>() { int currentPosition = 0; @Override/*from w ww. jav a 2s . c om*/ protected AToken computeNext() { try { if (!tkstream.incrementToken()) { tkstream.end(); tkstream.close(); return endOfData(); } } catch (IOException e) { //This should never happen, as the reader is a StringReader } //final org.apache.lucene.analysis.Token luceneTk = tkstream.getAttribute(org.apache.lucene.analysis.Token.class); currentPosition += posIncrAttribute.getPositionIncrement(); final int position = currentPosition; final int startOffset = offsetAtt.startOffset(); final int endOffset = offsetAtt.endOffset(); final String text = termAtt.term(); return new AToken() { @Override public String getText() { return text; //luceneTk.term(); } @Override public int getPosition() { return position; //luceneTk.getPositionIncrement(); } @Override public int getStartOffset() { return startOffset; } @Override public int getEndOffset() { return endOffset; } }; } }; }
From source file:com.github.ippeiukai.externaltoken.lucene.analysis.TestPatternTokenizer.java
License:Apache License
/** * TODO: rewrite tests not to use string comparison. *///from ww w . ja va 2 s . c om private static String tsToString(TokenStream in) throws IOException { StringBuilder out = new StringBuilder(); CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); // extra safety to enforce, that the state is not preserved and also // assign bogus values in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); while (in.incrementToken()) { out.append(termAtt.toString()); in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); out.append(' '); } if (out.length() > 0) out.deleteCharAt(out.length() - 1); in.close(); return out.toString(); }
From source file:com.github.le11.nls.lucene.UIMABaseAnalyzerTest.java
License:Apache License
/**
 * Smoke test: streams a fixed sentence through the UIMA analyzer and prints each
 * token with its character offsets, asserting the attributes are present.
 */
@Test
public void baseUIMAAnalyzerStreamTest() {
    try {
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        // TokenStream workflow contract: reset() before the first incrementToken().
        ts.reset();
        while (ts.incrementToken()) {
            assertNotNull(offsetAtt);
            assertNotNull(termAtt);
            System.out.println("token '" + termAtt.toString() + "' has offset " + offsetAtt.startOffset() + ","
                    + offsetAtt.endOffset());
        }
        // Finish the workflow and release the stream.
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}
From source file:com.github.le11.nls.lucene.UIMAPayloadsAnalyzerTest.java
License:Apache License
/**
 * Smoke test: streams a fixed sentence through the payload-producing UIMA analyzer
 * and prints each token with its payload bytes, asserting the attributes are present.
 */
@Test
public void baseUIMAPayloadsAnalyzerStreamTest() {
    try {
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttribute = ts.addAttribute(PayloadAttribute.class);
        // TokenStream workflow contract: reset() before the first incrementToken().
        ts.reset();
        while (ts.incrementToken()) {
            assertNotNull(termAtt);
            assertNotNull(payloadAttribute);
            System.out.println("token '" + termAtt.toString() + "' has payload "
                    + new String(payloadAttribute.getPayload().getData()));
        }
        // Finish the workflow and release the stream.
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}
From source file:com.github.le11.nls.lucene.UIMATypeAwareAnalyzerTest.java
License:Apache License
/**
 * Smoke test for the type-aware analyzer: streams a fixed sentence and prints each
 * token with the POS-derived type, asserting all registered attributes are present.
 */
@Test
public void testSimpleUsage() {
    try {
        UIMATypeAwareAnalyzer analyzer = new UIMATypeAwareAnalyzer("/HmmTaggerAggregate.xml",
                "org.apache.uima.TokenAnnotation", "posTag");
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
        PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
        // TokenStream workflow contract: reset() before the first incrementToken().
        ts.reset();
        while (ts.incrementToken()) {
            assertNotNull(offsetAtt);
            assertNotNull(termAtt);
            assertNotNull(posAtt);
            assertNotNull(typeAttr);
            System.out.println("token '" + termAtt.toString() + "' has type " + typeAttr.type());
        }
        // Finish the workflow and release the stream.
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}
From source file:com.github.mosuka.apache.lucene.example.utils.LuceneExampleUtilTest.java
License:Apache License
/**
 * Verifies the per-field analyzer wrapper: the "id" field keeps the whole value as a
 * single term, while the "text" field is tokenized and lowercased.
 *
 * @throws IOException if tokenization fails
 */
public void testCreateAnalyzerWrapper() throws IOException {
    PerFieldAnalyzerWrapper wrapper = LuceneExampleUtil.createAnalyzerWrapper();

    List<String> expectedIdTermList = new LinkedList<String>(Arrays.asList("1"));
    assertEquals(expectedIdTermList, collectTerms(wrapper, "id", "1"));

    List<String> expectedTextTermList = new LinkedList<String>(
            Arrays.asList("lucene", "is", "a", "full", "text", "search", "library"));
    assertEquals(expectedTextTermList, collectTerms(wrapper, "text", "Lucene is a Full-text search library."));
}

/**
 * Runs {@code text} through the wrapper's analysis chain for {@code field} and
 * collects every produced term, closing the stream even if tokenization throws.
 */
private static List<String> collectTerms(PerFieldAnalyzerWrapper wrapper, String field, String text)
        throws IOException {
    List<String> terms = new LinkedList<String>();
    TokenStream tokenStream = wrapper.tokenStream(field, text);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
    }
    return terms;
}
From source file:com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java
License:Apache License
public List<String> tokenize(String text) { List<String> words = new ArrayList<String>(); if (text != null && !text.isEmpty()) { TokenStream tokenStream = this.createTokenStream(text); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); try {//w ww . j a va2 s. co m while (tokenStream.incrementToken()) { String term = charTermAttribute.toString(); words.add(term); } } catch (IOException ioe) { LOGGER.error("Unable to analyze text. Cause : " + ioe.getMessage(), ioe); } finally { try { tokenStream.end(); tokenStream.close(); } catch (IOException e) { // Can't do nothing!! LOGGER.error("Unable to close token stream : " + e.getMessage()); } } } return words; }
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
/**
 * Registers on the given stream every token attribute this parser later reads:
 * offsets, reading, part of speech, inflection, and base form. Registration order
 * is irrelevant; attributes are keyed by class.
 */
private void addAttributes(TokenStream tokenStream) {
    tokenStream.addAttribute(BaseFormAttribute.class);
    tokenStream.addAttribute(InflectionAttribute.class);
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PartOfSpeechAttribute.class);
    tokenStream.addAttribute(ReadingAttribute.class);
}
From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/**
 * Pulls the next token from {@code input}, preferring a pre-built Lucene Token
 * carried by a GSAttributeImpl and falling back to synthesizing one from the
 * stream's CharTermAttribute text.
 *
 * NOTE(review): addAttribute() is called after incrementToken(); presumably the
 * attribute was already registered by the analysis chain and this only fetches
 * it — confirm, since registering attributes mid-stream is unusual.
 *
 * @param input token stream to advance; callers are expected to have prepared it
 * @return the next token, or null when the stream is exhausted or exposes no
 *         attribute implementations
 * @throws IOException propagated from incrementToken()
 */
public static org.apache.lucene.analysis.Token getNextToken(TokenStream input) throws IOException {
    org.apache.lucene.analysis.Token token = null;
    if (input.incrementToken()) {
        CharTermAttribute ccc = input.addAttribute(CharTermAttribute.class);
        Iterator<AttributeImpl> attIt = input.getAttributeImplsIterator();
        if (attIt == null || !attIt.hasNext()) {
            return null;
        }
        AttributeImpl att = attIt.next();
        // The first attribute impl may be the GS-specific wrapper carrying a full Token.
        if (att instanceof GSAttributeImpl) {
            token = ((GSAttributeImpl) att).getToken();
        }
        // Fallback: build a Token spanning the term text when no wrapper token exists.
        if (token == null && ccc != null && ccc.length() > 0) {
            String ttt = ccc.toString();
            token = new org.apache.lucene.analysis.Token(ttt, 0, ttt.length());
        }
    }
    return token;
}
From source file:com.ibm.watson.developer_cloud.professor_languo.primary_search.SpanBigramQueryGeneratorTest.java
License:Open Source License
/**
 * Checks that the generated span-bigram query matches the reference: every
 * SpanNearQuery clause must pair consecutive stemmed question terms, and the
 * TermQuery clauses must be exactly the set of stemmed terms (unigrams).
 *
 * NOTE(review): the bigram check assumes clauses are grouped per term pair across
 * {@code numFields} fields in order (hence {@code bigramidx / numFields}) — this
 * depends on the generator's clause ordering; confirm against SpanBigramQueryGenerator.
 */
private void test_that_generated_bigram_query_match_the_referenced_query() throws IOException {
    // Get stemmed question: run the question through the English analyzer and
    // collect every stemmed term in order.
    SingletonAnalyzer.generateAnalyzer(PrimarySearchConstants.ENGLISH_ANALYZER);
    EnglishAnalyzer ea = (EnglishAnalyzer) SingletonAnalyzer.getAnalyzer();
    TokenStream ts = ea.tokenStream("field", question);
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    List<String> stemmedQuestion = new ArrayList<String>();
    while (ts.incrementToken())
        stemmedQuestion.add(charTermAttribute.toString());
    // get query terms
    BooleanClause[] clauses = ((BooleanQuery) spanBigramQuery).getClauses();
    SpanQuery[] queries;
    String term1, term2;
    List<String> unigrams = new ArrayList<String>();
    // For n terms there are n unigrams + (n-1) bigrams = 2n-1 clauses per field,
    // so total clause count divided by (2n-1) yields the number of fields queried.
    int numFields = clauses.length / (2 * stemmedQuestion.size() - 1);
    // test bigrams
    int bigramidx = 0;
    for (int idx = 0; idx < clauses.length; idx++) {
        Query q = clauses[idx].getQuery();
        if (q instanceof SpanNearQuery) {
            queries = ((SpanNearQuery) clauses[idx].getQuery()).getClauses();
            // Map the running bigram counter back to the term-pair index: each
            // pair appears once per field.
            int termidx = bigramidx / numFields;
            term1 = ((SpanTermQuery) queries[0]).getTerm().text();
            term2 = ((SpanTermQuery) queries[1]).getTerm().text();
            assertEquals("Extracted first term doesn't match the stemmed term", stemmedQuestion.get(termidx),
                    term1);
            assertEquals("Extracted second term doesn't match the stemmed term",
                    stemmedQuestion.get(termidx + 1), term2);
            bigramidx++;
        } else if (q instanceof TermQuery) {
            unigrams.add(((TermQuery) clauses[idx].getQuery()).getTerm().text());
        } else {
            assertTrue("Unknown type of query found!", false);
        }
    }
    // test unigrams: both sets must cover each other (set equality, duplicates aside).
    for (String s : unigrams)
        assertTrue(stemmedQuestion.contains(s));
    for (String s : stemmedQuestion)
        assertTrue(unigrams.contains(s));
}