List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; object identifying the attribute to retrieve; the method throws IllegalArgumentException if the stream does not have that attribute.
From source file:fry.future.plugin.example.APP.java
/**
 * Tokenizes {@code str} with the given analyzer and returns the term text of each token.
 *
 * @param analyzer analyzer used to produce the token stream
 * @param str      input text to tokenize
 * @return list of token terms in stream order
 * @throws IOException if the token stream fails during consumption
 */
private static List<String> tokenString(Analyzer analyzer, String str) throws IOException {
    List<String> result = new ArrayList<>();
    // try-with-resources: the original never closed the TokenStream, violating the
    // Lucene contract (reset -> incrementToken* -> end -> close) and leaking the
    // analyzer's reusable stream.
    try (TokenStream tokenStream = analyzer.tokenStream("Test", new StringReader(str))) {
        // Fetch the attribute once; it is updated in place on every incrementToken().
        CharTermAttribute termAttr = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.add(termAttr.toString());
        }
        tokenStream.end();
    }
    return result;
}
From source file:hivemall.nlp.tokenizer.KuromojiUDF.java
License:Apache License
private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results) throws IOException { // instantiate an attribute placeholder once CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); stream.reset();//from w w w . java 2 s . c o m while (stream.incrementToken()) { String term = termAttr.toString(); results.add(new Text(term)); } }
From source file:info.johtani.jjug.lucene.sample.TokenizeSample.java
License:Apache License
private static void printToken(String text, Analyzer analyzer) { System.out.println("--- Original: [" + text + "]"); try {//from w w w . j av a 2 s . c o m TokenStream tokens = analyzer.tokenStream("content", text); tokens.reset(); CharTermAttribute termAttr = tokens.getAttribute(CharTermAttribute.class); while (tokens.incrementToken()) { System.out.println("[" + termAttr.toString() + "]"); } tokens.reset(); } catch (IOException e) { e.printStackTrace(); } }
From source file:ivory.core.tokenize.Tokenizer.java
License:Apache License
/**
 * Converts a token stream into a single space-delimited string of its terms.
 *
 * <p>NOTE(review): this method never calls {@code reset()} on the stream; it
 * appears to assume the caller has already done so — confirm against call sites,
 * since newer Lucene versions throw if the contract is not followed.
 *
 * @param tokenStream object returned by a Lucene tokenizer
 * @return string of the tokens output by {@code tokenStream}, separated by single spaces
 */
protected static String streamToString(TokenStream tokenStream) {
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.clearAttributes();
    StringBuilder tokenized = new StringBuilder();
    try {
        while (tokenStream.incrementToken()) {
            // Append the CharSequence attribute directly; the original built a
            // throwaway String via concatenation inside the StringBuilder append.
            tokenized.append(termAtt).append(' ');
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // trim() drops the trailing separator (and returns "" for an empty stream).
    return tokenized.toString().trim();
}
From source file:jaligner.Sequence.java
License:Open Source License
/**
 * Constructs a Sequence by tokenizing {@code sequence} with the given analyzer and
 * capturing up to {@code max_length} Lucene {@link Token}s (term text, offsets, and,
 * when the stream provides them, position increment and type).
 *
 * <p>NOTE(review): the stream is consumed without a prior {@code reset()} call; on
 * Lucene versions that enforce the TokenStream contract this throws an
 * IllegalStateException — confirm the target Lucene version and analyzer behavior.
 *
 * @param sequence   raw text to tokenize and store
 * @param analyzer   analyzer producing the token stream
 * @param max_length maximum number of tokens to retain
 * @throws IOException if tokenization fails
 */
public Sequence(String sequence, Analyzer analyzer, int max_length) throws IOException {
    super();
    this.sequence = sequence;
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(sequence));
    // Factory that builds Token instances compatible with this stream's attribute setup.
    Token.TokenAttributeFactory tokenAttributeFactory = new Token.TokenAttributeFactory(
            stream.getAttributeFactory());
    Vector<Token> tokenVector = new Vector<Token>();
    while (stream.incrementToken() && tokenVector.size() < max_length) {
        // Token token = new Token();
        // Token token = (Token) stream.getAttribute(CharTermAttribute.class);
        // A fresh Token per iteration: the stream's attributes are reused in place,
        // so their state must be copied out before the next incrementToken().
        Token token = (Token) tokenAttributeFactory.createAttributeInstance(Token.class);
        CharTermAttribute charTerm = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        // PayloadAttribute payload = stream.getAttribute(PayloadAttribute.class);
        // FlagsAttribute flags = stream.getAttribute(FlagsAttribute.class);
        // public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
        // Copy term characters and character offsets into the detached token.
        token.reinit(charTerm.buffer(), 0, charTerm.length(), offset.startOffset(), offset.endOffset());
        token.setOffset(offset.startOffset(), offset.endOffset());
        // token.setPayload(payload.getPayload());
        // token.setFlags(flags.getFlags());
        // Position increment and type are optional attributes; copy only when present.
        if (stream.hasAttribute(PositionIncrementAttribute.class)) {
            PositionIncrementAttribute positionIncrement = stream
                    .getAttribute(PositionIncrementAttribute.class);
            token.setPositionIncrement(positionIncrement.getPositionIncrement());
        }
        if (stream.hasAttribute(TypeAttribute.class)) {
            TypeAttribute type = stream.getAttribute(TypeAttribute.class);
            token.setType(type.type());
        }
        tokenVector.add(token);
    }
    stream.end();
    stream.close();
    this.tokens = tokenVector.toArray(new Token[tokenVector.size()]);
}
From source file:jobs.LoadOntologyJob.java
private int getTotalLength(String label) throws IOException { //Analyzer doesn't remomve stop words Analyzer customanalyzer = new CustomStopWordsStandardAnalyzer(Version.LUCENE_47); List<String> resultStop = new ArrayList<String>(); TokenStream customstream = customanalyzer.tokenStream(null, new StringReader(label)); customstream.reset();//from w w w. ja v a 2s . c om while (customstream.incrementToken()) { resultStop.add(customstream.getAttribute(CharTermAttribute.class).toString()); } return resultStop.size(); }
From source file:jobs.LoadOntologyJob.java
/**
 * Counts the tokens of {@code label} after standard analysis, which removes stop
 * words — i.e. the token count excluding stop words.
 *
 * @param label text to tokenize
 * @return number of tokens remaining after stop-word removal
 * @throws IOException if tokenization fails
 */
private int getLengthWithoutStopWords(String label) throws IOException {
    int count = 0;
    // try-with-resources: the original leaked both the Analyzer and the TokenStream;
    // Lucene requires end() + close() after consuming a stream. Counting directly
    // also avoids building a throwaway List just to take its size.
    try (Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
            TokenStream stream = analyzer.tokenStream(null, new StringReader(label))) {
        stream.reset();
        while (stream.incrementToken()) {
            count++;
        }
        stream.end();
    }
    return count;
}
From source file:jp.co.atware.solr.analizers.cjk.CJKBigramFilterTest.java
License:Apache License
@Theory public void testIncrementToken(Fixture testData) throws Exception { TokenStream tokenStream = getTokenStream(testData.input); CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); List<String> actual = new ArrayList<String>(); while (tokenStream.incrementToken()) { actual.add(termAtt.toString());//w ww. j a v a 2 s . c o m } assertThat(actual.toArray(EMPTY_STRING_ARRAY), is(testData.expected)); }
From source file:jp.co.atware.solr.analizers.cjk.CranioCaudalFilterTest.java
License:Apache License
@Theory public void testIncrementToken(TestData testData) throws Exception { TokenStream tokenStream = createTokenStream(testData.input); CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); List<String> actual = new ArrayList<String>(); while (tokenStream.incrementToken()) { actual.add(termAtt.toString());//www . j a v a 2 s .c o m } assertThat(actual.toArray(EMPTY_STRING_ARRAY), is(testData.expected)); }
From source file:jp.co.atware.solr.analizers.cjk.MultistageMappingCharFilterTest.java
License:Apache License
@Theory public void testMultiMappingAndOffset(TestData testData) throws Exception { Reader reader = charFilterFactory.create(new StringReader(testData.input)); TokenStream tokenStream = tokenizerFactory.create(reader); OffsetAttribute actualOffset = tokenStream.getAttribute(OffsetAttribute.class); CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset();//from ww w . j a va 2s . co m assertThat(tokenStream.incrementToken(), is(true)); assertThat(termAtt.toString(), is(testData.expected)); assertThat(actualOffset.startOffset(), is(testData.start)); assertThat(actualOffset.endOffset(), is(testData.end)); assertThat(tokenStream.incrementToken(), is(false)); }