List of usage examples for org.apache.lucene.analysis.TokenStream#addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
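addAttribute returns the stream's single shared instance of the requested attribute class, creating and registering it on first use; calling it again with the same class returns the same instance, whose state is updated in place by each incrementToken(). A minimal, self-contained sketch of the usual consumption pattern (assuming a recent Lucene release, 5.x or later, where StandardAnalyzer has a no-argument constructor; the field name and sample text are placeholders):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AddAttributeExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("field", new StringReader("Hello token streams"));
            // addAttribute registers the attribute with the stream (if absent)
            // and returns the shared instance that incrementToken() updates.
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            try {
                stream.reset(); // required before the first incrementToken()
                while (stream.incrementToken()) {
                    System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
                }
                stream.end(); // records end-of-stream state such as the final offset
            } finally {
                stream.close(); // releases resources; mandatory before the stream can be reused
            }
        }
    }
}

Several of the examples below were written against Lucene 3.x, where TermAttribute.term() played the role of CharTermAttribute.toString() and calling reset() before the first incrementToken() was not yet enforced; both points changed in Lucene 4.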
From source file:org.splevo.vpm.analyzer.semantic.lucene.LuceneCodeAnalyzer.java
License:Open Source License
/**
 * Stem a list of words with a configured stemmer.
 *
 * @param words
 *            The list of words to stem.
 * @param stemming
 *            The stemmer to be used.
 * @return The stemmed list of words.
 */
@SuppressWarnings("resource")
public static String[] stemWords(String[] words, Stemming stemming) {
    Set<String> stemmedStopWords = Sets.newHashSet();
    for (String word : words) {
        TokenStream tokenStream = new StandardTokenizer(LUCENE_VERSION, new StringReader(word));
        tokenStream = Stemming.wrapStemmingFilter(tokenStream, stemming);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                stemmedStopWords.add(term);
            }
        } catch (IOException e) {
            logger.error("Failed to stem a list of words", e);
        }
    }
    return stemmedStopWords.toArray(new String[] {});
}
From source file:org.talend.dataquality.standardization.index.SynonymIndexSearcher.java
License:Open Source License
private List<String> getTokensFromAnalyzer(String input) throws IOException {
    StandardTokenizer tokenStream = new StandardTokenizer(Version.LUCENE_30, new StringReader(input));
    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    TermAttribute termAttribute = result.addAttribute(TermAttribute.class);
    List<String> termList = new ArrayList<String>();
    while (result.incrementToken()) {
        String term = termAttribute.term();
        termList.add(term);
    }
    return termList;
}
From source file:org.thiesen.jiffs.jobs.preprocessor.Preprocessor.java
License:Open Source License
private void preprocess(StoryDBO story, Analyzer analyzer) {
    final String cleanedText = new HtmlStripper().stripHtml(story.getFullText());
    try {
        final TokenStream tokenStream = analyzer.reusableTokenStream("dummy", new StringReader(cleanedText));
        final TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
        final Collection<String> tokens = Sets.newHashSet();
        while (tokenStream.incrementToken()) {
            final String token = termAtt.term();
            if (StringUtils.isNotBlank(token)) {
                tokens.add(token);
            }
        }
        final String tokenString = Joiner.on(',').join(tokens);
        story.setPreprocessedText(tokenString);
        _storyDAO.update(story);
    } catch (IOException e) {
        throw new RuntimeException("IOException on in memory operation", e);
    }
}
From source file:org.tightblog.service.indexer.AbstractTask.java
License:Apache License
/**
 * Create a Lucene term from the first token of the input string.
 *
 * @param field The Lucene document field to create a term with
 * @param input The input you wish to convert into a term
 * @return Lucene search term
 */
Term getTerm(String field, String input) {
    Term term = null;
    if (input != null && field != null) {
        try (Analyzer analyzer = manager.getAnalyzer()) {
            if (analyzer != null) {
                try {
                    TokenStream tokens = analyzer.tokenStream(field, new StringReader(input));
                    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
                    tokens.reset();
                    if (tokens.incrementToken()) {
                        String termt = termAtt.toString();
                        term = new Term(field, termt);
                    }
                } catch (IOException e) {
                    // ignored
                }
            }
        }
    }
    return term;
}
From source file:org.watermint.sourcecolon.org.opensolaris.opengrok.search.Summarizer.java
License:Apache License
private Token[] getTokens(String text) throws IOException {
    // FIXME somehow integrate the cycle below into getSummary to save the cloning and memory;
    // also, creating Tokens is suboptimal with 3.0.0, and this whole class could be replaced by the highlighter
    ArrayList<Token> result = new ArrayList<>();
    TokenStream ts = analyzer.tokenStream("full", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
        Token t = new Token(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
        result.add(t);
    }
    return result.toArray(new Token[result.size()]);
}
From source file:org.weborganic.flint.util.Fields.java
License:Artistic License 2.0
/**
 * Returns the terms for a field.
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 */
public static List<String> toTerms(String field, String text, Analyzer analyzer) {
    StringReader r = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(field, r);
    PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    List<String> terms = new ArrayList<String>();
    try {
        stream.reset();
        while (stream.incrementToken()) {
            String term = attribute.term();
            terms.add(term);
            // TODO Use increment for the phrase query
            // System.err.println(term + ":" + increment.getPositionIncrement());
        }
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
    return terms;
}
From source file:org.weborganic.flint.util.Queries.java
License:Artistic License 2.0
/**
 * Adds the terms produced by analyzing the given text to a phrase query.
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 * @param phrase   The phrase query the terms are added to
 */
private static void addTermsToPhrase(String field, String text, Analyzer analyzer, PhraseQuery phrase) {
    StringReader r = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(field, r);
    PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    try {
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            // Respect position increments so that gaps (e.g. removed stop words) are preserved
            position += increment.getPositionIncrement();
            Term term = new Term(field, attribute.term());
            phrase.add(term, position);
        }
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
}
From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerCase.java
License:Apache License
public static ArrayList<String> getTopicWord(String str) {
    // Construct an IKAnalyzer in smart mode
    Analyzer analyzer = new IKAnalyzer(true);
    ArrayList<String> retData = new ArrayList<String>();
    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(str));
        // Offset attribute: start/end position of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream before consuming it
        ts.reset();
        // Iterate over all tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
            // Keep tokens longer than one character, plus purely numeric tokens
            if (term.toString().length() > 1 || term.toString().matches("^[0-9]*$")) {
                retData.add(term.toString());
            }
        }
        // Perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the TokenStream to release resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return retData;
}
From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Construct an IKAnalyzer in smart mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader("???"));
        // Offset attribute: start/end position of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream before consuming it
        ts.reset();
        // Iterate over all tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // Perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the TokenStream to release resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:org.wltea.analyzer.ik_analyzer5.IKAnalzyerTest.java
License:Apache License
@Test
public void testIK() {
    String text = "???";
    // Construct an IKAnalyzer in smart mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(text));
        // Offset attribute: start/end position of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream before consuming it
        ts.reset();
        // Iterate over all tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // Perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the TokenStream to release resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        analyzer.close();
    }
}