List of usage examples for org.apache.lucene.analysis TokenStream close
@Override public void close() throws IOException
From source file:org.LexGrid.LexBIG.Impl.Extensions.GenericExtensions.search.SearchExtensionImpl.java
License:Open Source License
public List<String> tokenize(Analyzer analyzer, String field, String keywords) throws IOException { List<String> result = new ArrayList<String>(); StringReader reader = new StringReader(keywords); TokenStream stream = analyzer.tokenStream(field, reader); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); try {//from w w w . j a va 2 s .c o m stream.reset(); while (stream.incrementToken()) { result.add(termAtt.toString()); } stream.close(); } finally { stream.close(); } return result; }
From source file:org.meresco.lucene.analysis.MerescoStandardAnalyzer.java
License:Open Source License
/**
 * Drains the given token stream and collects every term it emits.
 *
 * @param tok the stream to consume; it is always closed before returning
 * @return the terms in the order the stream produced them
 * @throws IOException if reading the stream fails
 */
public static List<String> readTokenStream(TokenStream tok) throws IOException {
    final List<String> collected = new ArrayList<String>();
    final CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    try {
        tok.reset();
        // Pull tokens one at a time until the stream is exhausted.
        while (tok.incrementToken()) {
            collected.add(term.toString());
        }
        tok.end();
    } finally {
        // Close unconditionally so the analyzer can be reused.
        tok.close();
    }
    return collected;
}
From source file:org.meresco.lucene.suggestion.SuggestionIndex.java
License:Open Source License
public List<String> shingles(String s) throws IOException { List<String> shingles = new ArrayList<String>(); TokenStream stream = this.shingleAnalyzer.tokenStream("ignored", s); stream.reset();//from w w w. j a v a2 s. c o m CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class); while (stream.incrementToken()) { shingles.add(termAttribute.toString()); } stream.close(); return shingles; }
From source file:org.meresco.lucene.suggestion.SuggestionNGramIndex.java
License:Open Source License
public static List<String> ngrams(String s, Boolean trigram) throws IOException { List<String> ngram = new ArrayList<String>(); Analyzer ngramAnalyzer = trigram ? TRIGRAM_ANALYZER : BIGRAM_ANALYZER; TokenStream stream = ngramAnalyzer.tokenStream("ignored", s); stream.reset();//w w w. j a v a2 s.c om CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class); while (stream.incrementToken()) { ngram.add(termAttribute.toString()); } stream.close(); return ngram; }
From source file:org.nuxeo.ecm.platform.categorization.categorizer.tfidf.TfIdfCategorizer.java
License:Open Source License
public List<String> tokenize(String textContent) { try {//from w w w .jav a2 s .c om List<String> terms = new ArrayList<String>(); TokenStream tokenStream = getAnalyzer().tokenStream(null, textContent); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { terms.add(charTermAttribute.toString()); } tokenStream.end(); tokenStream.close(); return terms; } catch (IOException e) { throw new IllegalStateException(e); } }
From source file:org.omegat.tokenizer.BaseTokenizer.java
License:Open Source License
protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed, final boolean filterDigits, final boolean filterWhitespace) { if (StringUtil.isEmpty(strOrig)) { return EMPTY_TOKENS_LIST; }/*from ww w .j a va 2 s. com*/ List<Token> result = new ArrayList<Token>(64); final TokenStream in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed); in.addAttribute(CharTermAttribute.class); in.addAttribute(OffsetAttribute.class); CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class); OffsetAttribute off = in.getAttribute(OffsetAttribute.class); try { in.reset(); while (in.incrementToken()) { String tokenText = cattr.toString(); if (acceptToken(tokenText, filterDigits, filterWhitespace)) { result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset())); } } in.end(); in.close(); } catch (IOException ex) { // shouldn't happen } return result.toArray(new Token[result.size()]); }
From source file:org.omegat.tokenizer.BaseTokenizer.java
License:Open Source License
protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed, boolean filterDigits, boolean filterWhitespace) { if (StringUtil.isEmpty(str)) { return EMPTY_STRING_LIST; }//w w w . j av a2s . c o m List<String> result = new ArrayList<String>(64); final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed); in.addAttribute(CharTermAttribute.class); in.addAttribute(OffsetAttribute.class); CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class); OffsetAttribute off = in.getAttribute(OffsetAttribute.class); Locale loc = stemsAllowed ? getLanguage().getLocale() : null; try { in.reset(); while (in.incrementToken()) { String tokenText = cattr.toString(); if (acceptToken(tokenText, filterDigits, filterWhitespace)) { result.add(tokenText); if (stemsAllowed) { String origText = str.substring(off.startOffset(), off.endOffset()); if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) { result.add(origText); } } } } in.end(); in.close(); } catch (IOException ex) { // shouldn't happen } return result.toArray(new String[result.size()]); }
From source file:org.opengrok.web.api.v1.suggester.query.SuggesterQueryParser.java
License:Open Source License
/**
 * Runs the analyzer over the given text and returns every token it produces.
 * Analysis failures are logged and yield the tokens collected so far.
 *
 * @param analyzer the analyzer to use
 * @param field    the field name handed to the analyzer
 * @param text     the text to analyze
 * @return the tokens in stream order; possibly empty, never null
 */
private static List<String> getAllTokens(final Analyzer analyzer, final String field, final String text) {
    final List<String> tokens = new LinkedList<>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(field, text);
        final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(term.toString());
        }
    } catch (IOException e) {
        logger.log(Level.WARNING, "Could not analyze query text", e);
    } finally {
        if (stream != null) {
            try {
                stream.end();
                stream.close();
            } catch (IOException e) {
                logger.log(Level.WARNING, "Could not close token stream", e);
            }
        }
    }
    return tokens;
}
From source file:org.pageseeder.flint.lucene.query.Queries.java
License:Apache License
/** * Returns the terms for a field/*w w w. j a v a 2 s . c o m*/ * * @param field The field * @param text The text to analyze * @param analyzer The analyzer * * @return the corresponding list of terms produced by the analyzer. * * @throws IOException */ private static void addTermsToPhrase(String field, String text, Analyzer analyzer, PhraseQuery phrase) { try { TokenStream stream = analyzer.tokenStream(field, text); PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class); CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class); int position = -1; stream.reset(); while (stream.incrementToken()) { position += increment.getPositionIncrement(); Term term = new Term(field, attribute.toString()); phrase.add(term, position); } stream.end(); stream.close(); } catch (IOException ex) { // Should not occur since we use a StringReader ex.printStackTrace(); } }
From source file:org.pageseeder.flint.lucene.query.Queries.java
License:Apache License
private static boolean isTokenized(String field, Analyzer analyzer) { // try to load terms for a phrase and return true if more than one term TokenStream stream = null; try {/* ww w . j a va 2s. c om*/ stream = analyzer.tokenStream(field, "word1 word2"); stream.reset(); if (stream.incrementToken()) { return stream.incrementToken(); } } catch (IOException ex) { // Should not occur since we use a StringReader ex.printStackTrace(); } finally { if (stream != null) try { stream.end(); stream.close(); } catch (IOException ex) { // Should not occur since we use a StringReader ex.printStackTrace(); } } return false; }