List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException

This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API). It can be used to perform any end-of-stream operations, such as setting the final offset of a stream.
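Before the individual examples, a minimal self-contained sketch of the canonical consumer workflow (reset, incrementToken, end, close). The StandardAnalyzer, field name, and sample text are placeholders, and the sketch assumes a Lucene version (5.x or later) whose StandardAnalyzer has a no-argument constructor:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamEndSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("body", "some example text")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                  // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();                    // records end-of-stream state, e.g. the final offset
        }                                    // try-with-resources closes the stream and analyzer
    }
}

The examples below all follow this pattern; they differ mainly in how they handle exceptions and whether close() is guaranteed by a finally block.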
From source file:org.index.TermScore.java
private List<String> getBagOfWords(String text) throws Exception {
    List<String> terms = new ArrayList<>();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    // The custom analyzer is commented out in the source (was: new SOAnalyzer(toStem, stopFile)),
    // so toStem and stopFile are currently unused and a plain WhitespaceAnalyzer is used instead.
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_4_9);
    TokenStream stream = analyzer.tokenStream("bow", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        terms.add(term);
    }
    stream.end();
    stream.close();
    return terms;
}
From source file:org.meresco.lucene.analysis.MerescoStandardAnalyzer.java
License:Open Source License
public static List<String> readTokenStream(TokenStream tok) throws IOException {
    List<String> terms = new ArrayList<String>();
    CharTermAttribute termAtt = tok.addAttribute(CharTermAttribute.class);
    try {
        tok.reset();
        while (tok.incrementToken()) {
            terms.add(termAtt.toString());
        }
        tok.end();
    } finally {
        tok.close();
    }
    return terms;
}
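A hypothetical call to this helper (analyzer, field name, and text are illustrative). It accepts any TokenStream, and the helper itself takes care of reset(), end(), and close():

Analyzer analyzer = new StandardAnalyzer();
List<String> terms = MerescoStandardAnalyzer.readTokenStream(
        analyzer.tokenStream("field", "some example text"));
// terms now holds one entry per token; the stream is already closed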
From source file:org.nuxeo.ecm.platform.categorization.categorizer.tfidf.TfIdfCategorizer.java
License:Open Source License
public List<String> tokenize(String textContent) {
    try {
        List<String> terms = new ArrayList<String>();
        TokenStream tokenStream = getAnalyzer().tokenStream(null, textContent);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        return terms;
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
From source file:org.omegat.tokenizer.BaseTokenizer.java
License:Open Source License
protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
        final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    List<Token> result = new ArrayList<Token>(64);
    final TokenStream in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset()));
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new Token[result.size()]);
}
From source file:org.omegat.tokenizer.BaseTokenizer.java
License:Open Source License
protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
        boolean filterDigits, boolean filterWhitespace) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }
    List<String> result = new ArrayList<String>(64);
    final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    Locale loc = stemsAllowed ? getLanguage().getLocale() : null;
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(tokenText);
                if (stemsAllowed) {
                    String origText = str.substring(off.startOffset(), off.endOffset());
                    if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                        result.add(origText);
                    }
                }
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new String[result.size()]);
}
From source file:org.openedit.data.lucene.AnalyzingQueryParserWithStop.java
License:Apache License
/**
 * Returns the analyzed form for the given chunk.
 *
 * If the analyzer produces more than one output token from the given chunk,
 * a ParseException is thrown.
 *
 * @param field The target field
 * @param termStr The full term from which the given chunk is excerpted
 * @param chunk The portion of the given termStr to be analyzed
 * @return The result of analyzing the given chunk
 * @throws ParseException when analysis returns other than one output token
 */
protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException {
    String analyzed = null;
    TokenStream stream = null;
    try {
        stream = getAnalyzer().tokenStream(field, chunk);
        stream.reset();
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        // get first and hopefully only output token
        if (stream.incrementToken()) {
            analyzed = termAtt.toString();
            // try to increment again, there should only be one output token
            StringBuilder multipleOutputs = null;
            while (stream.incrementToken()) {
                if (null == multipleOutputs) {
                    multipleOutputs = new StringBuilder();
                    multipleOutputs.append('"');
                    multipleOutputs.append(analyzed);
                    multipleOutputs.append('"');
                }
                multipleOutputs.append(',');
                multipleOutputs.append('"');
                multipleOutputs.append(termAtt.toString());
                multipleOutputs.append('"');
            }
            stream.end();
            if (null != multipleOutputs) {
                throw new ParseException(String.format(getLocale(),
                        "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
            }
        } else {
            // nothing returned by analyzer. Was it a stop word and the user accidentally
            // used an analyzer with stop words?
            stream.end();
            // Need to just ignore this
            return null;
            //throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
        }
    } catch (IOException e) {
        throw new ParseException(
                String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
    return analyzed;
}
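The contract above can be illustrated with two hypothetical calls, written as if from a subclass (the method is protected, and the described behavior assumes an analyzer configured with English stop words, such as a stop-filtering StandardAnalyzer):

// A stop word may produce no output token: the method deliberately returns null.
String gone = analyzeSingleChunk("body", "the", "the");   // expected: null, not an exception
// A chunk the analyzer splits into several tokens (e.g. "wi-fi") is reported as an error:
try {
    analyzeSingleChunk("body", "wi-fi", "wi-fi");
} catch (ParseException expected) {
    // "Analyzer created multiple terms" is raised for the chunk
}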
From source file:org.opengrok.web.api.v1.suggester.query.SuggesterQueryParser.java
License:Open Source License
private static List<String> getAllTokens(final Analyzer analyzer, final String field, final String text) {
    List<String> tokens = new LinkedList<>();
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(field, text);
        CharTermAttribute attr = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(attr.toString());
        }
    } catch (IOException e) {
        logger.log(Level.WARNING, "Could not analyze query text", e);
    } finally {
        try {
            if (ts != null) {
                ts.end();
                ts.close();
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "Could not close token stream", e);
        }
    }
    return tokens;
}
From source file:org.pageseeder.flint.lucene.query.Queries.java
License:Apache License
/**
 * Adds the terms produced by the analyzer for the given field and text to a phrase query,
 * using the position increment attribute to preserve term positions.
 *
 * @param field The field
 * @param text The text to analyze
 * @param analyzer The analyzer
 * @param phrase The phrase query to add the terms to
 */
private static void addTermsToPhrase(String field, String text, Analyzer analyzer, PhraseQuery phrase) {
    try {
        TokenStream stream = analyzer.tokenStream(field, text);
        PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += increment.getPositionIncrement();
            Term term = new Term(field, attribute.toString());
            phrase.add(term, position);
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
}
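The helper above targets the older mutable PhraseQuery. In newer Lucene releases PhraseQuery is immutable and built through PhraseQuery.Builder; a sketch of the same position-aware loop against that API (field, text, and analyzer are placeholders):

PhraseQuery.Builder builder = new PhraseQuery.Builder();
try (TokenStream stream = analyzer.tokenStream(field, text)) {
    PositionIncrementAttribute inc = stream.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    int position = -1;
    stream.reset();
    while (stream.incrementToken()) {
        position += inc.getPositionIncrement(); // preserves holes left by removed stop words
        builder.add(new Term(field, term.toString()), position);
    }
    stream.end();
}
PhraseQuery phrase = builder.build();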
From source file:org.pageseeder.flint.lucene.query.Queries.java
License:Apache License
private static boolean isTokenized(String field, Analyzer analyzer) {
    // try to load terms for a phrase and return true if more than one term
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(field, "word1 word2");
        stream.reset();
        if (stream.incrementToken()) {
            return stream.incrementToken();
        }
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    } finally {
        if (stream != null) {
            try {
                stream.end();
                stream.close();
            } catch (IOException ex) {
                // Should not occur since we use a StringReader
                ex.printStackTrace();
            }
        }
    }
    return false;
}
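What the probe is expected to report for two stock analyzers (illustrative only; the method is private, so calls like these would live inside Queries itself):

// KeywordAnalyzer emits the whole input as a single token.
boolean keyword = isTokenized("id", new KeywordAnalyzer());     // expected: false
// StandardAnalyzer splits "word1 word2" into two tokens.
boolean standard = isTokenized("body", new StandardAnalyzer()); // expected: true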
From source file:org.pageseeder.flint.lucene.search.Fields.java
License:Apache License
/**
 * Returns the terms for a field.
 *
 * @param field The field
 * @param text The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 */
public static List<String> toTerms(String field, String text, Analyzer analyzer) {
    List<String> terms = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = attribute.toString();
            terms.add(term);
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
    return terms;
}
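A hypothetical call, assuming a StandardAnalyzer (which lowercases and splits on word boundaries); the field name and text are illustrative:

List<String> terms = Fields.toTerms("title", "Hello World", new StandardAnalyzer());
// expected result with StandardAnalyzer: [hello, world]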