List of usage examples for org.apache.lucene.analysis.TokenStream.reset()
public void reset() throws IOException
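Every example on this page follows the same TokenStream consumption contract: obtain the stream, attach attributes, call reset() exactly once before the first incrementToken(), drain the stream, then call end() and close(). Here is a minimal self-contained sketch of that contract (assuming Lucene 5.x or later, where StandardAnalyzer takes no Version argument; the field name and input text are placeholders):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetContractDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("field", "some text to tokenize")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                    // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end();                      // records end-of-stream state such as the final offset
        }                                      // try-with-resources calls close()
    }
}

Skipping reset() typically fails at runtime with a "TokenStream contract violation" IllegalStateException, which is why every example below calls it before the token loop.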
From source file: org.index.Tag.java

private String getBagOfWords(String text) throws Exception {
    StringBuffer buff = new StringBuffer();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new SOAnalyzer(toStem, stopFile);
    TokenStream stream = analyzer.tokenStream("bow", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
From source file: org.index.TermScore.java

private List<String> getBagOfWords(String text) throws Exception {
    List<String> terms = new ArrayList<>();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_4_9); // was: new SOAnalyzer(toStem, stopFile)
    TokenStream stream = analyzer.tokenStream("bow", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        terms.add(term);
    }
    stream.end();
    stream.close();
    return terms;
}
From source file: org.LexGrid.LexBIG.Impl.Extensions.GenericExtensions.search.SearchExtensionImpl.java
License: Open Source License

public List<String> tokenize(Analyzer analyzer, String field, String keywords) throws IOException {
    List<String> result = new ArrayList<String>();
    StringReader reader = new StringReader(keywords);
    TokenStream stream = analyzer.tokenStream(field, reader);
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end();
    } finally {
        stream.close(); // close once, in finally, so it also runs on failure
    }
    return result;
}
From source file: org.meresco.lucene.analysis.MerescoStandardAnalyzer.java
License: Open Source License

public static List<String> readTokenStream(TokenStream tok) throws IOException {
    List<String> terms = new ArrayList<String>();
    CharTermAttribute termAtt = tok.addAttribute(CharTermAttribute.class);
    try {
        tok.reset();
        while (tok.incrementToken()) {
            terms.add(termAtt.toString());
        }
        tok.end();
    } finally {
        tok.close();
    }
    return terms;
}
From source file: org.meresco.lucene.suggestion.SuggestionIndex.java
License: Open Source License

public List<String> shingles(String s) throws IOException {
    List<String> shingles = new ArrayList<String>();
    TokenStream stream = this.shingleAnalyzer.tokenStream("ignored", s);
    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        shingles.add(termAttribute.toString());
    }
    stream.end(); // complete the stream contract before close()
    stream.close();
    return shingles;
}
From source file: org.meresco.lucene.suggestion.SuggestionNGramIndex.java
License: Open Source License

public static List<String> ngrams(String s, Boolean trigram) throws IOException {
    List<String> ngram = new ArrayList<String>();
    Analyzer ngramAnalyzer = trigram ? TRIGRAM_ANALYZER : BIGRAM_ANALYZER;
    TokenStream stream = ngramAnalyzer.tokenStream("ignored", s);
    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        ngram.add(termAttribute.toString());
    }
    stream.end(); // complete the stream contract before close()
    stream.close();
    return ngram;
}
From source file: org.nuxeo.ecm.platform.categorization.categorizer.tfidf.TfIdfCategorizer.java
License: Open Source License

public List<String> tokenize(String textContent) {
    try {
        List<String> terms = new ArrayList<String>();
        TokenStream tokenStream = getAnalyzer().tokenStream(null, textContent);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        return terms;
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
From source file: org.omegat.tokenizer.BaseTokenizer.java
License: Open Source License

protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
        final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    List<Token> result = new ArrayList<Token>(64);
    final TokenStream in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset()));
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new Token[result.size()]);
}
From source file: org.omegat.tokenizer.BaseTokenizer.java
License: Open Source License

protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
        boolean filterDigits, boolean filterWhitespace) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }
    List<String> result = new ArrayList<String>(64);
    final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    Locale loc = stemsAllowed ? getLanguage().getLocale() : null;
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(tokenText);
                if (stemsAllowed) {
                    String origText = str.substring(off.startOffset(), off.endOffset());
                    if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                        result.add(origText);
                    }
                }
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new String[result.size()]);
}
From source file: org.openedit.data.lucene.AnalyzingQueryParserWithStop.java
License: Apache License

/**
 * Returns the analyzed form for the given chunk.
 *
 * If the analyzer produces more than one output token from the given chunk,
 * a ParseException is thrown.
 *
 * @param field The target field
 * @param termStr The full term from which the given chunk is excerpted
 * @param chunk The portion of the given termStr to be analyzed
 * @return The result of analyzing the given chunk
 * @throws ParseException when analysis returns other than one output token
 */
protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException {
    String analyzed = null;
    TokenStream stream = null;
    try {
        stream = getAnalyzer().tokenStream(field, chunk);
        stream.reset();
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        // get first and hopefully only output token
        if (stream.incrementToken()) {
            analyzed = termAtt.toString();
            // try to increment again; there should only be one output token
            StringBuilder multipleOutputs = null;
            while (stream.incrementToken()) {
                if (null == multipleOutputs) {
                    multipleOutputs = new StringBuilder();
                    multipleOutputs.append('"');
                    multipleOutputs.append(analyzed);
                    multipleOutputs.append('"');
                }
                multipleOutputs.append(',');
                multipleOutputs.append('"');
                multipleOutputs.append(termAtt.toString());
                multipleOutputs.append('"');
            }
            stream.end();
            if (null != multipleOutputs) {
                throw new ParseException(String.format(getLocale(),
                        "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
            }
        } else {
            // nothing returned by analyzer. Was it a stop word and the user accidentally
            // used an analyzer with stop words?
            stream.end();
            // Need to just ignore this
            return null;
            // throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
        }
    } catch (IOException e) {
        throw new ParseException(
                String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
    return analyzed;
}
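The examples above all create a fresh stream per call. reset() also matters when a Tokenizer instance is reused across inputs: new input is supplied with setReader(), and reset() is then called again for the new stream. A hypothetical sketch of that reuse cycle (again assuming Lucene 5.x or later; WhitespaceTokenizer and the sample inputs are arbitrary choices for illustration):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizerReuseDemo {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tok = new WhitespaceTokenizer();
        CharTermAttribute termAtt = tok.addAttribute(CharTermAttribute.class);
        for (String text : new String[] { "first input", "second input" }) {
            tok.setReader(new StringReader(text)); // new input must be set before reset()
            tok.reset();                           // starts the new stream
            while (tok.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            tok.end();
            tok.close();                           // required before the next setReader() call
        }
    }
}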