List of usage examples for org.apache.lucene.analysis.TokenStream.reset()
public void reset() throws IOException
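Every example on this page follows the same TokenStream consumption contract: obtain the stream, attach attributes, call reset() exactly once before the first incrementToken(), drain the stream, then call end() and close(). Here is a minimal self-contained sketch of that contract (assuming Lucene 5.x or later, where StandardAnalyzer takes no Version argument; the field name and input text are placeholders):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetContractDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("field", "some text to tokenize")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                    // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end();                      // records end-of-stream state such as the final offset
        }                                      // try-with-resources calls close()
    }
}

Skipping reset() typically fails at runtime with a "TokenStream contract violation" IllegalStateException, which is why every example below calls it before the token loop.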
From source file: org.index.Tag.java

private String getBagOfWords(String text) throws Exception {
    StringBuffer buff = new StringBuffer();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new SOAnalyzer(toStem, stopFile);
    TokenStream stream = analyzer.tokenStream("bow", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
From source file: org.index.TermScore.java

private List<String> getBagOfWords(String text) throws Exception {
    List<String> terms = new ArrayList<>();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_4_9); // was: new SOAnalyzer(toStem, stopFile)
    TokenStream stream = analyzer.tokenStream("bow", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        terms.add(term);
    }
    stream.end();
    stream.close();
    return terms;
}
From source file: org.LexGrid.LexBIG.Impl.Extensions.GenericExtensions.search.SearchExtensionImpl.java
License: Open Source License

public List<String> tokenize(Analyzer analyzer, String field, String keywords) throws IOException {
    List<String> result = new ArrayList<String>();
    StringReader reader = new StringReader(keywords);
    TokenStream stream = analyzer.tokenStream(field, reader);
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end();
    } finally {
        stream.close(); // close once, in finally, so it also runs on failure
    }
    return result;
}
From source file: org.meresco.lucene.analysis.MerescoStandardAnalyzer.java
License: Open Source License

public static List<String> readTokenStream(TokenStream tok) throws IOException {
    List<String> terms = new ArrayList<String>();
    CharTermAttribute termAtt = tok.addAttribute(CharTermAttribute.class);
    try {
        tok.reset();
        while (tok.incrementToken()) {
            terms.add(termAtt.toString());
        }
        tok.end();
    } finally {
        tok.close();
    }
    return terms;
}
From source file: org.meresco.lucene.suggestion.SuggestionIndex.java
License: Open Source License

public List<String> shingles(String s) throws IOException {
    List<String> shingles = new ArrayList<String>();
    TokenStream stream = this.shingleAnalyzer.tokenStream("ignored", s);
    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        shingles.add(termAttribute.toString());
    }
    stream.end(); // complete the stream contract before close()
    stream.close();
    return shingles;
}
From source file: org.meresco.lucene.suggestion.SuggestionNGramIndex.java
License: Open Source License

public static List<String> ngrams(String s, Boolean trigram) throws IOException {
    List<String> ngram = new ArrayList<String>();
    Analyzer ngramAnalyzer = trigram ? TRIGRAM_ANALYZER : BIGRAM_ANALYZER;
    TokenStream stream = ngramAnalyzer.tokenStream("ignored", s);
    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        ngram.add(termAttribute.toString());
    }
    stream.end(); // complete the stream contract before close()
    stream.close();
    return ngram;
}
From source file: org.nuxeo.ecm.platform.categorization.categorizer.tfidf.TfIdfCategorizer.java
License: Open Source License

public List<String> tokenize(String textContent) {
    try {
        List<String> terms = new ArrayList<String>();
        TokenStream tokenStream = getAnalyzer().tokenStream(null, textContent);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        return terms;
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
From source file: org.omegat.tokenizer.BaseTokenizer.java
License: Open Source License

protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
        final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    List<Token> result = new ArrayList<Token>(64);
    final TokenStream in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset()));
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new Token[result.size()]);
}
From source file: org.omegat.tokenizer.BaseTokenizer.java
License: Open Source License

protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
        boolean filterDigits, boolean filterWhitespace) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }
    List<String> result = new ArrayList<String>(64);
    final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    Locale loc = stemsAllowed ? getLanguage().getLocale() : null;
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(tokenText);
                if (stemsAllowed) {
                    String origText = str.substring(off.startOffset(), off.endOffset());
                    if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                        result.add(origText);
                    }
                }
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new String[result.size()]);
}
From source file: org.openedit.data.lucene.AnalyzingQueryParserWithStop.java
License: Apache License

/**
 * Returns the analyzed form for the given chunk.
 *
 * If the analyzer produces more than one output token from the given chunk,
 * a ParseException is thrown.
 *
 * @param field The target field
 * @param termStr The full term from which the given chunk is excerpted
 * @param chunk The portion of the given termStr to be analyzed
 * @return The result of analyzing the given chunk
 * @throws ParseException when analysis returns other than one output token
 */
protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException {
    String analyzed = null;
    TokenStream stream = null;
    try {
        stream = getAnalyzer().tokenStream(field, chunk);
        stream.reset();
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        // get first and hopefully only output token
        if (stream.incrementToken()) {
            analyzed = termAtt.toString();
            // try to increment again; there should only be one output token
            StringBuilder multipleOutputs = null;
            while (stream.incrementToken()) {
                if (null == multipleOutputs) {
                    multipleOutputs = new StringBuilder();
                    multipleOutputs.append('"');
                    multipleOutputs.append(analyzed);
                    multipleOutputs.append('"');
                }
                multipleOutputs.append(',');
                multipleOutputs.append('"');
                multipleOutputs.append(termAtt.toString());
                multipleOutputs.append('"');
            }
            stream.end();
            if (null != multipleOutputs) {
                throw new ParseException(String.format(getLocale(),
                        "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
            }
        } else {
            // nothing returned by analyzer. Was it a stop word and the user accidentally
            // used an analyzer with stop words?
            stream.end();
            // Need to just ignore this
            return null;
            // throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
        }
    } catch (IOException e) {
        throw new ParseException(
                String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
    return analyzed;
}
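The examples above all create a fresh stream per call. reset() also matters when a Tokenizer instance is reused across inputs: new input is supplied with setReader(), and reset() is then called again for the new stream. A hypothetical sketch of that reuse cycle (again assuming Lucene 5.x or later; WhitespaceTokenizer and the sample inputs are arbitrary choices for illustration):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizerReuseDemo {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tok = new WhitespaceTokenizer();
        CharTermAttribute termAtt = tok.addAttribute(CharTermAttribute.class);
        for (String text : new String[] { "first input", "second input" }) {
            tok.setReader(new StringReader(text)); // new input must be set before reset()
            tok.reset();                           // starts the new stream
            while (tok.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            tok.end();
            tok.close();                           // required before the next setReader() call
        }
    }
}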