List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException

This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API). It can be used to perform any end-of-stream operations, such as setting the final offset of a stream.
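Before the individual examples, a minimal self-contained sketch of the canonical consumer workflow (reset, incrementToken, end, close). The StandardAnalyzer, field name, and sample text are placeholders, and the sketch assumes a Lucene version (5.x or later) whose StandardAnalyzer has a no-argument constructor:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamEndSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("body", "some example text")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                  // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();                    // records end-of-stream state, e.g. the final offset
        }                                    // try-with-resources closes the stream and analyzer
    }
}

The examples below all follow this pattern; they differ mainly in how they handle exceptions and whether close() is guaranteed by a finally block.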
From source file:org.index.TermScore.java
private List<String> getBagOfWords(String text) throws Exception {
    List<String> terms = new ArrayList<>();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    // The custom analyzer is commented out in the source (was: new SOAnalyzer(toStem, stopFile)),
    // so toStem and stopFile are currently unused and a plain WhitespaceAnalyzer is used instead.
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_4_9);
    TokenStream stream = analyzer.tokenStream("bow", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        terms.add(term);
    }
    stream.end();
    stream.close();
    return terms;
}
From source file:org.meresco.lucene.analysis.MerescoStandardAnalyzer.java
License:Open Source License
public static List<String> readTokenStream(TokenStream tok) throws IOException {
    List<String> terms = new ArrayList<String>();
    CharTermAttribute termAtt = tok.addAttribute(CharTermAttribute.class);
    try {
        tok.reset();
        while (tok.incrementToken()) {
            terms.add(termAtt.toString());
        }
        tok.end();
    } finally {
        tok.close();
    }
    return terms;
}
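A hypothetical call to this helper (analyzer, field name, and text are illustrative). It accepts any TokenStream, and the helper itself takes care of reset(), end(), and close():

Analyzer analyzer = new StandardAnalyzer();
List<String> terms = MerescoStandardAnalyzer.readTokenStream(
        analyzer.tokenStream("field", "some example text"));
// terms now holds one entry per token; the stream is already closed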
From source file:org.nuxeo.ecm.platform.categorization.categorizer.tfidf.TfIdfCategorizer.java
License:Open Source License
public List<String> tokenize(String textContent) {
    try {
        List<String> terms = new ArrayList<String>();
        TokenStream tokenStream = getAnalyzer().tokenStream(null, textContent);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        return terms;
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
From source file:org.omegat.tokenizer.BaseTokenizer.java
License:Open Source License
protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
        final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    List<Token> result = new ArrayList<Token>(64);
    final TokenStream in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset()));
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new Token[result.size()]);
}
From source file:org.omegat.tokenizer.BaseTokenizer.java
License:Open Source License
protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
        boolean filterDigits, boolean filterWhitespace) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }
    List<String> result = new ArrayList<String>(64);
    final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    Locale loc = stemsAllowed ? getLanguage().getLocale() : null;
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(tokenText);
                if (stemsAllowed) {
                    String origText = str.substring(off.startOffset(), off.endOffset());
                    if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                        result.add(origText);
                    }
                }
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new String[result.size()]);
}
From source file:org.openedit.data.lucene.AnalyzingQueryParserWithStop.java
License:Apache License
/**
 * Returns the analyzed form for the given chunk.
 *
 * If the analyzer produces more than one output token from the given chunk,
 * a ParseException is thrown.
 *
 * @param field The target field
 * @param termStr The full term from which the given chunk is excerpted
 * @param chunk The portion of the given termStr to be analyzed
 * @return The result of analyzing the given chunk
 * @throws ParseException when analysis returns other than one output token
 */
protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException {
    String analyzed = null;
    TokenStream stream = null;
    try {
        stream = getAnalyzer().tokenStream(field, chunk);
        stream.reset();
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        // get first and hopefully only output token
        if (stream.incrementToken()) {
            analyzed = termAtt.toString();
            // try to increment again, there should only be one output token
            StringBuilder multipleOutputs = null;
            while (stream.incrementToken()) {
                if (null == multipleOutputs) {
                    multipleOutputs = new StringBuilder();
                    multipleOutputs.append('"');
                    multipleOutputs.append(analyzed);
                    multipleOutputs.append('"');
                }
                multipleOutputs.append(',');
                multipleOutputs.append('"');
                multipleOutputs.append(termAtt.toString());
                multipleOutputs.append('"');
            }
            stream.end();
            if (null != multipleOutputs) {
                throw new ParseException(String.format(getLocale(),
                        "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
            }
        } else {
            // nothing returned by analyzer. Was it a stop word and the user accidentally
            // used an analyzer with stop words?
            stream.end();
            // Need to just ignore this
            return null;
            //throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
        }
    } catch (IOException e) {
        throw new ParseException(
                String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
    return analyzed;
}
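The contract above can be illustrated with two hypothetical calls, written as if from a subclass (the method is protected, and the described behavior assumes an analyzer configured with English stop words, such as a stop-filtering StandardAnalyzer):

// A stop word may produce no output token: the method deliberately returns null.
String gone = analyzeSingleChunk("body", "the", "the");   // expected: null, not an exception
// A chunk the analyzer splits into several tokens (e.g. "wi-fi") is reported as an error:
try {
    analyzeSingleChunk("body", "wi-fi", "wi-fi");
} catch (ParseException expected) {
    // "Analyzer created multiple terms" is raised for the chunk
}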
From source file:org.opengrok.web.api.v1.suggester.query.SuggesterQueryParser.java
License:Open Source License
private static List<String> getAllTokens(final Analyzer analyzer, final String field, final String text) {
    List<String> tokens = new LinkedList<>();
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(field, text);
        CharTermAttribute attr = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(attr.toString());
        }
    } catch (IOException e) {
        logger.log(Level.WARNING, "Could not analyze query text", e);
    } finally {
        try {
            if (ts != null) {
                ts.end();
                ts.close();
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "Could not close token stream", e);
        }
    }
    return tokens;
}
From source file:org.pageseeder.flint.lucene.query.Queries.java
License:Apache License
/**
 * Adds the terms produced by the analyzer for the given field and text to a phrase query,
 * using the position increment attribute to preserve term positions.
 *
 * @param field The field
 * @param text The text to analyze
 * @param analyzer The analyzer
 * @param phrase The phrase query to add the terms to
 */
private static void addTermsToPhrase(String field, String text, Analyzer analyzer, PhraseQuery phrase) {
    try {
        TokenStream stream = analyzer.tokenStream(field, text);
        PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += increment.getPositionIncrement();
            Term term = new Term(field, attribute.toString());
            phrase.add(term, position);
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
}
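The helper above targets the older mutable PhraseQuery. In newer Lucene releases PhraseQuery is immutable and built through PhraseQuery.Builder; a sketch of the same position-aware loop against that API (field, text, and analyzer are placeholders):

PhraseQuery.Builder builder = new PhraseQuery.Builder();
try (TokenStream stream = analyzer.tokenStream(field, text)) {
    PositionIncrementAttribute inc = stream.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    int position = -1;
    stream.reset();
    while (stream.incrementToken()) {
        position += inc.getPositionIncrement(); // preserves holes left by removed stop words
        builder.add(new Term(field, term.toString()), position);
    }
    stream.end();
}
PhraseQuery phrase = builder.build();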
From source file:org.pageseeder.flint.lucene.query.Queries.java
License:Apache License
private static boolean isTokenized(String field, Analyzer analyzer) {
    // try to load terms for a phrase and return true if more than one term
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(field, "word1 word2");
        stream.reset();
        if (stream.incrementToken()) {
            return stream.incrementToken();
        }
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    } finally {
        if (stream != null) {
            try {
                stream.end();
                stream.close();
            } catch (IOException ex) {
                // Should not occur since we use a StringReader
                ex.printStackTrace();
            }
        }
    }
    return false;
}
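What the probe is expected to report for two stock analyzers (illustrative only; the method is private, so calls like these would live inside Queries itself):

// KeywordAnalyzer emits the whole input as a single token.
boolean keyword = isTokenized("id", new KeywordAnalyzer());     // expected: false
// StandardAnalyzer splits "word1 word2" into two tokens.
boolean standard = isTokenized("body", new StandardAnalyzer()); // expected: true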
From source file:org.pageseeder.flint.lucene.search.Fields.java
License:Apache License
/**
 * Returns the terms for a field.
 *
 * @param field The field
 * @param text The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 */
public static List<String> toTerms(String field, String text, Analyzer analyzer) {
    List<String> terms = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = attribute.toString();
            terms.add(term);
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
    return terms;
}
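A hypothetical call, assuming a StandardAnalyzer (which lowercases and splits on word boundaries); the field name and text are illustrative:

List<String> terms = Fields.toTerms("title", "Hello World", new StandardAnalyzer());
// expected result with StandardAnalyzer: [hello, world]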