Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis.TokenStream#incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (e.g., IndexWriter) use this method to advance the stream to the next token. It returns true while more tokens are available and false once the end of the stream has been reached.
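
The full consumer contract is reset(), then the incrementToken() loop, then end() and close(). Below is a minimal self-contained sketch of that workflow, assuming a recent Lucene release (5.x or later, where StandardAnalyzer takes no Version argument); the field name and sample text are placeholders. Calling incrementToken() before reset() typically fails with an IllegalStateException on modern token streams, which is why the examples below call reset() first.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources works because TokenStream implements Closeable
        try (TokenStream ts = analyzer.tokenStream("body", "Hello Lucene token streams")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // required before the first incrementToken()
            while (ts.incrementToken()) { // returns false at the end of the stream
                System.out.println(term.toString());
            }
            ts.end();                     // finalizes offset/state for the stream
        }
    }
}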

Usage

From source file: de.ingrid.interfaces.csw.tools.LuceneTools.java

License: EUPL

/**
 * @param term the term to filter
 * @return filtered term
 * @throws IOException
 */
public String filterTerm(String term) throws IOException {
    StringBuilder result = new StringBuilder();

    // Always use the same analyzer, NOT a new instance! This is called in the mapping process!
    Analyzer myAnalyzer = getAnalyzer();
    TokenStream ts = myAnalyzer.tokenStream(null, new StringReader(term));
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);

    ts.reset(); // the TokenStream contract requires reset() before the first incrementToken()
    while (ts.incrementToken()) {
        result.append(' ').append(charTermAttribute.toString());
    }
    ts.end();
    ts.close();
    return result.toString().trim();
}

From source file: de.jetwick.es.JetwickQuery.java

License: Apache License

public Set<String> doSnowballStemming(TokenStream ts) {
    Set<String> res = new LinkedHashSet<String>();
    ts = new SnowballFilter(ts, "English");
    try {
        ts.reset();
        while (ts.incrementToken()) {
            res.add(ts.getAttribute(TermAttribute.class).term());
        }
    } catch (IOException ex) {
        logger.error("Exception while stemming to snowball", ex);
    }

    return res;
}
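
This snippet targets the pre-4.0 analysis API, where token text was read through TermAttribute. As a hedged sketch only (not part of the original source): on Lucene 4+ the same stemming loop would use CharTermAttribute and follow the reset()/end()/close() contract, roughly like this:

public Set<String> doSnowballStemming(TokenStream ts) throws IOException {
    Set<String> res = new LinkedHashSet<>();
    // SnowballFilter still lives in org.apache.lucene.analysis.snowball
    try (TokenStream stemmed = new SnowballFilter(ts, "English")) {
        CharTermAttribute term = stemmed.addAttribute(CharTermAttribute.class);
        stemmed.reset();
        while (stemmed.incrementToken()) {
            res.add(term.toString());
        }
        stemmed.end();
    }
    return res;
}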

From source file: de.mirkosertic.desktopsearch.SearchPhraseSuggester.java

License: Open Source License

private String analyze(String aFieldName, String aString) throws IOException {
    TokenStream theTokenStream = analyzer.tokenStream(aFieldName, aString);
    theTokenStream.reset();
    CharTermAttribute theCharTerms = theTokenStream.getAttribute(CharTermAttribute.class);
    try {
        if (theTokenStream.incrementToken()) {
            return theCharTerms.toString();
        }
        return null;
    } finally {
        theTokenStream.end();
        theTokenStream.close();
    }
}

From source file: de.powerstaff.business.service.impl.GoogleStyleQueryParser.java

License: Open Source License

protected void addWildcardOrTermQueries(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer)
        throws IOException {

    Query theTempQuery;

    TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm));
    theTokenStream.reset(); // honor the TokenStream contract before consuming
    while (theTokenStream.incrementToken()) {

        TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class);

        String theTokenText = theTermAttribute.term();

        if (isWildcardTerm(aTerm)) {
            theTempQuery = new WildcardQuery(new Term(aField, getCorrectedWildcardTerm(aTerm)));
        } else {
            theTempQuery = new TermQuery(new Term(aField, theTokenText));
        }
        aQuery.add(theTempQuery, Occur.MUST);
    }
    theTokenStream.end();
    theTokenStream.close();
}
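
For comparison, here is a hedged sketch (not from the original project) of the same loop against Lucene 5+, where BooleanQuery is immutable and assembled through a BooleanQuery.Builder and TermAttribute has been replaced by CharTermAttribute; isWildcardTerm and getCorrectedWildcardTerm are the helper methods of the original class:

protected Query buildWildcardOrTermQuery(String aTerm, String aField, Analyzer aAnalyzer) throws IOException {
    BooleanQuery.Builder theBuilder = new BooleanQuery.Builder();
    try (TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm))) {
        CharTermAttribute theTermAttribute = theTokenStream.addAttribute(CharTermAttribute.class);
        theTokenStream.reset();
        while (theTokenStream.incrementToken()) {
            Query theTempQuery;
            if (isWildcardTerm(aTerm)) {
                theTempQuery = new WildcardQuery(new Term(aField, getCorrectedWildcardTerm(aTerm)));
            } else {
                theTempQuery = new TermQuery(new Term(aField, theTermAttribute.toString()));
            }
            theBuilder.add(theTempQuery, Occur.MUST);
        }
        theTokenStream.end();
    }
    return theBuilder.build();
}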

From source file: de.powerstaff.business.service.impl.GoogleStyleQueryParser.java

License: Open Source License

protected void addPhraseQuery(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer)
        throws IOException {

    MultiPhraseQuery thePhraseQuery = new MultiPhraseQuery();

    TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm));
    theTokenStream.reset();
    while (theTokenStream.incrementToken()) {

        TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class);

        String theTokenText = theTermAttribute.term();

        Term theTerm = new Term(aField, theTokenText);

        if (!isWildcardTerm(theTokenText)) {
            thePhraseQuery.add(theTerm);
        } else {
            Term theWildcardTerm = new Term(theTerm.field(), getCorrectedWildcardTerm(theTerm.text()));
            WildcardTermEnum theEnum = new WildcardTermEnum(reader, theWildcardTerm);
            try {
                List<Term> theTerms = new ArrayList<Term>();
                // Guard against an empty enumeration: term() is null when nothing matches the wildcard.
                while (theEnum.term() != null) {
                    theTerms.add(theEnum.term());
                    if (!theEnum.next()) {
                        break;
                    }
                }
                if (!theTerms.isEmpty()) {
                    thePhraseQuery.add(theTerms.toArray(new Term[0]));
                }
            } finally {
                theEnum.close();
            }
        }
    }
    theTokenStream.end();
    theTokenStream.close();

    aQuery.add(thePhraseQuery, Occur.MUST);
}

From source file: de.twitterlivesearch.analysis.Tokenizer.java

License: Apache License

/**
 * @param stringToAnalyze
 *            String to be tokenized
 * @param analyzer
 *            {@link org.apache.lucene.analysis.Analyzer Analyzer} to be used
 *            for analysis
 *
 * @return list of tokens
 */
public static List<String> getTokensForString(String stringToAnalyze, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(stringToAnalyze));
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return tokens;
}

From source file: de.unidue.inf.is.ezdl.dlcore.data.extractor.TermExtractor.java

License: Open Source License

/**
 * Splits the information, because in the sense of a term it is a standalone word.
 * TODO: this method removes stopwords but does not detect any phrases.
 *
 * @param result
 *            the list to which the items are appended
 * @param item
 *            the item itself
 */
private void add(ExtractionResultImpl result, String item) {
    if (item != null) {
        inferLanguage(item);
        List<String> terms = new ArrayList<String>();

        TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_30, new StringReader(item));
        // OffsetAttribute offsetAttribute =
        // tokenStream.getAttribute(OffsetAttribute.class);
        TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);

        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                // int startOffset = offsetAttribute.startOffset();
                // int endOffset = offsetAttribute.endOffset();
                String term = termAttribute.term();
                terms.add(term);
            }
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        }

        terms = filter.filter(terms, locale);

        for (String t : terms) {
            if (!StringUtils.isEmpty(t)) {
                Entry e = new EntryImpl(t.toLowerCase(locale));
                result.add(e);
            }
        }
    }
}
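
On Lucene 5+ the tokenizer setup above changes as well: StandardTokenizer no longer takes a Version or a Reader, and input is supplied through setReader(). A hedged sketch of the equivalent tokenize-and-collect step, reusing the item and terms variables from the method above:

List<String> terms = new ArrayList<>();
try (StandardTokenizer tokenizer = new StandardTokenizer()) {
    tokenizer.setReader(new StringReader(item));
    CharTermAttribute termAttribute = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        terms.add(termAttribute.toString());
    }
    tokenizer.end();
}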

From source file: de.uni_koeln.spinfo.maalr.lucene.util.TokenizerHelper.java

License: Apache License

public static String tokenizeString(Analyzer analyzer, String string) {
    // Inspired by stackoverflow:
    // http://stackoverflow.com/questions/6334692/how-to-use-a-lucene-analyzer-to-tokenize-a-string
    StringBuilder builder = new StringBuilder();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            builder.append(stream.getAttribute(CharTermAttribute.class).toString());
            builder.append(" ");
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // not thrown because we're using a StringReader
        throw new RuntimeException(e);
    }
    return builder.toString().trim();
}

From source file: dependencies.ReviewDependencyAnalyzer.java

License: Open Source License

public ArrayList<ArrayList<Token>> getSentences(Reader reader) {

    try {
        // Send reader data through the analyzer
        TokenStream tokstr = reusableTokenStream("", reader);
        TermAttribute tok_term = tokstr.getAttribute(TermAttribute.class);
        TypeAttribute tok_type = tokstr.getAttribute(TypeAttribute.class);
        FlagsAttribute tok_flags = tokstr.getAttribute(FlagsAttribute.class);
        PayloadAttribute tok_payload = tokstr.getAttribute(PayloadAttribute.class);

        // Split the token stream returned by the analyzer into sentences. Convert each sentence
        // into a linked list of tokens
        ArrayList<ArrayList<Token>> sentence_list = new ArrayList<ArrayList<Token>>();
        ArrayList<Token> current_sentence = new ArrayList<Token>();

        tokstr.reset();
        while (tokstr.incrementToken()) {
            Token current_token = new Token(tok_term.term(), tok_type.type(), tok_flags.getFlags(),
                    new ReviewTermPayload(tok_payload.getPayload()));
            current_sentence.add(current_token);

            // End of sentence reached. Add current sentence to the sentence list
            if (current_token.isDelim(true)) {
                if (current_sentence.size() > 1) {
                    sentence_list.add(current_sentence);
                }
                current_sentence = new ArrayList<Token>();
            }
        }

        // At the end of the token stream, if there is an incomplete sentence, add it to the
        // sentence list.
        // This case could occur when the last sentence of a given passage does not end with a
        // period or other sentence delimiter.
        if (!current_sentence.isEmpty()) {
            sentence_list.add(current_sentence);
        }

        return sentence_list;
    } catch (IOException e) {
        AppLogger.error.log(Level.SEVERE,
                "Error reading data from reader. Analyzing text for typed dependencies could not be completed");
        return null;
    }
}

From source file: di.uniba.it.tri.occ.BuildOccurrence.java

License: Open Source License

private List<String> getTokens(Reader reader) throws IOException {
    List<String> tokens = new ArrayList<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    TokenStream tokenStream = analyzer.tokenStream("text", reader);
    tokenStream.reset();
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        String token = cattr.toString();
        String[] split = token.split("'");
        if (split.length == 1) {
            tokens.add(token);
        } else {
            int max = 0;
            int index = 0;
            for (int i = 0; i < split.length; i++) {
                if (split[i].length() > max) {
                    max = split[i].length();
                    index = i;
                }
            }
            tokens.add(split[index]);
        }
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}
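
A hypothetical call site (not part of the original class) illustrating the apostrophe handling: StandardTokenizer keeps apostrophe-joined words as single tokens, so the keep-the-longest-fragment logic above reduces, for example, "dell'informazione" to "informazione".

// Assumed usage inside the same class; the input text is an illustration only.
List<String> tokens = getTokens(new StringReader("l'analisi dell'informazione"));
System.out.println(tokens); // roughly: [analisi, informazione]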