Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis.TokenStream#incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (e.g., IndexWriter) use this method to advance the stream to the next token. It returns true while more tokens are available and false once the end of the stream has been reached.
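
The full consumer contract is reset(), then the incrementToken() loop, then end() and close(). Below is a minimal self-contained sketch of that workflow, assuming a recent Lucene release (5.x or later, where StandardAnalyzer takes no Version argument); the field name and sample text are placeholders. Calling incrementToken() before reset() typically fails with an IllegalStateException on modern token streams, which is why the examples below call reset() first.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources works because TokenStream implements Closeable
        try (TokenStream ts = analyzer.tokenStream("body", "Hello Lucene token streams")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // required before the first incrementToken()
            while (ts.incrementToken()) { // returns false at the end of the stream
                System.out.println(term.toString());
            }
            ts.end();                     // finalizes offset/state for the stream
        }
    }
}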

Usage

From source file: de.ingrid.interfaces.csw.tools.LuceneTools.java

License: EUPL

/**
 * @param term the term to filter
 * @return filtered term
 * @throws IOException
 */
public String filterTerm(String term) throws IOException {
    StringBuilder result = new StringBuilder();

    // Always use the same analyzer, NOT a new instance! This is called in the mapping process!
    Analyzer myAnalyzer = getAnalyzer();
    TokenStream ts = myAnalyzer.tokenStream(null, new StringReader(term));
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);

    ts.reset(); // the TokenStream contract requires reset() before the first incrementToken()
    while (ts.incrementToken()) {
        result.append(' ').append(charTermAttribute.toString());
    }
    ts.end();
    ts.close();
    return result.toString().trim();
}

From source file: de.jetwick.es.JetwickQuery.java

License: Apache License

public Set<String> doSnowballStemming(TokenStream ts) {
    Set<String> res = new LinkedHashSet<String>();
    ts = new SnowballFilter(ts, "English");
    try {
        ts.reset();
        while (ts.incrementToken()) {
            res.add(ts.getAttribute(TermAttribute.class).term());
        }
    } catch (IOException ex) {
        logger.error("Exception while stemming to snowball", ex);
    }

    return res;
}
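
This snippet targets the pre-4.0 analysis API, where token text was read through TermAttribute. As a hedged sketch only (not part of the original source): on Lucene 4+ the same stemming loop would use CharTermAttribute and follow the reset()/end()/close() contract, roughly like this:

public Set<String> doSnowballStemming(TokenStream ts) throws IOException {
    Set<String> res = new LinkedHashSet<>();
    // SnowballFilter still lives in org.apache.lucene.analysis.snowball
    try (TokenStream stemmed = new SnowballFilter(ts, "English")) {
        CharTermAttribute term = stemmed.addAttribute(CharTermAttribute.class);
        stemmed.reset();
        while (stemmed.incrementToken()) {
            res.add(term.toString());
        }
        stemmed.end();
    }
    return res;
}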

From source file: de.mirkosertic.desktopsearch.SearchPhraseSuggester.java

License: Open Source License

private String analyze(String aFieldName, String aString) throws IOException {
    TokenStream theTokenStream = analyzer.tokenStream(aFieldName, aString);
    theTokenStream.reset();
    CharTermAttribute theCharTerms = theTokenStream.getAttribute(CharTermAttribute.class);
    try {
        if (theTokenStream.incrementToken()) {
            return theCharTerms.toString();
        }
        return null;
    } finally {
        theTokenStream.end();
        theTokenStream.close();
    }
}

From source file: de.powerstaff.business.service.impl.GoogleStyleQueryParser.java

License: Open Source License

protected void addWildcardOrTermQueries(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer)
        throws IOException {

    Query theTempQuery;

    TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm));
    theTokenStream.reset(); // honor the TokenStream contract before consuming
    while (theTokenStream.incrementToken()) {

        TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class);

        String theTokenText = theTermAttribute.term();

        if (isWildcardTerm(aTerm)) {
            theTempQuery = new WildcardQuery(new Term(aField, getCorrectedWildcardTerm(aTerm)));
        } else {
            theTempQuery = new TermQuery(new Term(aField, theTokenText));
        }
        aQuery.add(theTempQuery, Occur.MUST);
    }
    theTokenStream.end();
    theTokenStream.close();
}
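
For comparison, here is a hedged sketch (not from the original project) of the same loop against Lucene 5+, where BooleanQuery is immutable and assembled through a BooleanQuery.Builder and TermAttribute has been replaced by CharTermAttribute; isWildcardTerm and getCorrectedWildcardTerm are the helper methods of the original class:

protected Query buildWildcardOrTermQuery(String aTerm, String aField, Analyzer aAnalyzer) throws IOException {
    BooleanQuery.Builder theBuilder = new BooleanQuery.Builder();
    try (TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm))) {
        CharTermAttribute theTermAttribute = theTokenStream.addAttribute(CharTermAttribute.class);
        theTokenStream.reset();
        while (theTokenStream.incrementToken()) {
            Query theTempQuery;
            if (isWildcardTerm(aTerm)) {
                theTempQuery = new WildcardQuery(new Term(aField, getCorrectedWildcardTerm(aTerm)));
            } else {
                theTempQuery = new TermQuery(new Term(aField, theTermAttribute.toString()));
            }
            theBuilder.add(theTempQuery, Occur.MUST);
        }
        theTokenStream.end();
    }
    return theBuilder.build();
}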

From source file: de.powerstaff.business.service.impl.GoogleStyleQueryParser.java

License: Open Source License

protected void addPhraseQuery(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer)
        throws IOException {

    MultiPhraseQuery thePhraseQuery = new MultiPhraseQuery();

    TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm));
    theTokenStream.reset();
    while (theTokenStream.incrementToken()) {

        TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class);

        String theTokenText = theTermAttribute.term();

        Term theTerm = new Term(aField, theTokenText);

        if (!isWildcardTerm(theTokenText)) {
            thePhraseQuery.add(theTerm);
        } else {
            Term theWildcardTerm = new Term(theTerm.field(), getCorrectedWildcardTerm(theTerm.text()));
            WildcardTermEnum theEnum = new WildcardTermEnum(reader, theWildcardTerm);
            try {
                List<Term> theTerms = new ArrayList<Term>();
                // Guard against an empty enumeration: term() is null when nothing matches the wildcard.
                while (theEnum.term() != null) {
                    theTerms.add(theEnum.term());
                    if (!theEnum.next()) {
                        break;
                    }
                }
                if (!theTerms.isEmpty()) {
                    thePhraseQuery.add(theTerms.toArray(new Term[0]));
                }
            } finally {
                theEnum.close();
            }
        }
    }
    theTokenStream.end();
    theTokenStream.close();

    aQuery.add(thePhraseQuery, Occur.MUST);
}

From source file: de.twitterlivesearch.analysis.Tokenizer.java

License: Apache License

/**
 * @param stringToAnalyze
 *            String to be tokenized
 * @param analyzer
 *            {@link org.apache.lucene.analysis.Analyzer Analyzer} to be used
 *            for analysis
 *
 * @return list of tokens
 */
public static List<String> getTokensForString(String stringToAnalyze, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(stringToAnalyze));
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return tokens;
}

From source file: de.unidue.inf.is.ezdl.dlcore.data.extractor.TermExtractor.java

License: Open Source License

/**
 * Splits the information, because in the sense of a term it is a standalone word.
 * TODO: this method removes stopwords but does not detect any phrases.
 *
 * @param result
 *            the list to which the items are appended
 * @param item
 *            the item itself
 */
private void add(ExtractionResultImpl result, String item) {
    if (item != null) {
        inferLanguage(item);
        List<String> terms = new ArrayList<String>();

        TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_30, new StringReader(item));
        // OffsetAttribute offsetAttribute =
        // tokenStream.getAttribute(OffsetAttribute.class);
        TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);

        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                // int startOffset = offsetAttribute.startOffset();
                // int endOffset = offsetAttribute.endOffset();
                String term = termAttribute.term();
                terms.add(term);
            }
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        }

        terms = filter.filter(terms, locale);

        for (String t : terms) {
            if (!StringUtils.isEmpty(t)) {
                Entry e = new EntryImpl(t.toLowerCase(locale));
                result.add(e);
            }
        }
    }
}
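
On Lucene 5+ the tokenizer setup above changes as well: StandardTokenizer no longer takes a Version or a Reader, and input is supplied through setReader(). A hedged sketch of the equivalent tokenize-and-collect step, reusing the item and terms variables from the method above:

List<String> terms = new ArrayList<>();
try (StandardTokenizer tokenizer = new StandardTokenizer()) {
    tokenizer.setReader(new StringReader(item));
    CharTermAttribute termAttribute = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        terms.add(termAttribute.toString());
    }
    tokenizer.end();
}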

From source file: de.uni_koeln.spinfo.maalr.lucene.util.TokenizerHelper.java

License: Apache License

public static String tokenizeString(Analyzer analyzer, String string) {
    // Inspired by stackoverflow:
    // http://stackoverflow.com/questions/6334692/how-to-use-a-lucene-analyzer-to-tokenize-a-string
    StringBuilder builder = new StringBuilder();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            builder.append(stream.getAttribute(CharTermAttribute.class).toString());
            builder.append(" ");
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // not thrown because we're using a StringReader
        throw new RuntimeException(e);
    }
    return builder.toString().trim();
}

From source file: dependencies.ReviewDependencyAnalyzer.java

License: Open Source License

public ArrayList<ArrayList<Token>> getSentences(Reader reader) {

    try {
        // Send reader data through the analyzer
        TokenStream tokstr = reusableTokenStream("", reader);
        TermAttribute tok_term = tokstr.getAttribute(TermAttribute.class);
        TypeAttribute tok_type = tokstr.getAttribute(TypeAttribute.class);
        FlagsAttribute tok_flags = tokstr.getAttribute(FlagsAttribute.class);
        PayloadAttribute tok_payload = tokstr.getAttribute(PayloadAttribute.class);

        // Split the token stream returned by the analyzer into sentences. Convert each sentence
        // into a linked list of tokens
        ArrayList<ArrayList<Token>> sentence_list = new ArrayList<ArrayList<Token>>();
        ArrayList<Token> current_sentence = new ArrayList<Token>();

        tokstr.reset();
        while (tokstr.incrementToken()) {
            Token current_token = new Token(tok_term.term(), tok_type.type(), tok_flags.getFlags(),
                    new ReviewTermPayload(tok_payload.getPayload()));
            current_sentence.add(current_token);

            // End of sentence reached. Add current sentence to the sentence list
            if (current_token.isDelim(true)) {
                if (current_sentence.size() > 1) {
                    sentence_list.add(current_sentence);
                }
                current_sentence = new ArrayList<Token>();
            }
        }

        // At the end of the token stream, if there is an incomplete sentence, add it to the
        // sentence list.
        // This case could occur when the last sentence of a given passage does not end with a
        // period or other sentence delimiter.
        if (!current_sentence.isEmpty()) {
            sentence_list.add(current_sentence);
        }

        return sentence_list;
    } catch (IOException e) {
        AppLogger.error.log(Level.SEVERE,
                "Error reading data from reader. Analyzing text for typed dependencies could not be completed");
        return null;
    }
}

From source file: di.uniba.it.tri.occ.BuildOccurrence.java

License: Open Source License

private List<String> getTokens(Reader reader) throws IOException {
    List<String> tokens = new ArrayList<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    TokenStream tokenStream = analyzer.tokenStream("text", reader);
    tokenStream.reset();
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        String token = cattr.toString();
        String[] split = token.split("'");
        if (split.length == 1) {
            tokens.add(token);
        } else {
            int max = 0;
            int index = 0;
            for (int i = 0; i < split.length; i++) {
                if (split[i].length() > max) {
                    max = split[i].length();
                    index = i;
                }
            }
            tokens.add(split[index]);
        }
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}
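
A hypothetical call site (not part of the original class) illustrating the apostrophe handling: StandardTokenizer keeps apostrophe-joined words as single tokens, so the keep-the-longest-fragment logic above reduces, for example, "dell'informazione" to "informazione".

// Assumed usage inside the same class; the input text is an illustration only.
List<String> tokens = getTokens(new StringReader("l'analisi dell'informazione"));
System.out.println(tokens); // roughly: [analisi, informazione]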