Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
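
A minimal consumer follows the standard workflow: obtain the attributes you need, call reset(), loop while incrementToken() returns true, then call end() and close(). The sketch below is illustrative, assuming a recent Lucene version in which Analyzer and TokenStream implement Closeable; the field name and input text are placeholders.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("field", "some text to tokenize")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            stream.end();                     // records end-of-stream state (e.g. final offset)
        }
    }
}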

Usage

From source file: me.smoe.adar.utils.cam.o.common.SentenceAnalyzer.java

License: Apache License

public static Set<String> analyzer(String sentence) throws Exception {
    if (StringUtils.isEmpty(sentence)) {
        return Collections.emptySet();
    }

    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new LinkedHashSet<>();
        while (tokenStream.incrementToken()) {
            String word = termAtt.toString(); // the attribute instance is reused for every token

            if (word.length() <= 1) {
                continue;
            }

            words.add(word);
        }
        tokenStream.end();   // finalize end-of-stream state
        tokenStream.close();

        return words;
    } finally {
        analyzer.close();
    }
}
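
A hypothetical call; the exact output depends on the StandardAnalyzer's default stopword set, and single-character tokens are always dropped by the method itself:

Set<String> words = SentenceAnalyzer.analyzer("Lucene is a search library");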

From source file: modnlp.idx.inverted.TokeniserJPLucene.java

License: Open Source License

public void tokenise() throws IOException {
    String ignregexp = "--+|\\.\\.+|\\.+\\p{Space}"; // drop dashes and runs of full stops (typically not indexed)
    if (ignoredElements != null && ignoredElements.length() > 0)
        ignregexp = ignregexp + "|< *" + ignoredElements + "[^>]*?/>" + "|< *" + ignoredElements + ".*?>.*?</"
                + ignoredElements + " *>";
    if (!tagIndexing)
        ignregexp = ignregexp + "|<.*?>";
    //ignregexp = ignregexp+"|\\W\\W+";

    Pattern p = Pattern.compile(ignregexp);
    Matcher igns = p.matcher(originalText);

    StringBuffer tx = new StringBuffer(originalText);
    int ct = 1;
    while (igns.find()) {
        int s = igns.start();
        int e = igns.end();
        if (verbose)
            PrintUtil.printNoMove("Processing exclusions ...", ct++);
        //System.err.println("replacing\n-----------"+originalText.substring(s,e)+"\n--------------");
        char sp[] = new char[e - s];
        for (int j = 0; j < sp.length; j++) {
            sp[j] = ' ';
        }
        tx.replace(s, e, new String(sp));
    }
    if (verbose)
        PrintUtil.donePrinting();
    ct = 1;
    //verbose = false;
    String text = new String(tx);
    //System.out.println("-->"+text+"<--");
    Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(text), null, true,
            org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
    stream = new CJKWidthFilter(stream);
    //stream = new StopFilter(matchVersion, stream, stopwords);
    stream = new JapaneseKatakanaStemFilter(stream);
    //stream = new LowerCaseFilter(matchVersion, stream);

    OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

    stream.reset(); // required before the first incrementToken() in recent Lucene versions
    while (stream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String token = charTermAttribute.toString();
        tokenMap.putPos(token, startOffset);
        //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
    }
    if (verbose)
        PrintUtil.donePrinting();
    ct = 1;
}

From source file: modnlp.idx.inverted.TokeniserJPLucene.java

License: Open Source License

public List<String> split(String s) {
    ArrayList<String> ret = new ArrayList<String>();
    try {
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(s), null, true,
                org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
        TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
        //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
        stream = new CJKWidthFilter(stream);
        //stream = new StopFilter(matchVersion, stream, stopwords);
        stream = new JapaneseKatakanaStemFilter(stream);
        //stream = new LowerCaseFilter(matchVersion, stream);

        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        stream.reset(); // required before the first incrementToken()
        while (stream.incrementToken()) {
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String token = charTermAttribute.toString();
            ret.add(token);
            //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
        }
    } catch (java.io.IOException e) {
        System.err.println(e);
    }
    return ret;
}

From source file: modnlp.idx.inverted.TokeniserJPLucene.java

License: Open Source License

public TokenIndex getTokenIndex(String str) {
    TokenIndex ret = new TokenIndex();
    try {
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(str), null, true,
                org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
        TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
        //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
        stream = new CJKWidthFilter(stream);
        //stream = new StopFilter(matchVersion, stream, stopwords);
        stream = new JapaneseKatakanaStemFilter(stream);
        //stream = new LowerCaseFilter(matchVersion, stream);

        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        stream.reset(); // required before the first incrementToken()
        while (stream.incrementToken()) {
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String token = charTermAttribute.toString();
            ret.add(startOffset, endOffset);
            //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
        }
    } catch (java.io.IOException e) {
        System.err.println(e);
    }
    return ret;
}
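
The three methods above build the same Japanese analysis chain from scratch each time. A sketch of factoring it into a shared helper, using only the filters already shown; the buildStream name is hypothetical:

private TokenStream buildStream(String text) {
    Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(text), null, true,
            org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    stream = new CJKWidthFilter(stream);            // normalize half/full-width characters
    return new JapaneseKatakanaStemFilter(stream);  // stem katakana terms
}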

From source file: mvm.rya.indexing.accumulo.freetext.LuceneTokenizer.java

License: Apache License

@Override
public SortedSet<String> tokenize(String string) {
    SortedSet<String> set = new TreeSet<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            set.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // not thrown because we are reading from a StringReader
        throw new RuntimeException(e);
    }

    return set;
}
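
A hypothetical usage, assuming a constructed LuceneTokenizer instance: the result is deduplicated and alphabetically ordered, because tokenize() collects terms into a TreeSet.

SortedSet<String> terms = tokenizer.tokenize("the quick brown fox, the lazy dog");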

From source file: net.mad.ads.server.utils.http.KeywordUtils.java

License: Open Source License

public static List<String> getTokens(String queryString) {
    try {
        GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_33);

        TokenStream ts = a.tokenStream("", new StringReader(queryString));

        List<String> tokens = new ArrayList<String>();

        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); // addAttribute is safe whether or not the attribute exists yet
        ts.reset();
        while (ts.incrementToken()) {
            String token = termAtt.toString();
            tokens.add(token);
        }
        ts.end();
        ts.close();

        return tokens;
    } catch (IOException e) {
        logger.error("", e);
    }
    return null;
}
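
A hypothetical call: the GermanAnalyzer lowercases, removes German stopwords such as "und", and stems the remaining tokens, so the exact output varies by Lucene version.

List<String> tokens = KeywordUtils.getTokens("Häuser und Gärten");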

From source file: net.sf.logsaw.index.internal.LuceneIndexServiceImpl.java

License: Open Source License

private void fillPhraseQuery(PhraseQuery phrase, Analyzer analyzer, String fld, String val) throws IOException {
    TokenStream ts = analyzer.tokenStream(fld, new StringReader(val));
    try {
        ts.reset();
        // Iterate over tokens and treat each token as term
        int pos = 0;
        while (ts.incrementToken()) {
            CharTermAttribute t = ts.getAttribute(CharTermAttribute.class);
            PositionIncrementAttribute p = ts.getAttribute(PositionIncrementAttribute.class);
            pos += p.getPositionIncrement();
            phrase.add(new Term(fld, t.toString()), pos - 1);
        }
        // End-of-stream clean-up
        ts.end();
    } finally {
        ts.close();
    }
}
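
The position arithmetic matters when the analyzer drops tokens: a stop filter reports the gap through getPositionIncrement(), so the resulting PhraseQuery only matches documents with the same spacing. A minimal sketch of calling the method from within the class, assuming an analyzer configured with English stopwords; the field name and text are placeholders:

PhraseQuery phrase = new PhraseQuery();
fillPhraseQuery(phrase, analyzer, "message", "out of the box");
// If "of" and "the" are stopwords, "out" and "box" keep a positional gap,
// so the phrase matches the original wording but not "out box".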

From source file: net.sf.mmm.search.engine.impl.lucene.LuceneFieldManagerImpl.java

License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public Term createTerm(String field, Object value) {

    NlsNullPointerException.checkNotNull("field", field);
    NlsNullPointerException.checkNotNull("value", value);
    String normalizedValue;
    SearchFieldConfiguration fieldConfiguration = getSearchFields().getOrCreateFieldConfiguration(field);
    SearchFieldType fieldType = fieldConfiguration.getType();
    boolean isString = (value instanceof String);
    try {
        switch (fieldType) {
        case TEXT:
            try {
                TokenStream tokenStream = this.analyzer.tokenStream(field, new StringReader((String) value));
                TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
                tokenStream.reset(); // part of the TokenStream workflow contract
                if (tokenStream.incrementToken()) {
                    normalizedValue = termAttribute.term();
                } else {
                    normalizedValue = "";
                }
            } catch (IOException e) {
                throw new RuntimeIoException(e, IoMode.READ);
            }
            break;
        case STRING:
            normalizedValue = (String) value;
            break;
        case INTEGER:
            int i;
            if (isString) {
                i = Integer.parseInt((String) value);
            } else {
                i = ((Integer) value).intValue();
            }
            normalizedValue = NumericUtils.intToPrefixCoded(i);
            break;
        case LONG:
            long l;
            if (isString) {
                l = Long.parseLong((String) value);
            } else {
                l = ((Long) value).longValue();
            }
            normalizedValue = NumericUtils.longToPrefixCoded(l);
            break;
        case FLOAT:
            float f;
            if (isString) {
                f = Float.parseFloat((String) value);
            } else {
                f = ((Float) value).floatValue();
            }
            normalizedValue = NumericUtils.floatToPrefixCoded(f);
            break;
        case DOUBLE:
            double d;
            if (isString) {
                d = Double.parseDouble((String) value);
            } else {
                d = ((Double) value).doubleValue();
            }
            normalizedValue = NumericUtils.doubleToPrefixCoded(d);
            break;
        case DATE:
            Date date;
            if (isString) {
                date = this.iso8601Util.parseDate((String) value);
            } else {
                date = (Date) value;
            }
            normalizedValue = NumericUtils.longToPrefixCoded(date.getTime());
            break;
        default:
            throw new IllegalCaseException(SearchFieldType.class, fieldType);
        }
    } catch (ClassCastException e) {
        throw new NlsClassCastException(e, value, fieldType.getFieldClass());
    }
    return new Term(field, normalizedValue);
}
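
A hypothetical invocation, assuming a configured LuceneFieldManagerImpl instance and fields registered with the corresponding SearchFieldType: numeric and date values are normalized into prefix-coded terms, while TEXT values are run through the analyzer first.

Term price = fieldManager.createTerm("price", Double.valueOf(9.99));  // prefix-coded double
Term date = fieldManager.createTerm("date", "2011-06-01T12:00:00Z");  // parsed as ISO-8601, then prefix-coded long
Term title = fieldManager.createTerm("title", "Indexing");            // analyzed, e.g. lowercased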

From source file: net.sf.mmm.search.engine.impl.lucene.LuceneFieldManagerImpl.java

License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public Query createPhraseQuery(String field, String value) {

    NlsNullPointerException.checkNotNull("field", field);
    NlsNullPointerException.checkNotNull("value", value);
    SearchFieldConfiguration fieldConfiguration = getSearchFields().getOrCreateFieldConfiguration(field);
    SearchFieldType fieldType = fieldConfiguration.getType();
    Query result;
    if (fieldType == SearchFieldType.TEXT) {
        PhraseQuery phraseQuery = new PhraseQuery();
        result = phraseQuery;
        try {
            TokenStream tokenStream = this.analyzer.tokenStream(field, new StringReader(value));
            TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
            tokenStream.reset(); // part of the TokenStream workflow contract
            while (tokenStream.incrementToken()) {
                phraseQuery.add(new Term(field, termAttribute.term()));
            }
        } catch (IOException e) {
            throw new RuntimeIoException(e, IoMode.READ);
        }
    } else {
        result = new TermQuery(createTerm(field, value));
    }
    return result;
}

From source file: net.sf.okapi.lib.tmdb.lucene.Seeker.java

License: Open Source License

public List<TmHit> searchFuzzy(String genericText, String codesAsString, String tmId, String locale, int max,
        int threshold, HashMap<String, String> attributes) {
    float searchThreshold = (float) threshold;
    if (threshold < 0)
        searchThreshold = 0.0f;
    if (threshold > 100)
        searchThreshold = 100.0f;

    String queryText = genericText;

    String gtextFName = TmEntry.GTEXT_PREFIX + locale;
    Locale javaLoc = new Locale(locale);

    // create basic ngram analyzer to tokenize query
    TokenStream queryTokenStream;
    if (javaLoc.getLanguage().equals(Locale.ENGLISH.getLanguage())) { // compare with equals(), not ==
        queryTokenStream = defaultFuzzyAnalyzer.tokenStream(gtextFName, new StringReader(queryText));
    } else {
        queryTokenStream = new NgramAnalyzer(javaLoc, 4).tokenStream(gtextFName, new StringReader(queryText));
    }

    // Get the CharTermAttribute from the TokenStream
    CharTermAttribute termAtt = queryTokenStream.addAttribute(CharTermAttribute.class);
    TmFuzzyQuery fQuery = new TmFuzzyQuery(searchThreshold, gtextFName);

    try {
        queryTokenStream.reset();
        while (queryTokenStream.incrementToken()) {
            //Term t = new Term(keyIndexField, new String(termAtt.buffer()));
            Term t = new Term(gtextFName, termAtt.toString());
            fQuery.add(t);
        }
        queryTokenStream.end();
        queryTokenStream.close();
    } catch (IOException e) {
        throw new OkapiIOException(e.getMessage(), e);
    }

    return getFuzzyHits(fQuery, genericText, codesAsString, tmId, locale, max, searchThreshold, attributes);
}
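
The n-gram analyzer is what makes the fuzzy lookup language-independent: for non-English locales the query is cut into overlapping 4-grams, each of which becomes a term of the TmFuzzyQuery. A sketch of that tokenization step in isolation; the field name and input text are placeholders:

TokenStream ts = new NgramAnalyzer(Locale.JAPANESE, 4).tokenStream("gtext_ja",
        new StringReader("東京タワーに行きました"));
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(term.toString()); // overlapping 4-character grams of the input
}
ts.end();
ts.close();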