Usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
From source file:com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java
License:Open Source License
/**
 * Analyzes {@code text} and collects every produced term into {@code termSet}
 * by pulling the stream through a {@link TermSetTokenFilter}.
 *
 * @param text    the text to analyze; a {@code null} value is silently ignored
 * @param termSet the collection that receives each extracted term
 * @throws IOException if the token stream fails while advancing
 */
public void extractTerms(String text, Collection<String> termSet) throws IOException {
    if (text == null) {
        return;
    }
    TokenStream tokens = tokenStream(null, new StringReader(text));
    try {
        // The filter records every term into termSet as the stream is consumed.
        tokens = new TermSetTokenFilter(termSet, tokens);
        while (tokens.incrementToken()) {
            // Nothing to do per token: collection happens inside the filter.
        }
    } finally {
        IOUtils.closeQuietly(tokens);
    }
}
From source file:com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java
License:Open Source License
/**
 * Analyzes {@code text} and feeds every token into the named-entity
 * extraction result through a {@link NamedEntityPopulateFilter}.
 *
 * @param text   the text to analyze; a {@code null} value is silently ignored
 * @param result receives the named entities found by the filter
 * @throws IOException if the token stream fails while advancing
 */
public void populate(String text, ResultNamedEntityExtraction result) throws IOException {
    if (text == null) {
        return;
    }
    StringReader reader = new StringReader(text);
    TokenStream ts = tokenStream(null, reader);
    try {
        // Wrap inside the try (as extractTerms does) so the underlying stream
        // is still closed if the filter constructor throws.
        ts = new NamedEntityPopulateFilter(result, ts);
        while (ts.incrementToken()) {
            // Population happens inside the filter; nothing to do per token.
        }
    } finally {
        IOUtils.closeQuietly(ts);
    }
}
From source file:com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java
License:Open Source License
public void populate(String text, FieldContent fieldContent) throws IOException { if (text == null) return;// w w w .j a v a 2 s. c om StringReader reader = new StringReader(text); TokenStream ts = tokenStream(null, reader); ts = new FieldContentPopulateFilter(fieldContent, ts); try { while (ts.incrementToken()) ; } finally { IOUtils.closeQuietly(ts); } }
From source file:com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java
License:Open Source License
public void populate(String text, List<TokenTerm> tokenTerms) throws IOException { if (text == null) return;//from w w w. j a v a 2 s.c o m StringReader reader = new StringReader(text); TokenStream ts = tokenStream(null, reader); ts = new TokenTermPopulateFilter(tokenTerms, ts); try { while (ts.incrementToken()) ; } finally { IOUtils.closeQuietly(ts); } }
From source file:com.jamespot.glifpix.index.ResourceDocument.java
License:Open Source License
/**
 * Stores the literal on the Lucene document, then tokenizes a cleaned copy
 * and stores the token count + underscore-joined tokens ("N_t1_t2...") in the
 * "token" field along with its CRC in the "crc" field. Also records the token
 * count in {@code _resourceLength}.
 *
 * @param literal the raw literal to index
 * @throws IOException if tokenization fails
 */
private void addLiteralField(String literal) throws IOException {
    _luceneDocument.add(
            new Field("literal", replaceUnicodeStr(literal), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    // Strip escaped quotes before normalizing; this cleaned form is what gets tokenized.
    String coolLiteral = literal.replaceAll("\\\"", "");
    coolLiteral = replaceUnicodeStr(coolLiteral);
    Analyzer resAnalyzer = new ContentAnalyzer();
    TokenStream ts = resAnalyzer.tokenStream("dummyField", new StringReader(coolLiteral));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    int length = 0;
    // Local buffer: StringBuilder (no synchronization needed, unlike StringBuffer).
    StringBuilder sb = new StringBuilder();
    try {
        while (ts.incrementToken()) {
            sb.append("_" + termAttribute.term());
            length++;
        }
        ts.end();
    } finally {
        // Close in finally so the stream is released even if tokenization throws
        // (the original leaked it on exception).
        ts.close();
    }
    sb.insert(0, length);
    _resourceLength = length;
    String finalToken = sb.toString();
    _luceneDocument.add(new Field("token", finalToken, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    _luceneDocument.add(new Field("crc", Utils.getCRC(finalToken), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
/**
 * Tokenizes {@code content} and returns the frequency of each known tag for
 * language {@code lng}. Candidate expressions (up to {@code _MaxExpressionLength}
 * tokens) are matched against the language's CRC store; tags shorter than
 * {@code _MinWordLength} are dropped. IO failures are logged and the partial
 * result is returned (best-effort).
 *
 * @param content the text to scan
 * @param lng     language key into {@code _resStores} / {@code _lngStopTags}
 * @return tag -> occurrence count (possibly empty, never {@code null})
 */
public Map<String, Integer> getTagsFreq(String content, String lng) {
    Map<String, Integer> items = new HashMap<String, Integer>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);
    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                String tag = _resStores.get(lng).getTag(s.getKey());
                if (tag != null && tag.length() >= _MinWordLength) {
                    Integer previous = items.get(tag);
                    items.put(tag, previous == null ? s.getValue() : previous + s.getValue());
                }
            }
        }
        ts.end();
    } catch (IOException e) {
        // Best-effort: log and fall through to return what was collected so far.
        logger.error(e);
    } finally {
        // Close in finally so the stream is released even when incrementToken
        // throws (the original leaked it on exception).
        try {
            ts.close();
        } catch (IOException e) {
            logger.error(e);
        }
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
/**
 * Tokenizes {@code content} and returns a weighted frequency per known tag for
 * language {@code lng}: each match contributes {@code count * getTagWeight(crc, lng)}.
 * Tags shorter than {@code _MinWordLength} are dropped. IO failures are logged
 * and the partial result is returned (best-effort).
 *
 * @param content the text to scan
 * @param lng     language key into {@code _resStores} / {@code _lngStopTags}
 * @return tag -> weighted frequency (possibly empty, never {@code null})
 */
public Map<String, Float> getWeightedTagsFreq(String content, String lng) {
    Map<String, Float> items = new HashMap<String, Float>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);
    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                String tag = _resStores.get(lng).getTag(s.getKey());
                if (tag != null && tag.length() >= _MinWordLength) {
                    float contribution = s.getValue().floatValue() * getTagWeight(s.getKey(), lng);
                    Float previous = items.get(tag);
                    items.put(tag, previous == null ? contribution : previous + contribution);
                }
            }
        }
        ts.end();
    } catch (IOException e) {
        // Best-effort: log and fall through to return what was collected so far.
        logger.error(e);
    } finally {
        // Close in finally so the stream is released even when incrementToken
        // throws (the original leaked it on exception).
        try {
            ts.close();
        } catch (IOException e) {
            logger.error(e);
        }
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Set<String> getTokens(String content, String lng) { Set<String> tokens = new HashSet<String>(); TokensArray tokArray = new TokensArray(15); TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content)); TermAttribute termAttribute = ts.addAttribute(TermAttribute.class); try {//from w w w . j a v a2s .com while (ts.incrementToken()) { tokArray.pushString(termAttribute.term()); Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(), _lngStopTags.get(lng)); if (tagCandidates.size() > 0) { for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) { tokens.add(s.getKey()); } } } ts.end(); ts.close(); } catch (IOException e) { logger.error(e); } return tokens; }
From source file:com.leavesfly.lia.advsearching.SpanQueryTest.java
License:Apache License
private void dumpSpans(SpanQuery query) throws IOException { Spans spans = query.getSpans(reader); System.out.println(query + ":"); int numSpans = 0; TopDocs hits = searcher.search(query, 10); float[] scores = new float[2]; for (ScoreDoc sd : hits.scoreDocs) { scores[sd.doc] = sd.score;//from w ww .ja va2 s .c om } while (spans.next()) { // A numSpans++; int id = spans.doc(); Document doc = reader.document(id); // B TokenStream stream = analyzer.tokenStream("contents", // C new StringReader(doc.get("f"))); // C TermAttribute term = stream.addAttribute(TermAttribute.class); StringBuilder buffer = new StringBuilder(); buffer.append(" "); int i = 0; while (stream.incrementToken()) { // D if (i == spans.start()) { // E buffer.append("<"); // E } // E buffer.append(term.term()); // E if (i + 1 == spans.end()) { // E buffer.append(">"); // E } // E buffer.append(" "); i++; } buffer.append("(").append(scores[id]).append(") "); System.out.println(buffer); } if (numSpans == 0) { System.out.println(" No spans"); } System.out.println(); }
From source file:com.leavesfly.lia.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokens(TokenStream stream) throws IOException { TermAttribute term = stream.addAttribute(TermAttribute.class); while (stream.incrementToken()) { System.out.print("[" + term.term() + "] "); // B }// w ww .j a va 2 s . c o m }