Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

This page collects example usages of the org.apache.lucene.analysis.TokenStream method incrementToken, drawn from open-source projects.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token; it returns true while more tokens are available and false once the stream is exhausted.
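
For reference, the full consumer workflow is: obtain the stream, add the attributes you want to read, call reset(), loop on incrementToken() until it returns false, then call end() and close(). Below is a minimal sketch on the modern (Lucene 4+) API, where reset() is mandatory before the first incrementToken(); the field name, analyzer, and sample text are illustrative assumptions. Many of the Lucene 3.x examples further down skip reset(), which that API generation tolerated for most streams.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ConsumeTokens {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "The quick brown fox")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // must precede the first incrementToken()
            while (ts.incrementToken()) { // false once the stream is exhausted
                System.out.println(term.toString());
            }
            ts.end();                     // record final offset state
        }                                 // try-with-resources closes the stream
    }
}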

Usage

From source file: com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java

License: Open Source License

public void extractTerms(String text, Collection<String> termSet) throws IOException {
    if (text == null)
        return;
    StringReader reader = new StringReader(text);
    TokenStream ts = tokenStream(null, reader);
    try {
        ts = new TermSetTokenFilter(termSet, ts);
        while (ts.incrementToken())
            ; // drain the stream; the wrapping filter populates termSet as a side effect
    } finally {
        IOUtils.closeQuietly(ts);
    }
}

From source file: com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java

License: Open Source License

public void populate(String text, ResultNamedEntityExtraction result) throws IOException {
    if (text == null)
        return;
    StringReader reader = new StringReader(text);
    TokenStream ts = tokenStream(null, reader);
    ts = new NamedEntityPopulateFilter(result, ts);
    try {
        while (ts.incrementToken())
            ;
    } finally {
        IOUtils.closeQuietly(ts);
    }
}

From source file: com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java

License: Open Source License

public void populate(String text, FieldContent fieldContent) throws IOException {
    if (text == null)
        return;
    StringReader reader = new StringReader(text);
    TokenStream ts = tokenStream(null, reader);
    ts = new FieldContentPopulateFilter(fieldContent, ts);
    try {
        while (ts.incrementToken())
            ;
    } finally {
        IOUtils.closeQuietly(ts);
    }
}

From source file: com.jaeksoft.searchlib.analysis.CompiledAnalyzer.java

License: Open Source License

public void populate(String text, List<TokenTerm> tokenTerms) throws IOException {
    if (text == null)
        return;
    StringReader reader = new StringReader(text);
    TokenStream ts = tokenStream(null, reader);
    ts = new TokenTermPopulateFilter(tokenTerms, ts);
    try {
        while (ts.incrementToken())
            ;
    } finally {
        IOUtils.closeQuietly(ts);
    }
}
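
All four CompiledAnalyzer examples above share one pattern: the analyzer's stream is wrapped in a project-specific filter (TermSetTokenFilter, NamedEntityPopulateFilter, FieldContentPopulateFilter, TokenTermPopulateFilter) that records each token as a side effect, and the empty-bodied while loop merely drains the stream. Those filter sources are not shown here; a hypothetical sketch of such a collecting filter, written against the newer CharTermAttribute API, might look like this:

import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class CollectingTokenFilter extends TokenFilter {
    private final Collection<String> sink;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    // Same argument order as the filters above: sink first, wrapped stream second.
    public CollectingTokenFilter(Collection<String> sink, TokenStream input) {
        super(input);
        this.sink = sink;
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;               // upstream exhausted
        }
        sink.add(termAtt.toString());   // record the term as a side effect
        return true;                    // pass the token through unchanged
    }
}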

From source file: com.jamespot.glifpix.index.ResourceDocument.java

License: Open Source License

private void addLiteralField(String literal) throws IOException {
    _luceneDocument
            .add(new Field("literal", replaceUnicodeStr(literal), Store.YES, Index.NOT_ANALYZED_NO_NORMS));

    String coolLiteral = literal.replaceAll("\\\"", "");
    coolLiteral = replaceUnicodeStr(coolLiteral);

    Analyzer resAnalyzer = new ContentAnalyzer();
    TokenStream ts = resAnalyzer.tokenStream("dummyField", new StringReader(coolLiteral));

    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    int length = 0;
    StringBuffer sb = new StringBuffer();
    while (ts.incrementToken()) {
        sb.append("_" + termAttribute.term());
        length++;
    }
    sb.insert(0, length);
    _resourceLength = length;
    ts.end();
    ts.close();

    String finalToken = sb.toString();
    _luceneDocument.add(new Field("token", finalToken, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    _luceneDocument.add(new Field("crc", Utils.getCRC(finalToken), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
}
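
The TermAttribute API used above was deprecated in Lucene 3.1 and removed in 4.0. A sketch of the same join-the-terms loop on its replacement, CharTermAttribute, with the cleanup moved into a finally block; TokenJoiner is a name invented for this sketch, and the field name is a placeholder as in the original:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class TokenJoiner {
    // Builds "<count>_term1_term2..." the way addLiteralField does above.
    public static String join(Analyzer analyzer, String text) throws IOException {
        TokenStream ts = analyzer.tokenStream("dummyField", new StringReader(text));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        StringBuilder sb = new StringBuilder();
        int length = 0;
        try {
            ts.reset();                       // required in Lucene 4+
            while (ts.incrementToken()) {
                sb.append('_').append(termAtt.toString());
                length++;
            }
            ts.end();
        } finally {
            ts.close();                       // always release the stream
        }
        return sb.insert(0, length).toString();
    }
}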

From source file: com.jamespot.glifpix.library.TagsExtractorImpl.java

License: Open Source License

public Map<String, Integer> getTagsFreq(String content, String lng) {

    Map<String, Integer> items = new HashMap<String, Integer>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));

            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    String tag = _resStores.get(lng).getTag(s.getKey());
                    if (tag != null && tag.length() >= _MinWordLength) {
                        if (items.containsKey(tag)) {
                            items.put(tag, items.get(tag) + s.getValue());
                        } else {
                            items.put(tag, s.getValue());
                        }
                    }
                }
            }
        }
        ts.end();
        ts.close();

    } catch (IOException e) {
        logger.error(e);
    }

    return items;
}

From source file: com.jamespot.glifpix.library.TagsExtractorImpl.java

License: Open Source License

public Map<String, Float> getWeightedTagsFreq(String content, String lng) {

    Map<String, Float> items = new HashMap<String, Float>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));

            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    String tag = _resStores.get(lng).getTag(s.getKey());
                    if (tag != null && tag.length() >= _MinWordLength) {
                        if (items.containsKey(tag)) {
                            items.put(tag, items.get(tag)
                                    + (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng));
                        } else {
                            items.put(tag, (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng));
                        }
                    }
                }
            }
        }
        ts.end();
        ts.close();

    } catch (IOException e) {
        logger.error(e);
    }

    return items;
}

From source file: com.jamespot.glifpix.library.TagsExtractorImpl.java

License: Open Source License

public Set<String> getTokens(String content, String lng) {

    Set<String> tokens = new HashSet<String>();
    TokensArray tokArray = new TokensArray(15);

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));

            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    tokens.add(s.getKey());
                }
            }
        }
        ts.end();
        ts.close();

    } catch (IOException e) {
        logger.error(e);
    }

    return tokens;
}
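
One robustness note on the three TagsExtractorImpl methods above: ts.end() and ts.close() are called inside the try block, so the stream is never closed when incrementToken() throws mid-loop. A hypothetical helper that guarantees cleanup (TokenStreamUtil and TokenConsumer are names invented for this sketch, not part of Lucene) might look like:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;

public final class TokenStreamUtil {
    /** Consumes a stream while guaranteeing end()/close(), even when
     *  incrementToken() throws mid-loop. */
    public static void consume(TokenStream ts, TokenConsumer consumer) throws IOException {
        try {
            while (ts.incrementToken()) {
                consumer.onToken();   // per-token work, e.g. tokArray.pushString(...)
            }
            ts.end();                 // finalize offset state before closing
        } finally {
            ts.close();               // release the stream on success or failure
        }
    }

    /** Callback for the per-token body of the loop. */
    public interface TokenConsumer {
        void onToken() throws IOException;
    }
}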

From source file: com.leavesfly.lia.advsearching.SpanQueryTest.java

License: Apache License

private void dumpSpans(SpanQuery query) throws IOException {
    Spans spans = query.getSpans(reader);
    System.out.println(query + ":");
    int numSpans = 0;

    TopDocs hits = searcher.search(query, 10);
    float[] scores = new float[2]; // indexed by doc id; sized for the two-document test index used here
    for (ScoreDoc sd : hits.scoreDocs) {
        scores[sd.doc] = sd.score;
    }

    while (spans.next()) { // A
        numSpans++;

        int id = spans.doc();
        Document doc = reader.document(id); // B

        TokenStream stream = analyzer.tokenStream("contents", // C
                new StringReader(doc.get("f"))); // C
        TermAttribute term = stream.addAttribute(TermAttribute.class);

        StringBuilder buffer = new StringBuilder();
        buffer.append("   ");
        int i = 0;
        while (stream.incrementToken()) { // D
            if (i == spans.start()) { // E
                buffer.append("<"); // E
            } // E
            buffer.append(term.term()); // E
            if (i + 1 == spans.end()) { // E
                buffer.append(">"); // E
            } // E
            buffer.append(" ");
            i++;
        }
        buffer.append("(").append(scores[id]).append(") ");
        System.out.println(buffer);
    }

    if (numSpans == 0) {
        System.out.println("   No spans");
    }
    System.out.println();
}

From source file: com.leavesfly.lia.analysis.AnalyzerUtils.java

License: Apache License

public static void displayTokens(TokenStream stream) throws IOException {

    TermAttribute term = stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
        System.out.print("[" + term.term() + "] "); // B
    }
}
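
A possible invocation, assuming the demo class sits alongside AnalyzerUtils; the analyzer choice and sample text are illustrative, and SimpleAnalyzer is the Lucene 3.x API matching the TermAttribute usage above. Note that displayTokens itself never calls end() or close(), so a caller that cares about cleanup would need to close the stream afterwards.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.SimpleAnalyzer;

public class DisplayTokensDemo {
    public static void main(String[] args) throws IOException {
        // SimpleAnalyzer lower-cases and splits on non-letters.
        AnalyzerUtils.displayTokens(
                new SimpleAnalyzer().tokenStream("field", new StringReader("The quick brown fox")));
        // prints: [the] [quick] [brown] [fox]
    }
}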