List of usage examples for org.apache.lucene.analysis TokenStream close
@Override public void close() throws IOException
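Since TokenStream implements Closeable, close() pairs naturally with try-with-resources. Below is a minimal sketch of the standard consume cycle on the Lucene 4.x+ API; the analyzer and text variables are assumed for illustration, and the fragment is presumed to sit in a method that declares throws IOException.

try (TokenStream ts = analyzer.tokenStream("field", new StringReader(text))) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                      // mandatory before the first incrementToken()
    while (ts.incrementToken()) {
        System.out.println(term.toString());
    }
    ts.end();                        // records end-of-stream state (e.g. final offset)
}                                    // close() is invoked automatically here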
From source file:com.jamespot.glifpix.index.ResourceDocument.java
License:Open Source License
private void addLiteralField(String literal) throws IOException {
    _luceneDocument.add(new Field("literal", replaceUnicodeStr(literal), Store.YES,
            Index.NOT_ANALYZED_NO_NORMS));

    String coolLiteral = literal.replaceAll("\\\"", "");
    coolLiteral = replaceUnicodeStr(coolLiteral);

    Analyzer resAnalyzer = new ContentAnalyzer();
    TokenStream ts = resAnalyzer.tokenStream("dummyField", new StringReader(coolLiteral));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    // Build a token signature of the form "<count>_tok1_tok2..." from the analyzed literal
    int length = 0;
    StringBuffer sb = new StringBuffer();
    while (ts.incrementToken()) {
        sb.append("_" + termAttribute.term());
        length++;
    }
    sb.insert(0, length);
    _resourceLength = length;

    ts.end();
    ts.close();

    String finalToken = sb.toString();
    _luceneDocument.add(new Field("token", finalToken, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    _luceneDocument.add(new Field("crc", Utils.getCRC(finalToken), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
}
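This example predates Lucene 4: TermAttribute was later removed in favor of CharTermAttribute, and newer versions also require reset() before the first incrementToken(). A hedged sketch of the same loop on the 4.x+ API (StringBuilder swapped in for StringBuffer, since no synchronization is needed):

    TokenStream ts = resAnalyzer.tokenStream("dummyField", new StringReader(coolLiteral));
    CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int length = 0;
    StringBuilder sb = new StringBuilder();
    while (ts.incrementToken()) {
        sb.append('_').append(termAttribute.toString());
        length++;
    }
    sb.insert(0, length);
    ts.end();
    ts.close();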
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Map<String, Integer> getTagsFreq(String content, String lng) {
    Map<String, Integer> items = new HashMap<String, Integer>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    String tag = _resStores.get(lng).getTag(s.getKey());
                    if (tag != null && tag.length() >= _MinWordLength) {
                        if (items.containsKey(tag)) {
                            items.put(tag, items.get(tag) + s.getValue());
                        } else {
                            items.put(tag, s.getValue());
                        }
                    }
                }
            }
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error(e);
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Map<String, Float> getWeightedTagsFreq(String content, String lng) {
    Map<String, Float> items = new HashMap<String, Float>();
    TokensArray tokArray = new TokensArray(_MaxExpressionLength);

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    String tag = _resStores.get(lng).getTag(s.getKey());
                    if (tag != null && tag.length() >= _MinWordLength) {
                        if (items.containsKey(tag)) {
                            items.put(tag,
                                    items.get(tag) + (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng));
                        } else {
                            items.put(tag, (s.getValue().floatValue()) * getTagWeight(s.getKey(), lng));
                        }
                    }
                }
            }
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error(e);
    }
    return items;
}
From source file:com.jamespot.glifpix.library.TagsExtractorImpl.java
License:Open Source License
public Set<String> getTokens(String content, String lng) {
    Set<String> tokens = new HashSet<String>();
    TokensArray tokArray = new TokensArray(15);

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);

    try {
        while (ts.incrementToken()) {
            tokArray.pushString(termAttribute.term());
            Map<String, Integer> tagCandidates = tokArray.check(_resStores.get(lng).getCRCs(),
                    _lngStopTags.get(lng));
            if (tagCandidates.size() > 0) {
                for (Map.Entry<String, Integer> s : tagCandidates.entrySet()) {
                    tokens.add(s.getKey());
                }
            }
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error(e);
    }
    return tokens;
}
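All three TagsExtractorImpl methods above call ts.end() and ts.close() inside the try block, so an IOException thrown by incrementToken() skips close() entirely and leaks the stream. A sketch of the same skeleton with the close moved into a finally block (names reused from the examples above; the loop body is elided):

    TokenStream ts = _contentAnalyzer.tokenStream("dummyField", new StringReader(content));
    TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            // ... push tokens and collect tag candidates as above ...
        }
        ts.end();
    } catch (IOException e) {
        logger.error(e);
    } finally {
        try {
            ts.close();   // always release the stream, even after a failure
        } catch (IOException e) {
            logger.error(e);
        }
    }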
From source file:com.lorelib.analyzer.sample.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Build an IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    // Obtain a Lucene TokenStream for the sample text
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "这是一个中文分词的例子，你可以直接运行它！IKAnalyer can analysis english text too"));
        // Offset attribute (start/end position of each token)
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // Reset the TokenStream (and the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // Perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the TokenStream, releasing the underlying StringReader
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:com.mathworks.xzheng.analysis.AnalyzerUtils.java
License:Apache License
public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.toString());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.close();
}
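Note that this helper never calls stream.reset() or stream.end(); on Lucene 4.x+, incrementToken() without a prior reset() fails. A hedged corrected version under that assumption:

    public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
        TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                           // required before consuming
        for (String expected : output) {
            Assert.assertTrue(stream.incrementToken());
            Assert.assertEquals(expected, termAttr.toString());
        }
        Assert.assertFalse(stream.incrementToken());
        stream.end();
        stream.close();
    }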
From source file:com.mhs.qsol.proximity.ProximityVisitor.java
License:Apache License
/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate query.
 *
 * @param token
 * @return
 */
protected Query tokenToQuery(String token) {
    if (logger.isLoggable(Level.FINE)) {
        logger.fine("Query tokenToQuery(String token) : token:" + token);
    }

    token = removeEscapeChars(token);

    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);

    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            t = null;
        }
        if (t == null) {
            break;
        }
        v.add(t);
        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            severalTokensAtSamePosition = true;
        }
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }

    if (v.size() == 0) {
        return null;
    } else if (v.size() == 1) {
        t = v.get(0);
        SpanTermQuery stq = new SpanTermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        stq.setBoost(this.boost);
        return stq;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // No phrase query: OR together one SpanTermQuery per token.
                // (The original built every Term from an always-empty StringBuilder
                // named "regex"; fixed here to use each token's text instead.)
                SpanQuery[] spanQueries = new SpanQuery[v.size()];
                for (int i = 0; i < v.size(); i++) {
                    Token tok = v.get(i);
                    spanQueries[i] = new SpanTermQuery(
                            new Term(field, new String(tok.buffer(), 0, tok.length())));
                }
                return new SpanOrQuery(spanQueries);
            } else {
                // All the Tokens in each sub-list are positioned at the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }

                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);
                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }
                    int size = curTermQueries.size();
                    if (size <= 0)
                        continue;
                    else if (size == 1)
                        spanNearSubclauses.add(curTermQueries.get(0));
                    else
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                }

                SpanNearQuery query = new SpanNearQuery(
                        (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true);
                return query;
            }
        } else {
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];
            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                clauses[i] = new SpanTermQuery(new Term(field, new String(t2.buffer(), 0, t2.length())));
            }
            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);
            return query;
        }
    }
}
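The collect-then-build structure above wraps source.close() in its own try/catch because close() declares IOException even on a StringReader-backed stream. On the Lucene 4.x+ API the collect phase can be expressed with try-with-resources instead; here is a sketch reusing the names from the example (note the Token constructor shown was deprecated and later removed, so this assumes a 4.x classpath):

    List<Token> v = new ArrayList<Token>();
    try (TokenStream source = analyzer.tokenStream(field, new StringReader(token))) {
        CharTermAttribute charTermAtrib = source.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtrib = source.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
        source.reset();
        while (source.incrementToken()) {
            Token t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(),
                    offsetAtrib.startOffset(), offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
            v.add(t);
        }
        source.end();
    } catch (IOException e) {
        // treat an analysis failure as "no tokens", as the original does
    }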
From source file:com.mhs.qsol.QsolToQueryVisitor.java
License:Apache License
/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate query.
 *
 * @param token
 * @return
 */
protected Query tokenToQuery(String token) {
    token = removeEscapeChars(token);

    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);

    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            t = null;
        }
        if (t == null) {
            break;
        }
        v.add(t);
        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            severalTokensAtSamePosition = true;
        }
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }

    if (v.size() == 0) {
        // nulls will get cleaned up in visitBooleanOp
        return null;
    } else if (v.size() == 1) {
        t = v.get(0);
        TermQuery termQuery = new TermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        termQuery.setBoost(this.boost);
        return termQuery;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                BooleanQuery q = new BooleanQuery(true);
                for (int i = 0; i < v.size(); i++) {
                    t = v.get(i);
                    TermQuery currentQuery = new TermQuery(
                            new Term(field, new String(t.buffer(), 0, t.length())));
                    currentQuery.setBoost(this.boost);
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q;
            } else {
                // All the Tokens in each sub-list are positioned at the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }

                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);
                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }
                    int size = curTermQueries.size();
                    if (size <= 0)
                        continue;
                    else if (size == 1)
                        spanNearSubclauses.add(curTermQueries.get(0));
                    else
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                }

                SpanNearQuery query = new SpanNearQuery(
                        (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true);
                return query;
            }
        } else {
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];
            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                SpanTermQuery spanQuery = new SpanTermQuery(
                        new Term(field, new String(t2.buffer(), 0, t2.length())));
                spanQuery.setBoost(boost);
                clauses[i] = spanQuery;
            }
            // Note: there's a bug here (not by me) where term offsets are not respected.
            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);
            return query;
        }
    }
}
From source file:com.nec.scg.senseRanking.CountTextSimilarity.java
public Map<String, Float> CountTF_IDF(String str, Analyzer a) {
    Map<String, Float> termVector = new TreeMap<String, Float>();
    try {
        TokenStream stream = a.tokenStream("content", new StringReader(str));
        PorterStemFilter filter = new PorterStemFilter(stream);
        CharTermAttribute cta = filter.addAttribute(CharTermAttribute.class);
        filter.reset();

        // Count raw term frequencies over the stemmed tokens
        String strcat = null;
        int wordCount = 0;
        while (filter.incrementToken()) {
            strcat = cta.toString();
            // System.out.print("[" + strcat + "]");
            if (!termVector.containsKey(strcat)) {
                termVector.put(strcat, 1f);
            } else {
                termVector.put(strcat, termVector.get(strcat) + 1);
            }
            wordCount++;
        }

        // Replace each raw count with its tf-idf weight
        for (String ter : termVector.keySet()) {
            int hits = searchIndexforIDF(ter) + 1;
            float idf = (float) (Math.log(AllArticle * 1.0 / hits) + 1.0);
            float tf = termVector.get(ter) / wordCount;
            termVector.put(ter, tf * idf);
        }

        // end()/close() on the filter already delegate down the chain,
        // so the extra calls on the wrapped stream are redundant but harmless.
        filter.end();
        stream.end();
        filter.close();
        stream.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return termVector;
}
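To make the weighting concrete, here is the arithmetic with purely hypothetical numbers (3 occurrences in a 60-token document, AllArticle = 10,000 articles, 99 index hits for the term):

    // tf: frequency normalized by document length
    float tf = 3f / 60;                                        // 0.05
    // idf: smoothed inverse document frequency; hits + 1 avoids division by zero
    float idf = (float) (Math.log(10000.0 / (99 + 1)) + 1.0);  // ln(100) + 1 ~= 5.605
    float weight = tf * idf;                                   // ~= 0.280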
From source file:com.plug.Version_8_5_2.gs.ling.tm2.lucene.LuceneUtil.java
License:Apache License
/**
 * Create GlobalSight TM tokens from a provided segment string using
 * GsAnalyzer.
 *
 * @param p_text
 *            fuzzy match format string
 * @return List of c.g.l.tm2.index.Tokens
 */
public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    // GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class);
    // org.apache.lucene.analysis.Token luceneToken = null;
    List<String> tokens = new ArrayList<String>();

    while (tokenStream.incrementToken()) {
        // luceneToken = gsAtt.getToken();
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();

    return buildTokenList(tokens);
}
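This helper calls reset() and close() but skips end(), and it re-fetches the CharTermAttribute on every iteration even though attribute instances are stable for the life of the stream. A hedged touch-up of the consume loop (same names as above):

    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class); // hoisted out of the loop
    tokenStream.reset();
    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        tokens.add(termAtt.toString());
    }
    tokenStream.end();     // finish end-of-stream bookkeeping before releasing resources
    tokenStream.close();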