Example usage for org.apache.lucene.analysis TokenStream close

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream#close().

Prototype

@Override
public void close() throws IOException 

Document

Releases resources associated with this stream.
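The examples below share a common lifecycle: reset() before consuming tokens, end() after the last token, then close(). The following is a minimal, self-contained sketch of that pattern (not taken from any of the listed projects; it assumes a recent Lucene release where TokenStream implements Closeable and StandardAnalyzer has a no-argument constructor, and the field name "body" is only a placeholder):

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseSketch {
    public static List<String> tokenize(String text) throws IOException {
        List<String> tokens = new ArrayList<>();
        try (Analyzer analyzer = new StandardAnalyzer();
                // try-with-resources calls close() even if incrementToken() throws
                TokenStream stream = analyzer.tokenStream("body", new StringReader(text))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // required before the first incrementToken()
            while (stream.incrementToken()) {
                tokens.add(term.toString());
            }
            stream.end(); // record the final offset state before close()
        }
        return tokens;
    }
}

Closing in a finally block, as several of the examples below do, is the equivalent for code that cannot use try-with-resources.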

Usage

From source file:io.anserini.analysis.TweetTokenizationTest.java

License:Apache License

public List<String> parseKeywords(Analyzer analyzer, String keywords) throws IOException {
    List<String> list = new ArrayList<>();

    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords));
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (cattr.toString().length() == 0) {
            continue;
        }
        list.add(cattr.toString());
    }
    tokenStream.end();
    tokenStream.close();

    return list;
}

From source file:it.cnr.ilc.lc.clavius.search.ClaviusHighlighter.java

public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, String idDoc,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {

    List<Annotation> ret = new ArrayList<>();

    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    Scorer fragmentScorer = getFragmentScorer();
    Fragmenter textFragmenter = getTextFragmenter();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();
    Encoder encoder = getEncoder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    //        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
    try {

        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        //textFragmenter.start(text, tokenStream);

        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);

        tokenStream.reset();
        // log.info("tokenGroup.getNumTokens() A: " + tokenGroup.getNumTokens());

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {

            //                if ((offsetAtt.endOffset() > text.length())
            //                        || (offsetAtt.startOffset() > text.length())) {
            //                    throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
            //                            + " exceeds length of provided text sized " + text.length());
            //                }
            //  log.info("newText: A (" + newText.toString() + "), fragmentScorer.getTokenScore()("+fragmentScorer.getTokenScore()+")");
            tokenGroup.addToken(fragmentScorer.getTokenScore());

        } // END FOR
          //  log.info("tokenGroup.getNumTokens() B: " + tokenGroup.getNumTokens());

        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            //log.info("tokenGroup[" + i + "]: token: " + tokenGroup.getToken(i) + ", score: " + tokenGroup.getScore(i));
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(idDoc);
                // left context
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);
                // right context
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1
                                : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());

                a.setConcept("");
                a.setType("");
                a.setIdNeo4j(-1l);
                a.setPageNum(-1l);
                a.setResourceObject("");
                a.setId(-1l);

                ret.add(a);
            }
        }

        return ret;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}

From source file:it.cnr.ilc.lc.claviusweb.fulltextsearch.ClaviusHighlighter.java

public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, Document document,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {

    List<Annotation> ret = new ArrayList<>();

    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    Scorer fragmentScorer = getFragmentScorer();
    Fragmenter textFragmenter = getTextFragmenter();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();
    Encoder encoder = getEncoder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    //        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
    try {

        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        //textFragmenter.start(text, tokenStream);

        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);

        tokenStream.reset();
        //log.info("tokenGroup.getNumTokens() A: " + tokenGroup.getNumTokens());

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {

            //                if ((offsetAtt.endOffset() > text.length())
            //                        || (offsetAtt.startOffset() > text.length())) {
            //                    throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
            //                            + " exceeds length of provided text sized " + text.length());
            //                }
            //  log.info("newText: A (" + newText.toString() + "), fragmentScorer.getTokenScore()("+fragmentScorer.getTokenScore()+")");
            tokenGroup.addToken(fragmentScorer.getTokenScore());

        } // END FOR
          //log.info("tokenGroup.getNumTokens() B: " + tokenGroup.getNumTokens());

        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            //log.info("tokenGroup[" + i + "]: token: " + tokenGroup.getToken(i) + ", score: " + tokenGroup.getScore(i));
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(document.get("idDoc"));
                // left context
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);
                // right context
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1
                                : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());

                a.setConcept("");
                a.setType("");
                a.setPageNum(-1l);
                // a.setIdNeo4j(Long.parseLong(document.get("idNeo4j")));
                a.setIdNeo4j(Long.parseLong(document.get("idDoc")));
                a.setResourceObject("");
                a.setId(-1l);

                ret.add(a);
            }
        }

        return ret;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}

From source file:it.unibz.instasearch.indexing.StorageIndexer.java

License:Open Source License

/**
 * Extracts terms from text
 * 
 * @param text
 * @return a map of terms to their offsets in text
 * @throws IOException
 */
public static Map<String, List<Integer>> extractTextTerms(String text) throws IOException {
    Map<String, List<Integer>> terms = new HashMap<String, List<Integer>>();
    TokenStream tokenStream = fileAnalyzer.tokenStream(Field.CONTENTS.toString(), new StringReader(text));

    TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class);

    while (tokenStream.incrementToken()) {
        String termText = termAtt.term().toLowerCase();// t.termText().toLowerCase();
        int offset = offsetAtt.startOffset();

        List<Integer> offsets = terms.get(termText);

        if (offsets == null) {
            offsets = new LinkedList<Integer>();
            terms.put(termText, offsets);
        }

        offsets.add(offset);
    }
    tokenStream.close();

    return terms;
}

From source file:jaligner.Sequence.java

License:Open Source License

/**
 * Constructor
 * 
 * @param sequence
 */
public Sequence(String sequence, Analyzer analyzer, int max_length) throws IOException {
    super();
    this.sequence = sequence;

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(sequence));
    Token.TokenAttributeFactory tokenAttributeFactory = new Token.TokenAttributeFactory(
            stream.getAttributeFactory());

    Vector<Token> tokenVector = new Vector<Token>();

    while (stream.incrementToken() && tokenVector.size() < max_length) {
        //            Token token = new Token();
        //            Token token = (Token) stream.getAttribute(CharTermAttribute.class);
        Token token = (Token) tokenAttributeFactory.createAttributeInstance(Token.class);

        CharTermAttribute charTerm = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        //            PayloadAttribute payload = stream.getAttribute(PayloadAttribute.class);
        //            FlagsAttribute flags = stream.getAttribute(FlagsAttribute.class);

        //        public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
        token.reinit(charTerm.buffer(), 0, charTerm.length(), offset.startOffset(), offset.endOffset());
        token.setOffset(offset.startOffset(), offset.endOffset());

        //            token.setPayload(payload.getPayload());
        //            token.setFlags(flags.getFlags());

        if (stream.hasAttribute(PositionIncrementAttribute.class)) {
            PositionIncrementAttribute positionIncrement = stream
                    .getAttribute(PositionIncrementAttribute.class);
            token.setPositionIncrement(positionIncrement.getPositionIncrement());
        }

        if (stream.hasAttribute(TypeAttribute.class)) {
            TypeAttribute type = stream.getAttribute(TypeAttribute.class);
            token.setType(type.type());
        }

        tokenVector.add(token);
    }

    stream.end();
    stream.close();

    this.tokens = tokenVector.toArray(new Token[tokenVector.size()]);
}

From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.MapperQueryParser.java

License:Apache License

private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (!analyzeWildcard) {
        return super.getPrefixQuery(field, termStr);
    }
    // get Analyzer from superclass and tokenize the term
    TokenStream source;
    try {
        source = getAnalyzer().tokenStream(field, termStr);
        source.reset();
    } catch (IOException e) {
        return super.getPrefixQuery(field, termStr);
    }
    List<String> tlist = new ArrayList<>();
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);

    while (true) {
        try {
            if (!source.incrementToken())
                break;
        } catch (IOException e) {
            break;
        }
        tlist.add(termAtt.toString());
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }

    if (tlist.size() == 1) {
        return super.getPrefixQuery(field, tlist.get(0));
    } else {
        // build a boolean query with prefix on each one...
        List<BooleanClause> clauses = new ArrayList<>();
        for (String token : tlist) {
            clauses.add(new BooleanClause(super.getPrefixQuery(field, token), BooleanClause.Occur.SHOULD));
        }
        return getBooleanQuery(clauses, true);

        //return super.getPrefixQuery(field, termStr);

        /* this means that the analyzer used either added or consumed
        * (common for a stemmer) tokens, and we can't build a PrefixQuery */
        //            throw new ParseException("Cannot build PrefixQuery with analyzer "
        //                    + getAnalyzer().getClass()
        //                    + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
    }

}

From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.MapperQueryParser.java

License:Apache License

private Query getPossiblyAnalyzedWildcardQuery(String field, String termStr) throws ParseException {
    if (!analyzeWildcard) {
        return super.getWildcardQuery(field, termStr);
    }
    boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
    StringBuilder aggStr = new StringBuilder();
    StringBuilder tmp = new StringBuilder();
    for (int i = 0; i < termStr.length(); i++) {
        char c = termStr.charAt(i);
        if (c == '?' || c == '*') {
            if (isWithinToken) {
                try {
                    TokenStream source = getAnalyzer().tokenStream(field, tmp.toString());
                    source.reset();
                    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
                    if (source.incrementToken()) {
                        String term = termAtt.toString();
                        if (term.length() == 0) {
                            // no tokens, just use what we have now
                            aggStr.append(tmp);
                        } else {
                            aggStr.append(term);
                        }
                    } else {
                        // no tokens, just use what we have now
                        aggStr.append(tmp);
                    }
                    source.close();
                } catch (IOException e) {
                    aggStr.append(tmp);
                }
                tmp.setLength(0);
            }
            isWithinToken = false;
            aggStr.append(c);
        } else {
            tmp.append(c);
            isWithinToken = true;
        }
    }
    if (isWithinToken) {
        try {
            TokenStream source = getAnalyzer().tokenStream(field, tmp.toString());
            source.reset();
            CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
            if (source.incrementToken()) {
                String term = termAtt.toString();
                if (term.length() == 0) {
                    // no tokens, just use what we have now
                    aggStr.append(tmp);
                } else {
                    aggStr.append(term);
                }
            } else {
                // no tokens, just use what we have now
                aggStr.append(tmp);
            }
            source.close();
        } catch (IOException e) {
            aggStr.append(tmp);
        }
    }

    return super.getWildcardQuery(field, aggStr.toString());
}

From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizerTest.java

License:Apache License

@Test
public void testNullSynonyms() throws Exception {
    Analyzer a = new NGramSynonymTokenizerTestAnalyzer(1);
    TokenStream stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,1,1/,1,2,1/,2,3,1/,3,4,1/,4,5,1/,5,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(2);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,2,1/,1,3,1/,2,4,1/,3,5,1/,4,6,1");
    stream.close();
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,1,1");
    stream.close();
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,2,1");

    a = new NGramSynonymTokenizerTestAnalyzer(3);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,3,1/,1,4,1/,2,5,1/,3,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(4);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,4,1/,1,5,1/,2,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(5);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,5,1/,1,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(6);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(7);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(8);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");
}

From source file:kafka.examples.Producer.java

License:Apache License

public void run() {
    while (true) {
        String access_token = "2.009F1d9BmHHChD7abcd6de0a0jui5Y";
        int count = 20;
        Timeline tm = new Timeline(access_token);
        Analyzer analyzer4 = new IKAnalyzer(false); // IK analyzer in fine-grained (non-smart) mode

        try {
            StatusWapper status = tm.getPublicTimeline(count, 0);
            //-------------------------------------------
            try {
                TokenStream tokenstream = analyzer4.tokenStream("", new StringReader(status.toString()));
                CharTermAttribute termAttribute = tokenstream.addAttribute(CharTermAttribute.class); // term text of each token

                tokenstream.reset(); // reset before consuming the stream

                while (tokenstream.incrementToken()) { // iterate over the tokens
                    String prTxt = new String(termAttribute.buffer(), 0, termAttribute.length());
                    //producer.send(new KeyedMessage<Integer, String>(topic, ptTxt + " "));
                    System.out.print(prTxt + "  ");
                }
                //System.out.println();
                tokenstream.close(); // close the TokenStream
            } catch (IOException e) {
                e.printStackTrace();
            }
            //-------------------------------------------
            producer.send(new KeyedMessage<Integer, String>(topic, status.toString()));
            Log.logInfo(status.toString());

        } catch (WeiboException e) {
            e.printStackTrace();
        }
    }
}

From source file:lia.chapter4.AnalyzerUtils.java

License:Apache License

public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));

    TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.getBytesRef().utf8ToString());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.close();
}