Example usage for org.apache.lucene.analysis TokenStream end

List of usage examples for org.apache.lucene.analysis TokenStream end

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream end.

Prototype

public void end() throws IOException 

Document

This method is called by the consumer after the last token has been consumed, after #incrementToken() returned false (using the new TokenStream API).
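
A minimal sketch of the full consumer workflow around end() — reset(), incrementToken(), end(), close() — is shown below. The StandardAnalyzer, the field name "body", and the input text are illustrative assumptions, not taken from the examples that follow:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamEndExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        List<String> terms = new ArrayList<>();
        // try-with-resources guarantees close() even if consumption fails
        try (TokenStream stream = analyzer.tokenStream("body", "Lucene token streams must be ended")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                terms.add(term.toString());
            }
            stream.end();                     // records end-of-stream state, e.g. the final offset
            System.out.println("final offset: " + offset.endOffset());
        }
        System.out.println(terms);
    }
}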

Usage

From source file:edu.virginia.cs.utility.StringTokenizer.java

/**
 * Method that generates list of tokens from the parameter string.
 *
 * @param string the string to tokenize
 * @return list of tokens generated
 */
public List<String> TokenizeString(String string) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}

From source file:elhuyar.bilakit.PayloadQParserPlugin.java

License:Open Source License

@Override
protected Query getFieldQuery(String field, String queryText, boolean quoted) throws SyntaxError {
    SchemaField sf = this.schema.getFieldOrNull(field);
    if (!quoted && sf != null && sf.getType().getTypeName().endsWith("_payloads")) {
        //analyze queryText
        List<String> result = new ArrayList<String>();
        try {
            TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(queryText));
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            //    not thrown b/c we're using a string reader...
            throw new RuntimeException(e);
        }
        // Join the analyzed terms back into a whitespace-separated query string
        queryText = result.toString().replaceAll("\\[|\\]", "").replaceAll(", ", " ");
        // Note that this will work for any field defined with the
        //    <fieldType> of "*_payloads"
        Query plter = new PayloadTermQuery(new Term(field, queryText), new AveragePayloadFunction(), true);

        return plter;

    }
    return super.getFieldQuery(field, queryText, quoted);
}

From source file:ikanalyzer.IKAnalzyerDemo.java

License:Apache License

public static void main(String[] args) {
    // Construct the IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "?????IKAnalyer can analysis english text too"));
        // Offset attribute (start and end offsets of each token)
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term attribute (token text)
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Type attribute (lexical type of each token)
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // Notify the TokenStream that the end of the StringReader has been reached;
        // end() performs end-of-stream operations, e.g. setting the final offset.
        ts.end();

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the TokenStream to release its resources (also closes the StringReader)
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

From source file:indexer.LineDocumentIndexer.java

Document constructDoc(FileWriter fw, String id, String line) throws Exception {

    Document doc = new Document();
    doc.add(new Field(DocVector.FIELD_ID, id, Field.Store.YES, Field.Index.NOT_ANALYZED));

    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(FIELD_WORDS, new StringReader(line));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        tokenizedContentBuff.append(term).append(" ");
    }

    stream.end();
    stream.close();

    tokenizedContentBuff.append("\n");
    fw.write(id + "\t" + tokenizedContentBuff.toString());

    // Reanalyze
    doc.add(new Field(FIELD_WORDS, line, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    return doc;
}

From source file:indexer.Paragraph.java

List<Paragraph> constructParagraphs(int docId, String content) throws Exception {
    List<Paragraph> parList = new ArrayList<>();

    List<String> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    int count = 0;
    int id = 0;
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        tokens.add(term);
        count++;
        if (count == paraWindowSize) {
            // create a paragraph
            Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
            tokens.clear();
            count = 0;
            parList.add(p);
        }
    }
    if (count > 0) {
        Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
        parList.add(p);
    }

    stream.end();
    stream.close();

    return parList;
}

From source file:info.johtani.elasticsearch.action.admin.indices.extended.analyze.TransportExtendedAnalyzeAction.java

License:Apache License

private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream,
        Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset)
        throws IOException {
    List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
    stream.reset();

    //and each tokens output
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            lastPosition = lastPosition + increment;
        }

        tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition,
                lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
                extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    return tokens;

}

From source file:io.anserini.analysis.TweetTokenizationTest.java

License:Apache License

public List<String> parseKeywords(Analyzer analyzer, String keywords) throws IOException {
    List<String> list = new ArrayList<>();

    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords));
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (cattr.toString().length() == 0) {
            continue;
        }
        list.add(cattr.toString());
    }
    tokenStream.end();
    tokenStream.close();

    return list;
}

From source file:it.cnr.ilc.lc.clavius.search.ClaviusHighlighter.java

public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, String idDoc,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {

    List<Annotation> ret = new ArrayList<>();

    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    Scorer fragmentScorer = getFragmentScorer();
    Fragmenter textFragmenter = getTextFragmenter();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();
    Encoder encoder = getEncoder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    //        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
    try {

        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        //textFragmenter.start(text, tokenStream);

        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);

        tokenStream.reset();
        // log.info("tokenGroup.getNumTokens() A: " + tokenGroup.getNumTokens());

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {

            //                if ((offsetAtt.endOffset() > text.length())
            //                        || (offsetAtt.startOffset() > text.length())) {
            //                    throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
            //                            + " exceeds length of provided text sized " + text.length());
            //                }
            //  log.info("newText: A (" + newText.toString() + "), fragmentScorer.getTokenScore()("+fragmentScorer.getTokenScore()+")");
            tokenGroup.addToken(fragmentScorer.getTokenScore());

        } // END FOR
          //  log.info("tokenGroup.getNumTokens() B: " + tokenGroup.getNumTokens());

        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            //log.info("tokenGroup[" + i + "]: token: " + tokenGroup.getToken(i) + ", score: " + tokenGroup.getScore(i));
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(idDoc);
                //contesto sinistro
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);
                //contesto destro
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1
                                : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());

                a.setConcept("");
                a.setType("");
                a.setIdNeo4j(-1l);
                a.setPageNum(-1l);
                a.setResourceObject("");
                a.setId(-1l);

                ret.add(a);
            }
        }

        return ret;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
                // ignore: nothing useful can be done if releasing the stream fails
            }
        }
    }
}

From source file:it.cnr.ilc.lc.claviusweb.fulltextsearch.ClaviusHighlighter.java

public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, Document document,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {

    List<Annotation> ret = new ArrayList<>();

    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    Scorer fragmentScorer = getFragmentScorer();
    Fragmenter textFragmenter = getTextFragmenter();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();
    Encoder encoder = getEncoder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    //        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
    try {

        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        //textFragmenter.start(text, tokenStream);

        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);

        tokenStream.reset();
        //log.info("tokenGroup.getNumTokens() A: " + tokenGroup.getNumTokens());

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {

            //                if ((offsetAtt.endOffset() > text.length())
            //                        || (offsetAtt.startOffset() > text.length())) {
            //                    throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
            //                            + " exceeds length of provided text sized " + text.length());
            //                }
            //  log.info("newText: A (" + newText.toString() + "), fragmentScorer.getTokenScore()("+fragmentScorer.getTokenScore()+")");
            tokenGroup.addToken(fragmentScorer.getTokenScore());

        } // END FOR
          //log.info("tokenGroup.getNumTokens() B: " + tokenGroup.getNumTokens());

        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            //log.info("tokenGroup[" + i + "]: token: " + tokenGroup.getToken(i) + ", score: " + tokenGroup.getScore(i));
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(document.get("idDoc"));
                //contesto sinistro
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);
                //contesto destro
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1
                                : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());

                a.setConcept("");
                a.setType("");
                a.setPageNum(-1l);
                // a.setIdNeo4j(Long.parseLong(document.get("idNeo4j")));
                a.setIdNeo4j(Long.parseLong(document.get("idDoc")));
                a.setResourceObject("");
                a.setId(-1l);

                ret.add(a);
            }
        }

        return ret;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
                // ignore: nothing useful can be done if releasing the stream fails
            }
        }
    }
}

From source file:it.cnr.isti.hpc.dexter.analysis.SpotCleaner.java

License:Apache License

public String clean(String spot) throws IOException {
    try {
        spot = URLDecoder.decode(spot, "UTF-8");
    } catch (IllegalArgumentException e) {

    }

    analyzer.lowercase(spot.length() > 4);

    TokenStream ts = analyzer.tokenStream("content", new StringReader(spot));

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    sb.setLength(0);
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
        sb.append(termAtt.toString());
        sb.append(' ');
        if (tokens > maxSpotLength) {
            return "";
        }
    }
    ts.end();
    ts.close(); // close (not reset) after end(), otherwise the next tokenStream() call fails
    if (sb.length() > 0)
        sb.setLength(sb.length() - 1);
    // System.out.println(spot + " -> " + "[" + sb.toString() + "]");
    String finalSpot = sb.toString();
    for (Filter<String> filter : filters) {
        if (filter.isFilter(finalSpot)) {
            finalSpot = "";
        }
    }
    return finalSpot;
}