Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
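
Before the project-specific examples below, here is a minimal, self-contained sketch of the typical consumer workflow around incrementToken() (reset, iterate, end, close). It is only an illustration, not taken from any of the source files on this page, and it assumes a recent Lucene release (5.x or later) where StandardAnalyzer has a no-argument constructor and Analyzer.tokenStream(String, String) is available; the field name "content" and the sample text are placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            // Obtain a TokenStream for a placeholder field name and some sample text.
            TokenStream stream = analyzer.tokenStream("content", "Lucene token streams in action");
            // Register the attribute to read before consuming the stream.
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            try {
                stream.reset();                    // required before the first incrementToken()
                while (stream.incrementToken()) {  // advances to the next token; returns false at end of stream
                    System.out.println(term.toString());
                }
                stream.end();                      // records end-of-stream state such as the final offset
            } finally {
                stream.close();                    // releases resources so the analyzer can reuse the stream
            }
        }
    }
}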

Usage

From source file: indexing.ReviewTextAnalyzer.java

License: Open Source License

/**
 * @param args
 */
public static void main(String[] args) {
    ReviewTextAnalyzer r = new ReviewTextAnalyzer(new ReviewDocumentIndexer());
    String[] filenames = { "review.txt" };
    for (String filename : filenames) {
        try {
            TokenStream tokstr = r.reusableTokenStream(null, new FileReader(filename));

            TermAttribute output_term = tokstr.addAttribute(TermAttribute.class);
            TypeAttribute output_type = tokstr.addAttribute(TypeAttribute.class);
            FlagsAttribute output_flags = tokstr.addAttribute(FlagsAttribute.class);
            PayloadAttribute output_payload = tokstr.addAttribute(PayloadAttribute.class);

            int review_id = r.indexer.theReviewId.get() + 1;
            r.indexer.theReviewId.set(review_id);
            r.indexer.theStats.setCurrent(review_id, 10);

            while (tokstr.incrementToken()) {

                Token current_token = new Token(output_term.term(), output_type.type(), output_flags.getFlags(),
                        new ReviewTermPayload(output_payload.getPayload()));

                System.out.print(current_token);

                if (current_token.isDelim(false)) {
                    System.out.println();
                }
                if (current_token.isDelim(true)) {
                    System.out.println("..................................................................\n");
                }
            }

            System.out.println();

        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        System.out.println(
                "\n\n\n\n\n\n\n\n==================================================================\n\n\n\n\n\n\n\n");
    }

    return;
}

From source file: info.johtani.elasticsearch.action.admin.indices.extended.analyze.TransportExtendedAnalyzeAction.java

License: Apache License

private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream,
        Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset)
        throws IOException {
    List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
    stream.reset();

    // attributes used to read each token's output
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            lastPosition = lastPosition + increment;
        }

        tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition,
                lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
                extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    return tokens;

}

From source file: info.johtani.jjug.lucene.sample.TokenizeSample.java

License: Apache License

private static void printToken(String text, Analyzer analyzer) {
    System.out.println("--- Original: [" + text + "]");
    try {
        TokenStream tokens = analyzer.tokenStream("content", text);
        tokens.reset();
        CharTermAttribute termAttr = tokens.getAttribute(CharTermAttribute.class);
        while (tokens.incrementToken()) {
            System.out.println("[" + termAttr.toString() + "]");
        }
        tokens.end();
        tokens.close();
    } catch (IOException e) {
        e.printStackTrace();
    }

}

From source file: io.anserini.analysis.TweetTokenizationTest.java

License: Apache License

public List<String> parseKeywords(Analyzer analyzer, String keywords) throws IOException {
    List<String> list = new ArrayList<>();

    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords));
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (cattr.toString().length() == 0) {
            continue;
        }
        list.add(cattr.toString());
    }
    tokenStream.end();
    tokenStream.close();

    return list;
}

From source file: io.bdrc.lucene.bo.TibetanAnalyzerTest.java

License: Apache License

static private void assertOffsets(String inputStr, TokenStream tokenStream, List<String> expected) {
    try {
        List<String> termList = new ArrayList<String>();
        // CharTermAttribute charTermAttribute =
        // tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
        while (tokenStream.incrementToken()) {
            int start = offsetAttr.startOffset();
            int end = offsetAttr.endOffset();
            termList.add(inputStr.substring(start, end));
        }
        System.out.println(String.join(" ", termList));
        assertThat(termList, is(expected));
    } catch (IOException e) {
        assertTrue(false);
    }
}

From source file: io.bdrc.lucene.bo.TibetanAnalyzerTest.java

License: Apache License

static private void assertTokenStream(TokenStream tokenStream, List<String> expected) {
    try {
        List<String> termList = new ArrayList<String>();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            termList.add(charTermAttribute.toString());
        }
        System.out.println(String.join(" ", termList));
        assertThat(termList, is(expected));
    } catch (IOException e) {
        assertTrue(false);
    }
}

From source file: io.bdrc.lucene.bo.TibetanAnalyzerTest.java

License: Apache License

@Test
public void ewtsFilterTest() throws IOException {
    System.out.println("Testing TibEwtsFilter()");
    String input = "bod rgyal lo invalid ";
    Reader reader = new StringReader(input);
    List<String> expected = Arrays.asList("", "", "", "", "", "");
    System.out.print(input + " => ");
    TokenStream res = tokenize(new TibEwtsFilter(reader, "ewts"), new TibSyllableTokenizer());
    assertTokenStream(res, expected);
    // long string, provoked a bug
    input = "de'i sprul sku yi ngogs chos rje dge 'dun mkhas grub ni mkhan chen dge 'dun rgya mtsho'i gyi sku tshar chu khyi (1742) lor 'khrungs/ rje de nyid las bcu gsum pa la rab tu byung/ dgon chen du mdo sngags la sbyangs/ rig gnas thams cad la mkhas/ nyer gcig pa chu sprel la smon lam rab 'byams pa mdzad/ kun mkhyen bar mas mgo 'dren mdzad de lcang skya rin po chen nas chos rje'i cho lo gnang/ mkhan chen gshegs par dngul srang stong dang nyis brgyas mchod rten bzhengs/ lcags byi lor rgyud khrir bzhugs/ bde mchog yi dam mdzad/ gsung rtsom yang 'ga' zhig snang/ bdun cu pa lcags byi  (mdo smad chos 'byung du bdun cu pa lcags byi lor gshegs pa zer ba lo grangs dang lo snying thod mi thug pa dpyad gzhi ru sor bzhag byas pa) lor gshegs/ de'i sprul sku dge 'dun yon tan rgya mtsho chos srid kyi mkhyen rgya che zhing rgyud pa'i khri mdzad/ de'i sprul sku yi ngogs nas 'khrungs pa dkon mchog rgyal mtshan da lta bzhugs";
    reader = new StringReader(input);
    res = tokenize(new TibEwtsFilter(reader), new TibSyllableTokenizer());
    while (res.incrementToken()) {
    } // will trigger the exception in case of a bug
      // dts
    input = "dul-ba rnam-par-byed-pa";
    reader = new StringReader(input);
    expected = Arrays.asList("", "", "", "", "", "");
    System.out.print(input + " => ");
    res = tokenize(new TibEwtsFilter(reader, "dts"), new TibSyllableTokenizer());
    assertTokenStream(res, expected);
    // alalc
    input = "Ri-gi-?-ra";
    reader = new StringReader(input);
    expected = Arrays.asList("", "", "", "");
    System.out.print(input + " => ");
    res = tokenize(new TibEwtsFilter(reader, "alalc"), new TibSyllableTokenizer());
    // assertTokenStream(res, expected); // commented to build
}

From source file: it.cnr.ilc.lc.clavius.search.ClaviusHighlighter.java

public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, String idDoc,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {

    List<Annotation> ret = new ArrayList<>();

    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    Scorer fragmentScorer = getFragmentScorer();
    Fragmenter textFragmenter = getTextFragmenter();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();
    Encoder encoder = getEncoder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    //        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
    try {

        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        //textFragmenter.start(text, tokenStream);

        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);

        tokenStream.reset();
        // log.info("tokenGroup.getNumTokens() A: " + tokenGroup.getNumTokens());

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {

            //                if ((offsetAtt.endOffset() > text.length())
            //                        || (offsetAtt.startOffset() > text.length())) {
            //                    throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
            //                            + " exceeds length of provided text sized " + text.length());
            //                }
            //  log.info("newText: A (" + newText.toString() + "), fragmentScorer.getTokenScore()("+fragmentScorer.getTokenScore()+")");
            tokenGroup.addToken(fragmentScorer.getTokenScore());

        } // END FOR
          //  log.info("tokenGroup.getNumTokens() B: " + tokenGroup.getNumTokens());

        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            //log.info("tokenGroup[" + i + "]: token: " + tokenGroup.getToken(i) + ", score: " + tokenGroup.getScore(i));
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(idDoc);
                //contesto sinistro
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);
                //contesto destro
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1
                                : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());

                a.setConcept("");
                a.setType("");
                a.setIdNeo4j(-1l);
                a.setPageNum(-1l);
                a.setResourceObject("");
                a.setId(-1l);

                ret.add(a);
            }
        }

        return ret;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}

From source file: it.cnr.ilc.lc.claviusweb.fulltextsearch.ClaviusHighlighter.java

public final List<Annotation> getBestTextClaviusFragments(TokenStream tokenStream, Document document,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {

    List<Annotation> ret = new ArrayList<>();

    ArrayList<ClaviusTextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    Scorer fragmentScorer = getFragmentScorer();
    Fragmenter textFragmenter = getTextFragmenter();
    int maxDocCharsToAnalyze = getMaxDocCharsToAnalyze();
    Encoder encoder = getEncoder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    ClaviusTextFragment currentFrag = new ClaviusTextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    //        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
    try {

        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        //textFragmenter.start(text, tokenStream);

        ClaviusTokenGroup tokenGroup = new ClaviusTokenGroup(tokenStream);

        tokenStream.reset();
        //log.info("tokenGroup.getNumTokens() A: " + tokenGroup.getNumTokens());

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {

            //                if ((offsetAtt.endOffset() > text.length())
            //                        || (offsetAtt.startOffset() > text.length())) {
            //                    throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
            //                            + " exceeds length of provided text sized " + text.length());
            //                }
            //  log.info("newText: A (" + newText.toString() + "), fragmentScorer.getTokenScore()("+fragmentScorer.getTokenScore()+")");
            tokenGroup.addToken(fragmentScorer.getTokenScore());

        } // END FOR
          //log.info("tokenGroup.getNumTokens() B: " + tokenGroup.getNumTokens());

        for (int i = 0; i < tokenGroup.getNumTokens(); i++) {
            //log.info("tokenGroup[" + i + "]: token: " + tokenGroup.getToken(i) + ", score: " + tokenGroup.getScore(i));
            if (tokenGroup.getScore(i) > 0) {
                Annotation a = new Annotation();
                a.setMatched(tokenGroup.getToken(i).toString());
                a.setIdDoc(document.get("idDoc"));
                //contesto sinistro
                Token[] t = Arrays.copyOfRange(tokenGroup.getTokens(), (i > ctxLenght) ? i - ctxLenght : 0, i);
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setLeftContext(sb.toString());
                sb.setLength(0);
                //contesto destro
                t = Arrays.copyOfRange(tokenGroup.getTokens(), i + 1,
                        (i + ctxLenght + 1 < tokenGroup.getNumTokens() ? i + ctxLenght + 1
                                : tokenGroup.getNumTokens()));
                sb = new StringBuilder();
                for (int j = 0; j < t.length; j++) {
                    sb.append(t[j].toString());
                    if (j < t.length - 1) {
                        sb.append(" ");
                    }
                }
                a.setRightContext(sb.toString());

                a.setConcept("");
                a.setType("");
                a.setPageNum(-1l);
                // a.setIdNeo4j(Long.parseLong(document.get("idNeo4j")));
                a.setIdNeo4j(Long.parseLong(document.get("idDoc")));
                a.setResourceObject("");
                a.setId(-1l);

                ret.add(a);
            }
        }

        return ret;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}

From source file: it.cnr.isti.hpc.dexter.analysis.DexterAnalyzer.java

License: Apache License

public static void main(String[] args) throws IOException {
    String str = "<body>perch";
    Analyzer anal = new DexterAnalyzer();
    TokenStream ts = anal.tokenStream("content", new StringReader(str));

    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(termAtt.toString().substring(0, termAtt.length()));
        System.out.println("token start offset: " + offsetAtt.startOffset());
        System.out.println("  token end offset: " + offsetAtt.endOffset());
    }
}