Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream#reset().

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using #incrementToken().
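
For reference, here is a minimal, self-contained sketch of that contract. It is an illustration, not one of the collected examples: it assumes Lucene 5.x or later (where StandardAnalyzer has a no-argument constructor), and the class name TokenStreamResetSketch and the field name "field" are placeholders. The consumer calls reset() once before the first #incrementToken(), then end() and close() when consumption is finished.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetSketch {
    public static void main(String[] args) throws IOException {
        // assumes Lucene 5.x+; the analyzer, field name, and sample text are illustrative
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("field", "a quick TokenStream reset example")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // mandatory before the first incrementToken() call
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end(); // record end-of-stream state, e.g. the final offset
        } // try-with-resources closes the stream, then the analyzer
    }
}

The examples below all follow this reset/incrementToken/end/close sequence; they differ mainly in which attributes they read from the stream.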

Usage

From source file:hivemall.nlp.tokenizer.KuromojiUDF.java

License:Apache License

private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results) throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAttr.toString();
        results.add(new Text(term));
    }
}

From source file:ikanalyzer.IKAnalzyerDemo.java

License:Apache License

public static void main(String[] args) {
    // Build the IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "?????IKAnalyer can analysis english text too"));
        // Token offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Token text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // Reset the TokenStream (and the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the TokenStream and release its resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

From source file:in.geocoder.component.GeocodingComponent.java

License:Apache License

private List<String> tokenize(String query, Analyzer analyzer) throws IOException {
    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute charTermAttr = tokenStream.addAttribute(CharTermAttribute.class);
    List<String> tokens = new ArrayList<String>();

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        tokens.add(charTermAttr.toString()); // toString() never returns null
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}

From source file:indexer.IndexPrinter.java

String getAnalyzedContent(String content) throws IOException {
    StringBuffer tokenizedContentBuff = new StringBuffer();
    Analyzer analyzer = new StandardAnalyzer();
    TokenStream stream = analyzer.tokenStream(TextDocIndexer.FIELD_ANALYZED_CONTENT, new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAtt.toString();
        tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    tokenizedContentBuff.append("\n");
    return tokenizedContentBuff.toString();
}

From source file:indexer.LineDocumentIndexer.java

Document constructDoc(FileWriter fw, String id, String line) throws Exception {

    Document doc = new Document();
    doc.add(new Field(DocVector.FIELD_ID, id, Field.Store.YES, Field.Index.NOT_ANALYZED));

    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(FIELD_WORDS, new StringReader(line));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        tokenizedContentBuff.append(term).append(" ");
    }

    stream.end();
    stream.close();

    tokenizedContentBuff.append("\n");
    fw.write(id + "\t" + tokenizedContentBuff.toString());

    // Store the raw line; the field is re-analyzed at indexing time
    doc.add(new Field(FIELD_WORDS, line, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    return doc;
}

From source file:indexer.Paragraph.java

List<Paragraph> constructParagraphs(int docId, String content) throws Exception {
    List<Paragraph> parList = new ArrayList<>();

    List<String> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    int count = 0;
    int id = 0;
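    // group consecutive tokens into fixed-size chunks of paraWindowSize terms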
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        tokens.add(term);
        count++;
        if (count == paraWindowSize) {
            // create a paragraph
            Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
            tokens.clear();
            count = 0;
            parList.add(p);
        }
    }
    if (count > 0) {
        Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
        parList.add(p);
    }

    stream.end();
    stream.close();

    return parList;
}

From source file:indexer.WordVecSequenceFileGenerator.java

String embedWords(Document d) throws Exception {
    String content = d.get(AMI_FIELDS.FIELD_CONTENT);
    int decScore = Integer.parseInt(d.get(AMI_FIELDS.FIELD_DECISION_SCORE)) > 0 ? 1 : 0;
    int prefScore = Integer.parseInt(d.get(AMI_FIELDS.FIELD_PREF_SCORE)) > 0 ? 1 : 0;

    List<String> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    StringBuffer buff = new StringBuffer();
    boolean labelsStoredWithWords = Boolean.parseBoolean(prop.getProperty("word.labels", "false"));
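    // when true, each token is encoded as word<WORD_LABEL_DELIM>decScore<WORD_LABEL_DELIM>prefScore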

    while (stream.incrementToken()) {
        String term = termAtt.toString().toLowerCase();
        String[] wordAndLabel = null;

        if (labelsStoredWithWords) {
            wordAndLabel = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM);
            term = wordAndLabel[0]; // the first part is the word
            decScore = Integer.parseInt(wordAndLabel[1]);
            prefScore = Integer.parseInt(wordAndLabel[2]);
        }

        double[] x = wvecs.getWordVector(term);
        if (x == null) {
            System.err.println("No vec found for word " + term);
            continue;
        }

        String wvec = vecToStr(x);
        if (decScore > 1)
            decScore = 1;
        if (prefScore > 1)
            prefScore = 1;
        buff.append(wvec).append("\t").append(decScore).append("\t").append(prefScore).append("\n");
    }
    stream.end();
    stream.close();

    return buff.toString();
}

From source file:info.johtani.elasticsearch.action.admin.indices.extended.analyze.TransportExtendedAnalyzeAction.java

License:Apache License

private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream,
        Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset)
        throws IOException {
    List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
    stream.reset();

    // attributes to read for each token
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
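        // accumulate the absolute token position from the relative position increments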
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            lastPosition = lastPosition + increment;
        }

        tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition,
                lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
                extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    return tokens;

}

From source file:info.johtani.jjug.lucene.sample.TokenizeSample.java

License:Apache License

private static void printToken(String text, Analyzer analyzer) {
    System.out.println("--- Original: [" + text + "]");
    try {
        TokenStream tokens = analyzer.tokenStream("content", text);
        tokens.reset();
        CharTermAttribute termAttr = tokens.getAttribute(CharTermAttribute.class);
        while (tokens.incrementToken()) {
            System.out.println("[" + termAttr.toString() + "]");
        }
        tokens.end();
        tokens.close();
    } catch (IOException e) {
        e.printStackTrace();
    }

}

From source file:io.anserini.analysis.TweetTokenizationTest.java

License:Apache License

public List<String> parseKeywords(Analyzer analyzer, String keywords) throws IOException {
    List<String> list = new ArrayList<>();

    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(keywords));
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (cattr.toString().length() == 0) {
            continue;
        }
        list.add(cattr.toString());
    }
    tokenStream.end();
    tokenStream.close();

    return list;
}