Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
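Since every example below follows the same contract, here is a minimal, self-contained sketch of that consumer workflow (the class name, the field name "body", and the sample text are illustrative, not taken from the examples below): reset() must be called before the first incrementToken(), end() after the last one, and close() releases the underlying Reader.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamConsumerSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources works because TokenStream implements Closeable
        try (TokenStream stream = analyzer.tokenStream("body", new StringReader("Lucene token streams"))) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            stream.end();                     // end-of-stream operations, e.g. sets the final offset
        }
        analyzer.close();
    }
}

Forgetting reset() is the most common mistake in the examples below; on recent Lucene versions it typically fails with an IllegalStateException reporting a TokenStream contract violation.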

Usage

From source file:fr.lipn.yasemir.weighting.ckpd.TermFactory.java

License:Open Source License

public static Vector<NGramTerm> makeTermSequence(String text) {
    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream("text", new StringReader(text));
        stream.reset(); // mandatory before the first incrementToken() call

        while (stream.incrementToken()) {
            //result.add(stream.getAttribute(TermAttribute.class).term()); //old way
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
    }

    Vector<NGramTerm> ngtv = new Vector<NGramTerm>();
    for (String s : result) {
        NGramTerm t = new NGramTerm(s);
        ngtv.add(t);
    }
    return ngtv;
}

From source file:fry.future.plugin.example.APP.java

private static List<String> tokenString(Analyzer analyzer, String str) throws IOException {
    List<String> result = new ArrayList<>();
    TokenStream tokenStream = analyzer.tokenStream("Test", new StringReader(str));
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        result.add(tokenStream.getAttribute(CharTermAttribute.class).toString());
    }
    tokenStream.end();
    tokenStream.close();
    return result;
}

From source file:gr.aueb.demo.PropertyRegistryBean.java

public static String removeStopWords(String textFile) {
    //CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    CharArraySet stopWords = PropertyRegistryBean.stopSet;
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_48, new StringReader(textFile.trim()));
    tokenStream = new StopFilter(Version.LUCENE_48, tokenStream, stopWords);
    StringBuilder sb = new StringBuilder();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            sb.append(charTermAttribute.toString()).append(' ');
        }
        tokenStream.end();
        tokenStream.close();
    } catch (IOException ex) {
        Logger.getLogger(PropertyRegistryBean.class.getName()).log(Level.SEVERE, null, ex);
    }
    return sb.toString();
}

From source file:hivemall.nlp.tokenizer.KuromojiUDF.java

License:Apache License

private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results) throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAttr.toString();
        results.add(new Text(term));
    }
}

From source file:ikanalyzer.IKAnalzyerDemo.java

License:Apache License

public static void main(String[] args) {
    // Build an IK analyzer that uses the "smart" segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    // Obtain a Lucene TokenStream object
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "?????IKAnalyer can analysis english text too"));
        // Get the token offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Get the token text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Get the token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // End the TokenStream; performs end-of-stream operations,
        // e.g. setting the final offset.
        ts.end();

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the TokenStream to release its resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

From source file:in.geocoder.component.GeocodingComponent.java

License:Apache License

private List<String> tokenize(String query, Analyzer analyzer) throws IOException {
    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute charTermAttr = tokenStream.addAttribute(CharTermAttribute.class);
    List<String> tokens = new ArrayList<String>();

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        tokens.add(charTermAttr.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}

From source file:indexer.IndexPrinter.java

String getAnalyzedContent(String content) throws IOException {
    StringBuffer tokenizedContentBuff = new StringBuffer();
    Analyzer analyzer = new StandardAnalyzer();
    TokenStream stream = analyzer.tokenStream(TextDocIndexer.FIELD_ANALYZED_CONTENT, new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAtt.toString();
        tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();

    tokenizedContentBuff.append("\n");
    return tokenizedContentBuff.toString();
}

From source file:indexer.LineDocumentIndexer.java

Document constructDoc(FileWriter fw, String id, String line) throws Exception {

    Document doc = new Document();
    doc.add(new Field(DocVector.FIELD_ID, id, Field.Store.YES, Field.Index.NOT_ANALYZED));

    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(FIELD_WORDS, new StringReader(line));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        tokenizedContentBuff.append(term).append(" ");
    }

    stream.end();
    stream.close();

    tokenizedContentBuff.append("\n");
    fw.write(id + "\t" + tokenizedContentBuff.toString());

    // Reanalyze
    doc.add(new Field(FIELD_WORDS, line, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    return doc;
}

From source file:indexer.Paragraph.java

List<Paragraph> constructParagraphs(int docId, String content) throws Exception {
    List<Paragraph> parList = new ArrayList<>();

    List<String> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    int count = 0;
    int id = 0;
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        tokens.add(term);
        count++;
        if (count == paraWindowSize) {
            // create a paragraph
            Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
            tokens.clear();
            count = 0;
            parList.add(p);
        }
    }
    if (count > 0) {
        Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
        parList.add(p);
    }

    stream.end();
    stream.close();

    return parList;
}

From source file:indexer.WordVecSequenceFileGenerator.java

String embedWords(Document d) throws Exception {
    String content = d.get(AMI_FIELDS.FIELD_CONTENT);
    int decScore = Integer.parseInt(d.get(AMI_FIELDS.FIELD_DECISION_SCORE)) > 0 ? 1 : 0;
    int prefScore = Integer.parseInt(d.get(AMI_FIELDS.FIELD_PREF_SCORE)) > 0 ? 1 : 0;

    List<String> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    StringBuffer buff = new StringBuffer();
    boolean labelsStoredWithWords = Boolean.parseBoolean(prop.getProperty("word.labels", "false"));

    while (stream.incrementToken()) {
        String term = termAtt.toString().toLowerCase();
        String[] wordAndLabel = null;

        if (labelsStoredWithWords) {
            wordAndLabel = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM);
            term = wordAndLabel[0]; // the first part is the word
            decScore = Integer.parseInt(wordAndLabel[1]);
            prefScore = Integer.parseInt(wordAndLabel[2]);
        }

        double[] x = wvecs.getWordVector(term);
        if (x == null) {
            System.err.println("No vec found for word " + term);
            continue;
        }

        String wvec = vecToStr(x);
        if (decScore > 1)
            decScore = 1;
        if (prefScore > 1)
            prefScore = 1;
        buff.append(wvec).append("\t").append(decScore).append("\t").append(prefScore).append("\n");
    }
    stream.end();
    stream.close();

    return buff.toString();
}