List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
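addAttribute registers the requested attribute with the stream if it is not already present and returns the stream's single shared instance; calling it again with the same class on the same stream returns the same object. The attribute is therefore fetched once, before consumption, and read inside the increment loop. A minimal sketch of the full consume cycle, assuming a recent Lucene version in which StandardAnalyzer has a no-argument constructor:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("field", new StringReader("some text to tokenize"))) {
            // addAttribute registers the attribute if absent and always returns
            // the stream's shared instance, so fetch it once before consuming.
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString()); // attribute state is updated in place per token
            }
            stream.end(); // records end-of-stream state such as the final offset
        } // try-with-resources closes the stream
    }
}

The examples below repeat this reset/increment/end/close pattern against different analyzers and attribute types.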
From source file:fr.inrialpes.exmo.ontosim.VectorSpaceMeasure.java
License:Open Source License
/**
 * Adds all words contained in toAnalyse to the words collection. Words are stemmed.
 * @param toAnalyse the string to be analysed
 * @param words the collection to which extracted words are added
 */
protected void analyseString(String toAnalyse, Collection<String> words) {
    TokenStream tokenS = analyzer.tokenStream("", new StringReader(toAnalyse));
    // TermAttribute is the pre-4.0 API; later versions use CharTermAttribute
    TermAttribute termAtt = tokenS.addAttribute(TermAttribute.class);
    try {
        tokenS.reset();
        while (tokenS.incrementToken()) {
            words.add(termAtt.term());
        }
        tokenS.end();
        tokenS.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:gr.aueb.demo.PropertyRegistryBean.java
public static String removeStopWords(String textFile) {
    // CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    CharArraySet stopWords = PropertyRegistryBean.stopSet;
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_48, new StringReader(textFile.trim()));
    tokenStream = new StopFilter(Version.LUCENE_48, tokenStream, stopWords);
    StringBuilder sb = new StringBuilder();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            sb.append(charTermAttribute.toString()).append(' ');
        }
        tokenStream.end();
        tokenStream.close();
    } catch (IOException ex) {
        Logger.getLogger(PropertyRegistryBean.class.getName()).log(Level.SEVERE, null, ex);
    }
    return sb.toString();
}
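A possible invocation of the helper above, assuming PropertyRegistryBean.stopSet has been initialized with the usual English stop words:

String cleaned = PropertyRegistryBean.removeStopWords("the quick brown fox");
// cleaned == "quick brown fox " (each kept term is followed by a space)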
From source file:ikanalyzer.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Build the IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        // Mixed Chinese/English sample text (the Chinese characters are garbled in this listing)
        ts = analyzer.tokenStream("myfield",
                new StringReader("?????IKAnalyer can analysis english text too"));
        // Offset attribute (start/end character positions)
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (and the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // Perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the TokenStream's resources (also closes the StringReader)
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:in.geocoder.component.GeocodingComponent.java
License:Apache License
private List<String> tokenize(String query, Analyzer analyzer) throws IOException {
    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute charTermAttr = tokenStream.addAttribute(CharTermAttribute.class);
    List<String> tokens = new ArrayList<String>();
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        tokens.add(charTermAttr.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}
From source file:indexer.IndexPrinter.java
String getAnalyzedContent(String content) throws IOException {
    StringBuffer tokenizedContentBuff = new StringBuffer();
    Analyzer analyzer = new StandardAnalyzer();
    TokenStream stream = analyzer.tokenStream(TextDocIndexer.FIELD_ANALYZED_CONTENT,
            new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    tokenizedContentBuff.append("\n");
    return tokenizedContentBuff.toString();
}
From source file:indexer.LineDocumentIndexer.java
Document constructDoc(FileWriter fw, String id, String line) throws Exception {
    Document doc = new Document();
    doc.add(new Field(DocVector.FIELD_ID, id, Field.Store.YES, Field.Index.NOT_ANALYZED));
    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(FIELD_WORDS, new StringReader(line));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString().toLowerCase();
        tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    tokenizedContentBuff.append("\n");
    fw.write(id + "\t" + tokenizedContentBuff.toString());
    // Store the raw line so it is reanalyzed at index time
    doc.add(new Field(FIELD_WORDS, line, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    return doc;
}
From source file:indexer.Paragraph.java
List<Paragraph> constructParagraphs(int docId, String content) throws Exception {
    List<Paragraph> parList = new ArrayList<>();
    List<String> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    int count = 0;
    int id = 0;
    while (stream.incrementToken()) {
        tokens.add(termAtt.toString());
        count++;
        if (count == paraWindowSize) {
            // Complete a paragraph every paraWindowSize tokens
            Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
            tokens.clear();
            count = 0;
            parList.add(p);
        }
    }
    if (count > 0) {
        // Flush the trailing partial paragraph
        Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
        parList.add(p);
    }
    stream.end();
    stream.close();
    return parList;
}
From source file:indexer.WordVecSequenceFileGenerator.java
String embedWords(Document d) throws Exception {
    String content = d.get(AMI_FIELDS.FIELD_CONTENT);
    int decScore = Integer.parseInt(d.get(AMI_FIELDS.FIELD_DECISION_SCORE)) > 0 ? 1 : 0;
    int prefScore = Integer.parseInt(d.get(AMI_FIELDS.FIELD_PREF_SCORE)) > 0 ? 1 : 0;
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringBuffer buff = new StringBuffer();
    boolean labelsStoredWithWords = Boolean.parseBoolean(prop.getProperty("word.labels", "false"));
    while (stream.incrementToken()) {
        String term = termAtt.toString().toLowerCase();
        String[] wordAndLabel = null;
        if (labelsStoredWithWords) {
            wordAndLabel = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM);
            term = wordAndLabel[0]; // the first part is the word
            decScore = Integer.parseInt(wordAndLabel[1]);
            prefScore = Integer.parseInt(wordAndLabel[2]);
        }
        double[] x = wvecs.getWordVector(term);
        if (x == null) {
            System.err.println("No vec found for word " + term);
            continue;
        }
        String wvec = vecToStr(x);
        if (decScore > 1)
            decScore = 1;
        if (prefScore > 1)
            prefScore = 1;
        buff.append(wvec).append("\t").append(decScore).append("\t").append(prefScore).append("\n");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
From source file:indexing.ReviewTextAnalyzer.java
License:Open Source License
/**
 * @param args
 */
public static void main(String[] args) {
    ReviewTextAnalyzer r = new ReviewTextAnalyzer(new ReviewDocumentIndexer());
    String[] filenames = { "review.txt" };
    for (String filename : filenames) {
        try {
            TokenStream tokstr = r.reusableTokenStream(null, new FileReader(filename));
            TermAttribute output_term = tokstr.addAttribute(TermAttribute.class);
            TypeAttribute output_type = tokstr.addAttribute(TypeAttribute.class);
            FlagsAttribute output_flags = tokstr.addAttribute(FlagsAttribute.class);
            PayloadAttribute output_payload = tokstr.addAttribute(PayloadAttribute.class);

            int review_id = r.indexer.theReviewId.get() + 1;
            r.indexer.theReviewId.set(review_id);
            r.indexer.theStats.setCurrent(review_id, 10);

            while (tokstr.incrementToken()) {
                Token current_token = new Token(output_term.term(), output_type.type(),
                        output_flags.getFlags(), new ReviewTermPayload(output_payload.getPayload()));
                System.out.print(current_token);
                if (current_token.isDelim(false)) {
                    System.out.println();
                }
                if (current_token.isDelim(true)) {
                    System.out.println("..................................................................\n");
                }
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println(
                "\n\n\n\n\n\n\n\n==================================================================\n\n\n\n\n\n\n\n");
    }
}
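This example targets the pre-4.0 API (reusableTokenStream, TermAttribute, and a Payload-backed PayloadAttribute). A sketch of the same multi-attribute inspection against the Lucene 4+ API, where term text comes from CharTermAttribute and payloads are plain BytesRef values; the field name and StandardAnalyzer are placeholders, and flags and payloads only carry data if the analysis chain actually sets them:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.BytesRef;

static void printTokens(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("review", new StringReader(text))) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flags = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payload = ts.addAttribute(PayloadAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            BytesRef p = payload.getPayload(); // null unless a filter in the chain set one
            System.out.println(term.toString() + " | " + type.type() + " | " + flags.getFlags()
                    + " | " + (p == null ? "no payload" : p.toString()));
        }
        ts.end();
    }
}

// Example call, e.g. printTokens(new StandardAnalyzer(), "some review text");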
From source file:info.johtani.elasticsearch.action.admin.indices.extended.analyze.TransportExtendedAnalyzeAction.java
License:Apache License
private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream,
        Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset)
        throws IOException {
    List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
    stream.reset();

    // Attributes read for each emitted token
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            lastPosition = lastPosition + increment;
        }
        tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition,
                lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
                extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    return tokens;
}