Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Source Link

Document

The caller must pass in a Class&lt;? extends Attribute&gt; object identifying the attribute to add or retrieve.

Usage

From source file:fr.inrialpes.exmo.ontosim.VectorSpaceMeasure.java

License:Open Source License

/**
 * Adds all words contained in {@code toAnalyse} to the {@code words} collection.
 * Words are stemmed/normalized by the configured {@code analyzer}.
 *
 * @param toAnalyse the string to be analysed
 * @param words     the collection to which extracted words are added
 */
protected void analyseString(String toAnalyse, Collection<String> words) {
    TokenStream tokenS = analyzer.tokenStream("", new StringReader(toAnalyse));
    TermAttribute termAtt = tokenS.addAttribute(TermAttribute.class);
    try {
        // The TokenStream contract requires reset() before incrementToken();
        // older Lucene streams tolerated skipping it, but calling it is always safe.
        tokenS.reset();
        while (tokenS.incrementToken()) {
            words.add(termAtt.term());
        }
        tokenS.end();
    } catch (IOException e) {
        // Tokenizing an in-memory string should not fail; report rather than rethrow
        // to preserve the original best-effort behavior.
        e.printStackTrace();
    } finally {
        try {
            tokenS.close(); // was leaked in the original
        } catch (IOException ignored) {
            // nothing sensible to do if close fails
        }
    }
}

From source file:gr.aueb.demo.PropertyRegistryBean.java

/**
 * Removes stop words (from {@code PropertyRegistryBean.stopSet}) from the given text.
 *
 * @param textFile the raw text to filter
 * @return the surviving terms joined by single spaces (with a trailing space when non-empty)
 */
public static String removeStopWords(String textFile) {
    CharArraySet stopWords = PropertyRegistryBean.stopSet;
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_48, new StringReader(textFile.trim()));
    tokenStream = new StopFilter(Version.LUCENE_48, tokenStream, stopWords);
    StringBuilder sb = new StringBuilder();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            // Chained appends avoid the intermediate String built by (term + " ").
            sb.append(charTermAttribute.toString()).append(' ');
        }
        tokenStream.end();
    } catch (IOException ex) {
        Logger.getLogger(PropertyRegistryBean.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        try {
            tokenStream.close(); // was leaked in the original
        } catch (IOException ex) {
            Logger.getLogger(PropertyRegistryBean.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    return sb.toString();
}

From source file:ikanalyzer.IKAnalzyerDemo.java

License:Apache License

/**
 * Demonstrates IK Analyzer tokenization: prints the offset range, term text
 * and lexical type of every token produced from a mixed-language sample string.
 */
public static void main(String[] args) {
    // Use IK's "smart" segmentation mode.
    Analyzer analyzer = new IKAnalyzer(true);

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream("myfield", new StringReader(
                "?????IKAnalyer can analysis english text too"));
        // Character offsets of each token within the input.
        OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
        // The token's text.
        CharTermAttribute termAttr = tokenStream.addAttribute(CharTermAttribute.class);
        // The lexical type assigned by the tokenizer.
        TypeAttribute typeAttr = tokenStream.addAttribute(TypeAttribute.class);

        // Bind the stream to its reader before consuming tokens.
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            System.out.println(offsetAttr.startOffset() + " - " + offsetAttr.endOffset() + " : "
                    + termAttr.toString() + " | " + typeAttr.type());
        }
        // Perform end-of-stream operations, e.g. set the final
        // offset.
        tokenStream.end();

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the stream and its underlying reader.
        if (tokenStream != null) {
            try {
                tokenStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

From source file:in.geocoder.component.GeocodingComponent.java

License:Apache License

/**
 * Splits {@code query} into analyzed terms using the supplied analyzer.
 *
 * @param query    the text to tokenize
 * @param analyzer the analyzer producing the token stream
 * @return the analyzed terms, in stream order
 * @throws IOException if tokenization fails
 */
private List<String> tokenize(String query, Analyzer analyzer) throws IOException {
    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(query));
    // addAttribute is generic; the explicit cast in the original was redundant.
    CharTermAttribute charTermAttr = tokenStream.addAttribute(CharTermAttribute.class);
    List<String> tokens = new ArrayList<String>();

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            // CharTermAttribute.toString() never returns null, so the original
            // null check was dead code.
            tokens.add(charTermAttr.toString());
        }
        tokenStream.end(); // finalize stream state (e.g. final offset)
    } finally {
        tokenStream.close(); // was leaked in the original
    }
    return tokens;
}

From source file:indexer.IndexPrinter.java

/**
 * Re-analyzes {@code content} with a StandardAnalyzer and returns the
 * space-separated analyzed terms, terminated by a newline.
 *
 * @param content raw text to analyze
 * @return analyzed terms joined by single spaces, ending with '\n'
 * @throws IOException if tokenization fails
 */
String getAnalyzedContent(String content) throws IOException {
    StringBuilder tokenizedContentBuff = new StringBuilder();
    Analyzer analyzer = new StandardAnalyzer();
    // try-with-resources closes the stream even when incrementToken() throws
    // (the original leaked it on the error path).
    try (TokenStream stream = analyzer.tokenStream(TextDocIndexer.FIELD_ANALYZED_CONTENT,
            new StringReader(content))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();

        while (stream.incrementToken()) {
            tokenizedContentBuff.append(termAtt.toString()).append(' ');
        }
        stream.end(); // was missing: finalizes stream state before close
    }
    tokenizedContentBuff.append('\n');
    return tokenizedContentBuff.toString();
}

From source file:indexer.LineDocumentIndexer.java

/**
 * Builds a Lucene Document for one line of input: stores the id un-analyzed,
 * writes "id&lt;TAB&gt;lower-cased analyzed tokens\n" to {@code fw}, and stores the
 * raw line as an analyzed field with term vectors.
 *
 * @param fw   sink for the tab-separated analyzed output
 * @param id   document identifier (stored, not analyzed)
 * @param line raw text of the line
 * @return the constructed document
 * @throws Exception on analysis or write failure
 */
Document constructDoc(FileWriter fw, String id, String line) throws Exception {

    Document doc = new Document();
    doc.add(new Field(DocVector.FIELD_ID, id, Field.Store.YES, Field.Index.NOT_ANALYZED));

    StringBuilder tokenizedContentBuff = new StringBuilder();
    TokenStream stream = analyzer.tokenStream(FIELD_WORDS, new StringReader(line));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();

        while (stream.incrementToken()) {
            tokenizedContentBuff.append(termAtt.toString().toLowerCase()).append(' ');
        }

        stream.end();
    } finally {
        stream.close(); // original leaked the stream if incrementToken() threw
    }

    tokenizedContentBuff.append('\n');
    fw.write(id + "\t" + tokenizedContentBuff.toString());

    // Reanalyze: store the raw line as an analyzed field with term vectors.
    doc.add(new Field(FIELD_WORDS, line, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    return doc;
}

From source file:indexer.Paragraph.java

/**
 * Splits the analyzed tokens of {@code content} into fixed-size paragraphs of
 * {@code paraWindowSize} tokens each; a final, smaller paragraph holds any
 * remainder.
 *
 * @param docId   numeric document id used to build paragraph ids ("docId_seq")
 * @param content raw text to analyze
 * @return the paragraphs, in token order
 * @throws Exception on analysis failure
 */
List<Paragraph> constructParagraphs(int docId, String content) throws Exception {
    List<Paragraph> parList = new ArrayList<>();

    List<String> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();

        int count = 0;
        int id = 0;
        while (stream.incrementToken()) {
            tokens.add(termAtt.toString());
            count++;
            if (count == paraWindowSize) {
                // Pass a snapshot: the original handed the live list to Paragraph
                // and then cleared it, which empties the paragraph if Paragraph
                // keeps the reference instead of copying — TODO confirm which.
                // Copying here is safe in either case.
                Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), new ArrayList<>(tokens));
                tokens.clear();
                count = 0;
                parList.add(p);
            }
        }
        if (count > 0) {
            Paragraph p = new Paragraph(docId + "_" + String.valueOf(id++), tokens);
            parList.add(p);
        }

        stream.end();
    } finally {
        stream.close(); // original leaked the stream on exception
    }

    return parList;
}

From source file:indexer.WordVecSequenceFileGenerator.java

/**
 * Converts each analyzed term of the document's content field into its word
 * vector plus binarized decision/preference labels, emitting one
 * "vec\tdec\tpref" line per term. Terms with no known vector are skipped
 * (with a warning on stderr).
 *
 * @param d source document carrying content, decision-score and pref-score fields
 * @return the tab-separated embedding lines
 * @throws Exception on analysis failure or malformed per-word labels
 */
String embedWords(Document d) throws Exception {
    String content = d.get(AMI_FIELDS.FIELD_CONTENT);
    // Binarize the document-level scores up front; per-word labels may override below.
    int decScore = Integer.parseInt(d.get(AMI_FIELDS.FIELD_DECISION_SCORE)) > 0 ? 1 : 0;
    int prefScore = Integer.parseInt(d.get(AMI_FIELDS.FIELD_PREF_SCORE)) > 0 ? 1 : 0;

    // NOTE: the original built an unused List<String> tokens here; removed.
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(content));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    StringBuilder buff = new StringBuilder();
    boolean labelsStoredWithWords = Boolean.parseBoolean(prop.getProperty("word.labels", "false"));

    try {
        stream.reset();

        while (stream.incrementToken()) {
            String term = termAtt.toString().toLowerCase();

            if (labelsStoredWithWords) {
                // Terms are stored as word<DELIM>dec<DELIM>pref; unpack per-word labels.
                String[] wordAndLabel = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM);
                term = wordAndLabel[0]; // the first part is the word
                decScore = Integer.parseInt(wordAndLabel[1]);
                prefScore = Integer.parseInt(wordAndLabel[2]);
            }

            double[] x = wvecs.getWordVector(term);
            if (x == null) {
                System.err.println("No vec found for word " + term);
                continue;
            }

            String wvec = vecToStr(x);
            // Clamp multi-valued labels to the binary range.
            if (decScore > 1)
                decScore = 1;
            if (prefScore > 1)
                prefScore = 1;
            buff.append(wvec).append("\t").append(decScore).append("\t").append(prefScore).append("\n");
        }
        stream.end(); // was missing: finalizes stream state before close
    } finally {
        stream.close(); // original leaked the stream on exception
    }

    return buff.toString();
}

From source file:indexing.ReviewTextAnalyzer.java

License:Open Source License

/**
 * @param args/*  w ww.jav a  2s  .c o  m*/
 */
public static void main(String[] args) {
    ReviewTextAnalyzer r = new ReviewTextAnalyzer(new ReviewDocumentIndexer());
    String[] filenames = { "review.txt" };
    for (String filename : filenames) {
        try {
            TokenStream tokstr = r.reusableTokenStream(null, new FileReader(filename));

            TermAttribute output_term = tokstr.addAttribute(TermAttribute.class);
            TypeAttribute output_type = tokstr.addAttribute(TypeAttribute.class);
            FlagsAttribute output_flags = tokstr.addAttribute(FlagsAttribute.class);
            PayloadAttribute output_payload = tokstr.addAttribute(PayloadAttribute.class);

            int review_id = r.indexer.theReviewId.get() + 1;
            r.indexer.theReviewId.set(review_id);
            r.indexer.theStats.setCurrent(review_id, 10);

            while (tokstr.incrementToken()) {

                Token current_token = new Token(output_term.term(), output_type.type(), output_flags.getFlags(),
                        new ReviewTermPayload(output_payload.getPayload()));

                System.out.print(current_token);

                if (current_token.isDelim(false)) {
                    System.out.println();
                }
                if (current_token.isDelim(true)) {
                    System.out.println("..................................................................\n");
                }
            }

            System.out.println();

        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        System.out.println(
                "\n\n\n\n\n\n\n\n==================================================================\n\n\n\n\n\n\n\n");
    }

    return;
}

From source file:info.johtani.elasticsearch.action.admin.indices.extended.analyze.TransportExtendedAnalyzeAction.java

License:Apache License

/**
 * Consumes {@code stream} and records, for each token, its term text,
 * absolute position, absolute offsets, type, and any extra attributes
 * selected by {@code includeAttributes}.
 *
 * @param stream            the token stream to consume (caller manages close)
 * @param includeAttributes attribute names to extract per token
 * @param shortAttrName     whether to use short attribute class names
 * @param lastPosition      position carried over from a previous stream
 * @param lastOffset        offset carried over from a previous stream
 * @return the extended tokens, in stream order
 * @throws IOException if the stream fails
 */
private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream,
        Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset)
        throws IOException {
    List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> result = new ArrayList<>();
    stream.reset();

    // Per-token attributes to report.
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAttr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
        // Accumulate absolute position from per-token increments.
        int increment = posIncrAttr.getPositionIncrement();
        if (increment > 0) {
            lastPosition += increment;
        }

        result.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(termAttr.toString(), lastPosition,
                lastOffset + offsetAttr.startOffset(), lastOffset + offsetAttr.endOffset(), typeAttr.type(),
                extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    return result;

}