Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#reset() from open-source projects.

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using #incrementToken().
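
In practice reset() is the first step of the full consumer contract: obtain the stream, register attributes, call reset(), loop on incrementToken(), then call end() and close(). A minimal sketch of that workflow (WhitespaceAnalyzer, the field name, and the sample text are arbitrary choices; assumes Lucene 5+ where analyzers have no-arg constructors):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        // try-with-resources closes the TokenStream for us
        try (TokenStream ts = analyzer.tokenStream("body", "hello token stream world")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // advance to the next token
                System.out.println(term);
            }
            ts.end();                     // end-of-stream operations, e.g. the final offset
        }
        analyzer.close();
    }
}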

Usage

From source file:test.analysis.AnalyzerUtils.java

License:Apache License

public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", // analyze the text under a dummy field name
            new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); // register the attributes
    PositionIncrementAttribute posIncr =                                   // we want to inspect
            stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    stream.reset();
    int position = 0;
    while (stream.incrementToken()) { // iterate over all tokens

        int increment = posIncr.getPositionIncrement(); // track the token position
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }

        System.out.print("[" + // print term text, offsets, and token type
                term + ":" +
                offset.startOffset() + "->" +
                offset.endOffset() + ":" +
                type.type() + "] ");
    }
    System.out.println();
    stream.end();
    stream.close();
}
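
A minimal driver for this method might look like the following (StandardAnalyzer is an assumption here; any Analyzer works):

Analyzer analyzer = new StandardAnalyzer();
AnalyzerUtils.displayTokensWithFullDetails(analyzer, "The quick brown fox jumped over the lazy dog");
analyzer.close();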

From source file:test.AnalzyerDemo.java

License:Apache License

public static void main(String[] args) {
    Analyzer analyzer = new BaseAnalyzer();
    // Analyzer analyzer = new org.apache.lucene.analysis.cjk.CJKAnalyzer();
    // obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "????????????????2?3noneok???BaseAnalyer can analysis english text too"));
        // get the offset attribute (start/end character positions)
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // get the term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // get the token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // reset the TokenStream (repositions the StringReader)
        ts.reset();
        // iterate over all tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // done consuming the TokenStream
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
        analyzer.close();
    } finally {
        // release all TokenStream resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

From source file:TesterClasses.TestAnalyzer.java

public static List<String> tokenizeString(Analyzer analyzer, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
    return result;
}
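
A quick sanity check for this helper (StandardAnalyzer assumed; it lowercases terms and strips punctuation):

Analyzer analyzer = new StandardAnalyzer();
List<String> tokens = TestAnalyzer.tokenizeString(analyzer, "Hello, Lucene World!");
System.out.println(tokens); // [hello, lucene, world]
analyzer.close();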

From source file:tfidf.TestTfIDF.java

License:CDDL License

public static ArrayList<String> cutWords(String line) throws IOException {

    ArrayList<String> words = new ArrayList<String>();
    //        String text = ReadFiles.readFile(file);

    IKAnalyzer analyzer = new IKAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(line));
    CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        words.add(termAttribute.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return words;
}

From source file:tw.com.kyle.luminance.LumPositionMap.java

public static LumPositionMap Get(String raw_text) throws IOException {
    StandardAnalyzer analyzer = new StandardAnalyzer();
    TokenStream tstream = analyzer.tokenStream("", raw_text);

    CharTermAttribute termAttr = tstream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offAttr = tstream.getAttribute(OffsetAttribute.class);
    // PositionIncrementAttribute posIncAttr = tstream.getAttribute(PositionIncrementAttribute.class);        
    // PositionLengthAttribute posLenAttr = tstream.getAttribute(PositionLengthAttribute.class);

    List<String> tokens = new ArrayList<>();
    List<Integer> pos_list = new ArrayList<>();

    tstream.reset();
    while (tstream.incrementToken()) {
        tokens.add(termAttr.toString());
        pos_list.add(offAttr.startOffset());
    }
    tstream.end();
    tstream.close();

    return new LumPositionMap(tokens, pos_list);
}

From source file:tw.com.kyle.luminance.LumWindow.java

public List<LumRange> BuildLumRange(long annot_uuid) throws IOException {
    Document adoc = lum_annot.GetAnnotDocument(annot_uuid);
    if (adoc == null) {
        return new ArrayList<>();
    }

    int doc_id = lum_reader.getDocId(adoc);
    TokenStream tokenStream = lum_reader.GetTokenStream(doc_id, "anno");
    if (tokenStream == null) {
        return null;
    }

    OffsetAttribute offAttr = tokenStream.getAttribute(OffsetAttribute.class);
    CharTermAttribute chAttr = tokenStream.getAttribute(CharTermAttribute.class);

    tokenStream.reset();
    List<LumRange> lr_list = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        LumRange lr = new LumRange();
        lr.data = chAttr.toString();
        lr.start_off = offAttr.startOffset();
        lr.end_off = offAttr.endOffset();
        lr_list.add(lr);
    }
    tokenStream.end();
    tokenStream.close();

    return lr_list;
}

From source file:tw.com.kyle.luminance.LumWindow.java

private Mappings prepare_mappings(int doc_id, String field) throws IOException {
    List<Integer> pos_list = new ArrayList<>();
    List<Integer> off_list = new ArrayList<>();

    TokenStream tokenStream = lum_reader.GetTokenStream(doc_id, field);
    if (tokenStream == null) {
        return null;
    }

    OffsetAttribute offsetAttr = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posincAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();
    int pos_counter = 0;
    while (tokenStream.incrementToken()) {
        pos_list.add(pos_counter);
        off_list.add(offsetAttr.startOffset());
        pos_counter += posincAttr.getPositionIncrement();
    }
    tokenStream.end();
    tokenStream.close();

    Mappings mappings = new Mappings();
    mappings.off_list = off_list;
    mappings.pos_list = pos_list;
    return mappings;
}
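
The pos_counter arithmetic above is how consumers recover token positions: an increment of 1 advances to the next position, 0 stacks a token on the previous one (synonyms), and values above 1 leave holes (e.g. removed stopwords). Note that this snippet records the counter before adding the increment, so a hole shifts the position of the token after it. A toy illustration of the usual add-then-record convention, with hypothetical increment values:

int[] increments = {1, 1, 2, 0}; // hypothetical PositionIncrementAttribute values
int position = -1;               // start at -1 so a first increment of 1 yields position 0
for (int inc : increments) {
    position += inc;
    System.out.println("token at position " + position);
}
// prints 0, 1, 3, 3 — the zero-increment token shares position 3 with its predecessor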

From source file:tweetembeding.AnalyzerClass.java

public String analizeString(String FIELD, String txt) throws IOException {
    this.analyzer = setAnalyzer();
    TokenStream stream = analyzer.tokenStream(FIELD, new StringReader(txt));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    StringBuilder tokenizedContentBuff = new StringBuilder();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        if (!term.equals("nbsp"))
            tokenizedContentBuff.append(term).append(" ");
    }

    stream.end();
    stream.close();

    return tokenizedContentBuff.toString();
}
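
Assuming AnalyzerClass has a no-argument constructor and setAnalyzer() returns a standard English analyzer (both assumptions; only the method above is shown in the source), usage looks like:

AnalyzerClass analyzerClass = new AnalyzerClass();
String clean = analyzerClass.analizeString("text", "some tweet &nbsp; text");
System.out.println(clean); // space-separated tokens, with the "nbsp" artifact dropped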

From source file:ucas.IKAnalzyerDemo.java

License:Apache License

public static String Spilt2Words(String content) {
    StringBuilder resString = new StringBuilder();
    // construct an IK analyzer that uses smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    // obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        // "myfield" is just a placeholder field name
        ts = analyzer.tokenStream("myfield", new StringReader(content));
        // get the term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);

        // reset the TokenStream (repositions the StringReader)
        ts.reset();
        // iterate over all tokens
        while (ts.incrementToken()) {
            resString.append(term.toString()).append('|');
        }
        // done consuming the TokenStream
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release all TokenStream resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return resString.toString();
}

From source file:uib.scratch.AnalyzerUtils.java

public static Token insertB(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        // build a Token spanning the current term text and print it
        String tokenString = term.toString();
        Token t = new Token(tokenString, 0, tokenString.length() - 1);
        System.out.println(t);
    }
    stream.end();
    stream.close();
    return null;
}