Example usage for org.apache.lucene.analysis TokenStream reset

List of usage examples for org.apache.lucene.analysis TokenStream reset

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream reset.

Prototype

public void reset() throws IOException 

Source Link

Document

This method is called by a consumer before it begins consumption using #incrementToken() .

Usage

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
@Ignore("until we fix bigrams")
public void testCJKUnigrams() throws Exception {
    Analyzer analyzer = getCJKBigramAnalyzer(true);

    // Sanity-check that the analyzer can tokenize the field: drain the stream and
    // release it (reset/end/close is the required TokenStream lifecycle) so the
    // analyzer can be reused for indexing below. The original leaked this stream.
    try (TokenStream ts = analyzer.tokenStream(FIELD, "?")) {
        ts.reset();
        while (ts.incrementToken()) {
            // we only care that tokenization succeeds without error
        }
        ts.end();
    }

    String[] docs = new String[] { "a b c d e f g" };

    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD)));
    Query q = new TermQuery(new Term(FIELD, "c"));
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher, FIELD, q, q, analyzer, collector);
    // NOTE(review): no assertions yet — the test only exercises the search path.
    reader.close();
    directory.close();
}

From source file:org.tightblog.service.indexer.AbstractTask.java

License:Apache License

/**
 * Create a lucene term from the first token of the input string.
 *
 * @param field The lucene document field to create a term with
 * @param input The input you wish to convert into a term
 * @return the Lucene search term, or {@code null} if field or input is null,
 *         no analyzer is available, tokenization fails, or no token is produced
 */
Term getTerm(String field, String input) {
    Term term = null;

    if (input != null && field != null) {
        try (Analyzer analyzer = manager.getAnalyzer()) {
            if (analyzer != null) {
                // try-with-resources closes the TokenStream even on failure;
                // the original leaked it.
                try (TokenStream tokens = analyzer.tokenStream(field, new StringReader(input))) {
                    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
                    tokens.reset();
                    if (tokens.incrementToken()) {
                        term = new Term(field, termAtt.toString());
                    }
                    tokens.end();
                } catch (IOException ignored) {
                    // best-effort: a tokenization failure simply yields a null term
                }
            }
        }
    }
    return term;
}

From source file:org.weborganic.flint.util.Fields.java

License:artistic-license-2.0

/**
 * Returns the terms for a field.
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 */
public static List<String> toTerms(String field, String text, Analyzer analyzer) {
    StringReader r = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(field, r);
    // NOTE(review): the increment attribute is registered but unused; kept for the
    // planned phrase-query support noted below.
    PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    List<String> terms = new ArrayList<String>();
    try {
        stream.reset();
        while (stream.incrementToken()) {
            terms.add(attribute.term());
            // TODO Use increment for the phrase query
        }
        stream.end();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    } finally {
        // the original never closed the stream, leaking analyzer resources
        try {
            stream.close();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
    return terms;
}

From source file:org.weborganic.flint.util.Queries.java

License:artistic-license-2.0

/**
 * Returns the terms for a field/*from w  w w.  ja v a  2s.  com*/
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 *
 * @throws IOException
 */
private static void addTermsToPhrase(String field, String text, Analyzer analyzer, PhraseQuery phrase) {
    StringReader r = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(field, r);
    PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    try {
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += increment.getPositionIncrement();
            Term term = new Term(field, attribute.term());
            phrase.add(term, position);
        }
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
}

From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerCase.java

License:Apache License

/**
 * Segments the input with IK's smart mode and collects candidate topic words:
 * every token longer than one character, plus purely numeric tokens.
 * Each token is also printed as "start - end : term | type" for inspection.
 *
 * @param str the text to segment
 * @return the collected topic words, in token order
 */
public static ArrayList<String> getTopicWord(String str) {
    // true => smart (coarse-grained) segmentation
    Analyzer analyzer = new IKAnalyzer(true);
    ArrayList<String> retData = new ArrayList<String>();
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(str));
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
            String token = term.toString();
            boolean multiChar = token.length() > 1;
            boolean numeric = token.matches("^[0-9]*$");
            if (multiChar || numeric) {
                retData.add(token);
            }
        }
        // end-of-stream bookkeeping, e.g. setting the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return retData;
}

From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerDemo.java

License:Apache License

/**
 * Demo entry point: segments a sample string with IK's smart mode and prints
 * each token as "start - end : term | type".
 */
public static void main(String[] args) {
    // true => smart (coarse-grained) segmentation
    Analyzer analyzer = new IKAnalyzer(true);

    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader("???"));
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // reset() must be called before the first incrementToken()
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // the original leaked the analyzer; it holds reusable components
        analyzer.close();
    }

}

From source file:org.wltea.analyzer.ik_analyzer5.IKAnalzyerTest.java

License:Apache License

/**
 * Smoke test for the IK analyzer in smart mode: tokenizes a sample string
 * and prints each token as "start - end : term | type".
 */
@Test
public void testIK() {

    String text = "???";

    // true => smart (coarse-grained) segmentation
    Analyzer analyzer = new IKAnalyzer(true);

    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(text));
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // reset() must precede the first incrementToken()
        ts.reset();
        while (ts.incrementToken()) {
            StringBuilder line = new StringBuilder();
            line.append(offset.startOffset()).append(" - ").append(offset.endOffset());
            line.append(" : ").append(term.toString()).append(" | ").append(type.type());
            System.out.println(line.toString());
        }
        // end-of-stream bookkeeping, e.g. setting the final offset
        ts.end();

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        analyzer.close();
    }

}

From source file:org.wltea.analyzer.sample.IKAnalzyerDemo.java

License:Apache License

/**
 * Demo entry point: segments mixed CJK/English sample text with IKAnalyzerP in
 * smart mode and prints each token as "start - end : term | type".
 */
public static void main(String[] args) {
    // true => smart (coarse-grained) segmentation
    Analyzer analyzer = new IKAnalyzerP(true);

    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "?????IKAnalyer can analysis english text too"));
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // reset() must be called before the first incrementToken()
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // the original leaked the analyzer; it holds reusable components
        analyzer.close();
    }

}

From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java

License:Apache License

/**
 * Demonstrates Lucene's StandardTokenizer: tokenizes mixed CJK/English text and
 * prints each token as "term->start-end->type".
 */
public void testST() {
    Tokenizer tokenizer = new StandardTokenizer();
    try {
        tokenizer.setReader(new StringReader(
                "?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        // preserve the cause; the original discarded it
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    TokenStream ts = tsc.getTokenStream();
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset() + "->"
                    + type.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        // the original never closed the stream; close quietly so we don't
        // mask an exception already in flight
        try {
            ts.close();
        } catch (IOException ignored) {
            // nothing useful to do on a failed close in a demo
        }
    }
}

From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java

License:Apache License

/**
 * Demonstrates Lucene's ClassicTokenizer: tokenizes mixed CJK/English text and
 * prints each token as "term->start-end->type".
 */
public void testCT() {
    Tokenizer tokenizer = new ClassicTokenizer();
    try {
        tokenizer.setReader(new StringReader(
                "?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        // preserve the cause; the original discarded it
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    TokenStream ts = tsc.getTokenStream();
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-" + offset.endOffset() + "->"
                    + type.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        // the original never closed the stream; close quietly so we don't
        // mask an exception already in flight
        try {
            ts.close();
        } catch (IOException ignored) {
            // nothing useful to do on a failed close in a demo
        }
    }
}