List of usage examples for org.apache.lucene.analysis.TokenStream#reset()
public void reset() throws IOException
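Before the per-project examples, here is a minimal sketch of the consumer workflow that reset() is part of. The sketch is ours rather than taken from any source file below, and assumes Lucene 5+ with StandardAnalyzer on the classpath: reset() must be called before the first incrementToken(), and end() and close() after the last.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("field", "hello token stream")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end(); // records end-of-stream state, e.g. the final offset
        } // close() is handled by try-with-resources
    }
}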
From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java
License:Apache License
@Test @Ignore("until we fix bigrams") public void testCJKUnigrams() throws Exception { final CharacterRunAutomaton stops = MockTokenFilter.EMPTY_STOPSET; int posIncGap = 10; final int charOffsetGap = 10; // Analyzer analyzer = getBigramAnalyzer(stops, 10, 10, true); Analyzer analyzer = getCJKBigramAnalyzer(true); TokenStream ts = analyzer.tokenStream(FIELD, "?"); ts.reset(); String[] docs = new String[] { "a b c d e f g" }; Directory directory = getDirectory(analyzer, docs); IndexReader reader = DirectoryReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(reader); ConcordanceSearcher searcher = new ConcordanceSearcher( new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD))); Query q = new TermQuery(new Term(FIELD, "c")); //now test straight and span wrapper ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10); searcher.search(indexSearcher, FIELD, q, q, analyzer, collector); for (ConcordanceWindow w : collector.getWindows()) { //System.out.println(w); }/*from w ww. j a v a 2s . c om*/ reader.close(); directory.close(); }
From source file:org.tightblog.service.indexer.AbstractTask.java
License:Apache License
/**
 * Create a Lucene term from the first token of the input string.
 *
 * @param field the Lucene document field to create a term with
 * @param input the input you wish to convert into a term
 * @return Lucene search term
 */
Term getTerm(String field, String input) {
    Term term = null;
    if (input != null && field != null) {
        try (Analyzer analyzer = manager.getAnalyzer()) {
            if (analyzer != null) {
                // try-with-resources closes the stream; reset() must precede
                // the first incrementToken() call
                try (TokenStream tokens = analyzer.tokenStream(field, new StringReader(input))) {
                    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
                    tokens.reset();
                    if (tokens.incrementToken()) {
                        term = new Term(field, termAtt.toString());
                    }
                    tokens.end();
                } catch (IOException e) {
                    // ignored
                }
            }
        }
    }
    return term;
}
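Only the first token is consumed here: reset() positions the stream, a single incrementToken() call yields the term text, and the rest of the input is simply discarded, which is all a single-term lookup needs.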
From source file:org.weborganic.flint.util.Fields.java
License:artistic-license-2.0
/**
 * Returns the terms for a field.
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 */
public static List<String> toTerms(String field, String text, Analyzer analyzer) {
    StringReader r = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(field, r);
    PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    List<String> terms = new ArrayList<String>();
    try {
        stream.reset();
        while (stream.incrementToken()) {
            String term = attribute.term();
            terms.add(term);
            // TODO Use increment for the phrase query
            // System.err.println(term + ":" + increment.getPositionIncrement());
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
    return terms;
}
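Note: TermAttribute is the pre-4.0 attribute API; Lucene 4 removed it in favour of CharTermAttribute, so this helper and the next one compile only against Lucene 3.x and earlier.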
From source file:org.weborganic.flint.util.Queries.java
License:artistic-license-2.0
/**
 * Adds the terms produced by analyzing the text to the given phrase query.
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 * @param phrase   The phrase query to add the terms to
 */
private static void addTermsToPhrase(String field, String text, Analyzer analyzer, PhraseQuery phrase) {
    StringReader r = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(field, r);
    PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    try {
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += increment.getPositionIncrement();
            Term term = new Term(field, attribute.term());
            phrase.add(term, position);
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
}
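In Lucene 5 and later, PhraseQuery is immutable and is assembled through PhraseQuery.Builder rather than phrase.add(...). A minimal sketch of the same position-aware loop against the newer API (ours, not from the Flint sources), with CharTermAttribute replacing the removed TermAttribute:

// Assumes Lucene 5+: org.apache.lucene.search.PhraseQuery.Builder,
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute
private static PhraseQuery toPhrase(String field, String text, Analyzer analyzer) throws IOException {
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    try (TokenStream stream = analyzer.tokenStream(field, new StringReader(text))) {
        PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += increment.getPositionIncrement(); // honour position gaps (stopwords, etc.)
            builder.add(new Term(field, term.toString()), position);
        }
        stream.end();
    }
    return builder.build();
}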
From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerCase.java
License:Apache License
public static ArrayList<String> getTopicWord(String str) {
    // Construct an IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    ArrayList<String> retData = new ArrayList<String>();
    // Obtain Lucene's TokenStream object
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(str));
        // Get the token offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Get the token text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Get the token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokenization results
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : "
                    + term.toString() + " | " + type.type());
            if (term.toString().length() > 1 || term.toString().matches("^[0-9]*$")) {
                retData.add(term.toString());
            }
        }
        // Close the TokenStream (closes the StringReader)
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the TokenStream's resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return retData;
}
From source file:org.wltea.analyzer.ikanalyzer.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Construct an IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain Lucene's TokenStream object
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader("???"));
        // Get the token offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Get the token text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Get the token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokenization results
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : "
                    + term.toString() + " | " + type.type());
        }
        // Close the TokenStream (closes the StringReader)
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the TokenStream's resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:org.wltea.analyzer.ik_analyzer5.IKAnalzyerTest.java
License:Apache License
@Test
public void testIK() {
    String text = "???";
    // Construct an IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    // Obtain Lucene's TokenStream object
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(text));
        // Get the token offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Get the token text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Get the token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokenization results
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : "
                    + term.toString() + " | " + type.type());
        }
        // Close the TokenStream (closes the StringReader)
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the TokenStream's resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        analyzer.close();
    }
}
From source file:org.wltea.analyzer.sample.IKAnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    // Construct an IK analyzer in smart segmentation mode
    Analyzer analyzer = new IKAnalyzerP(true);
    // Obtain Lucene's TokenStream object
    TokenStream ts = null;
    try {
        // ts = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA</html>HELLO"));
        ts = analyzer.tokenStream("myfield", new StringReader(
                "?????IKAnalyer can analysis english text too"));
        // ts = analyzer.tokenStream("myfield", new StringReader("???pinyin hanyu Contribute index to jpinyin development by creating an account on GitHub"));
        // Get the token offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Get the token text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Get the token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokenization results
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : "
                    + term.toString() + " | " + type.type());
        }
        // Close the TokenStream (closes the StringReader)
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the TokenStream's resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/**
 * Test StandardTokenizer.
 */
public void testST() {
    Tokenizer tokenizer = new StandardTokenizer();
    try {
        tokenizer.setReader(new StringReader(
                "?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    TokenStream ts = tsc.getTokenStream();
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-"
                    + offset.endOffset() + "->" + type.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:org.wltea.analyzer.sample.LuceneTokenizerDemo.java
License:Apache License
/**
 * Test ClassicTokenizer.
 */
public void testCT() {
    Tokenizer tokenizer = new ClassicTokenizer();
    try {
        tokenizer.setReader(new StringReader(
                "?????IKAnalyer can analysis english text too"));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    TokenStreamComponents tsc = new TokenStreamComponents(tokenizer);
    TokenStream ts = tsc.getTokenStream();
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + "->" + offset.startOffset() + "-"
                    + offset.endOffset() + "->" + type.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
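Both demos also illustrate the reuse contract behind reset(): once a pass is finished with end() and close(), the same Tokenizer can be pointed at a new reader and reset again. A minimal sketch of that cycle (ours, not from the source file above; assumes Lucene 5+, where close() must precede the next setReader() call):

// Assumes Lucene 5+: reuse one Tokenizer across several inputs.
public static void tokenizeMany(String... texts) throws IOException {
    Tokenizer tokenizer = new StandardTokenizer();
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    for (String text : texts) {
        tokenizer.setReader(new StringReader(text));
        tokenizer.reset(); // mandatory before each consumption pass
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close(); // required before the next setReader() call
    }
}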