List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
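A consumer must call reset() exactly once before the first incrementToken() call; after the last token, end() and then close() complete the workflow. A minimal, self-contained sketch of that lifecycle (assuming a Lucene 5+ StandardAnalyzer with a no-argument constructor; the class name, field name "text", and sample sentence are illustrative only):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetDemo {
    public static void main(String[] args) throws IOException {
        StandardAnalyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("text", "A quick brown fox");
        // attributes must be obtained before consuming the stream
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                 // mandatory before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println(term.toString());
        }
        stream.end();                   // records the end-of-stream offset state
        stream.close();                 // releases underlying resources
        analyzer.close();
    }
}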
From source file:org.apdplat.word.lucene.ChineseWordAnalyzer.java
License:Open Source License
public static void main(String args[]) throws IOException {
    Analyzer analyzer = new ChineseWordAnalyzer();

    TokenStream tokenStream = analyzer.tokenStream("text", "??APDPlat???");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();

    tokenStream = analyzer.tokenStream("text", "word????????ysc");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();

    tokenStream = analyzer.tokenStream("text", "5?");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();
}
From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java
License:Open Source License
@Test
public void test1() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??APDPlat???");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[??, , apdplat, , ?, ?, ?, , ]";
        if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
            expResult = "[??, , apdplat, , , ?, ?, ?, , ]";
        }
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}
From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java
License:Open Source License
@Test
public void test2() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[??, , , , , , ]";
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}
From source file:org.archive.porky.TokenizeTextUDF.java
License:Apache License
public String exec(Tuple input) throws IOException {
    String emptyString = "";
    if (input == null || input.size() == 0) {
        return emptyString;
    }
    try {
        String textString = (String) input.get(0);
        if (textString == null) {
            return emptyString;
        }
        if (stopSet == null) {
            // lazily initialize the stop word set from a local file
            List<String> stopWords = new ArrayList<String>();
            FileReader fr = new FileReader(stopWordsFile);
            BufferedReader d = new BufferedReader(fr);
            String line;
            while ((line = d.readLine()) != null) {
                stopWords.add(line);
            }
            d.close();   // closing the BufferedReader also closes the underlying FileReader
            stopSet = new CharArraySet(Version.LUCENE_45, stopWords, true);
        }
        TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_45, new StringReader(textString));
        tokenStream = new StopFilter(Version.LUCENE_45, tokenStream, stopSet);
        StringBuilder sb = new StringBuilder();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            sb.append(term).append(" ");
        }
        tokenStream.end();     // complete and release the stream before returning
        tokenStream.close();
        return sb.toString();
    } catch (Exception e) {
        return emptyString;
    }
}
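For comparison, the tokenizer-plus-stop-filter chain built inline above can also be packaged as a reusable Analyzer, letting tokenStream(...) manage stream construction and reuse. This is only a sketch against the same Lucene 4.5 API the UDF targets (createComponents still took a Reader in 4.x); the class name StopWordAnalyzer is invented for illustration:

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

final class StopWordAnalyzer extends Analyzer {
    private final CharArraySet stopSet;

    StopWordAnalyzer(CharArraySet stopSet) {
        this.stopSet = stopSet;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // same chain as the UDF: standard tokenization followed by stop-word removal
        Tokenizer source = new StandardTokenizer(Version.LUCENE_45, reader);
        TokenStream filtered = new StopFilter(Version.LUCENE_45, source, stopSet);
        return new TokenStreamComponents(source, filtered);
    }
}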
From source file:org.betaconceptframework.astroboa.model.impl.query.xpath.XPathUtils.java
License:Open Source License
private static String analyzeTextToFind(String textToFind) throws IOException {
    // Filter textToFind through the GreekAnalyzer
    TokenStream stream = greekAnalyzer.tokenStream("", new StringReader(textToFind));
    stream.reset();
    StringBuilder analyzedTextTofind = new StringBuilder();
    try {
        while (stream.incrementToken()) {
            String term = stream.getAttribute(TermAttribute.class).term();
            analyzedTextTofind.append(term);
            analyzedTextTofind.append(" ");
        }
    } catch (IOException e) {
        e.printStackTrace();
        analyzedTextTofind.append(textToFind);
    } finally {
        stream.end();
        stream.close();
    }
    String result = analyzedTextTofind.toString().trim();
    if (StringUtils.isBlank(result))
        return textToFind;
    return result;
}
From source file:org.bibsonomy.lucene.search.LuceneResourceSearch.java
License:Open Source License
/**
 * Analyzes the given input parameter.
 *
 * @param fieldName the name of the field
 * @param param the value of the field
 * @return the analyzed string
 * @throws IOException
 */
protected String parseToken(final String fieldName, final String param) throws IOException {
    if (present(param)) {
        // use Lucene's new token stream API (see org.apache.lucene.analysis' javadoc at package level)
        final TokenStream ts = this.getAnalyzer().tokenStream(fieldName, new StringReader(param));
        final TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        ts.reset();
        // analyze the parameter - that is: concatenate its normalized tokens
        final StringBuilder analyzedString = new StringBuilder();
        while (ts.incrementToken()) {
            analyzedString.append(" ").append(termAtt.term());
        }
        return analyzedString.toString().trim();
    }
    return "";
}
From source file:org.chombo.util.BasicUtils.java
License:Apache License
/**
 * Analyzes text and returns the analyzed text.
 *
 * @param text the text to analyze
 * @param analyzer the analyzer to run the text through
 * @return the space-separated analyzed tokens
 * @throws IOException
 */
public static String analyze(String text, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    StringBuilder stBld = new StringBuilder();
    stream.reset();
    CharTermAttribute termAttribute = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        String token = termAttribute.toString();
        stBld.append(token).append(" ");
    }
    stream.end();
    stream.close();
    return stBld.toString();
}
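A hypothetical call site for the utility above (the StandardAnalyzer choice is an assumption; any Analyzer works, and the exact output depends on its tokenizer and stop set):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class AnalyzeDemo {
    public static void main(String[] args) throws Exception {
        // Analyzer implements Closeable, so try-with-resources handles cleanup
        try (Analyzer analyzer = new StandardAnalyzer()) {
            // prints the space-separated, analyzed form of the input
            System.out.println(BasicUtils.analyze("The Quick Brown Fox!", analyzer));
        }
    }
}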
From source file:org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.java
License:Apache License
/**
 * NOTE: this method closes the TokenStream, even on exception, which is awkward because really
 * the caller who called {@link Analyzer#tokenStream} should close it, but when trying that there
 * are recursion issues when we try to use the same TokenStream twice in the same recursion...
 */
public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
    int numTokens = 0;
    boolean success = false;
    try {
        stream.reset();
        consumer.reset(stream);
        while (stream.incrementToken()) {
            consumer.nextToken();
            numTokens++;
        }
        consumer.end();
        success = true;
    } finally {
        if (success) {
            stream.close();
        } else {
            IOUtils.closeWhileHandlingException(stream);
        }
    }
    return numTokens;
}
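Callers that do own their stream can sidestep the close-on-exception question entirely with try-with-resources, since TokenStream implements Closeable. A minimal sketch (the class, method, and parameters are invented for illustration):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class CallerOwnedStream {
    // The caller owns the stream: it is closed on both success and exception.
    static int countTokens(Analyzer analyzer, String field, String text) throws IOException {
        int numTokens = 0;
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                numTokens++;        // a real consumer would read term here
            }
            stream.end();           // runs before close() on the happy path
        }
        return numTokens;
    }
}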
From source file:org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizerTest.java
License:Apache License
@Test
public void testNullSynonyms() throws Exception {
    Analyzer a = new NGramSynonymTokenizerTestAnalyzer(1);
    TokenStream stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,1,1/,1,2,1/,2,3,1/,3,4,1/,4,5,1/,5,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(2);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,2,1/,1,3,1/,2,4,1/,3,5,1/,4,6,1");
    stream.close();

    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,1,1");
    stream.close();

    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,2,1");

    a = new NGramSynonymTokenizerTestAnalyzer(3);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,3,1/,1,4,1/,2,5,1/,3,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(4);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,4,1/,1,5,1/,2,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(5);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,5,1/,1,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(6);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(7);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");

    a = new NGramSynonymTokenizerTestAnalyzer(8);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");
}
From source file:org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizerTest.java
License:Apache License
@Test
public void testSingleSynonymIgnoreCase() throws Exception {
    Analyzer a = new NGramSynonymTokenizerTestAnalyzer(2, false, "A,AA,AAA");
    TokenStream stream = a.tokenStream("f", new StringReader("aaa"));
    stream.reset();
    assertTokenStream(stream, "aaa,0,3,1");
}