List of usage examples for org.apache.lucene.analysis TokenStream close
@Override public void close() throws IOException
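For orientation, here is a minimal sketch of the standard TokenStream lifecycle that the examples below follow (reset, incrementToken, end, close). It is generic Lucene usage, not taken from any of the source files below; it assumes a Lucene version whose StandardAnalyzer has a no-argument constructor, and the field name "body" and the sample text are placeholders. Because TokenStream implements Closeable, try-with-resources guarantees close() even on exception:

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class TokenStreamCloseSketch {
        public static void main(String[] args) throws IOException {
            Analyzer analyzer = new StandardAnalyzer();
            // TokenStream implements Closeable, so try-with-resources calls close()
            try (TokenStream ts = analyzer.tokenStream("body", "some text to tokenize")) {
                CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
                ts.reset();                  // mandatory before the first incrementToken()
                while (ts.incrementToken()) {
                    System.out.println(termAtt.toString());
                }
                ts.end();                    // records end-of-stream state (e.g. final offset)
            }                                // close() then releases underlying resources
            analyzer.close();
        }
    }

Note the ordering: end() must be called before close(); end() finalizes stream state such as the final offset, while close() releases the resources held by the stream.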
From source file:org.apache.tika.eval.tokens.TokenCounter.java
License:Apache License
private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;
    TokenStream ts = analyzer.tokenStream(field, content);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, MutableInt> tokenMap = map.get(field);
    if (tokenMap == null) {
        tokenMap = new HashMap<>();
        map.put(field, tokenMap);
    }
    while (ts.incrementToken()) {
        String token = termAtt.toString();
        MutableInt cnt = tokenMap.get(token);
        if (cnt == null) {
            cnt = new MutableInt(1);
            tokenMap.put(token, cnt);
        } else {
            cnt.increment();
        }
        totalTokens++;
    }
    ts.end();   // end() must precede close(): it records end-of-stream state
    ts.close();

    int totalUniqueTokens = tokenMap.size();
    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;
    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    SummaryStatistics summaryStatistics = new SummaryStatistics();
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();
        p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < e.getValue().intValue(); i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }
    }
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }
    tokenStatistics.put(field,
            new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));
}
From source file:org.apache.tika.eval.tokens.TokenCounterTest.java
License:Apache License
@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog ";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        Integer count = tokens.get(t);
        count = (count == null) ? 0 : count;
        count++;
        tokens.put(t, count);
    }
    ts.end();
    ts.close();
    assertEquals(7, tokens.size());
    assertEquals(new Integer(1), tokens.get(""));
}
From source file:org.apdplat.evaluation.impl.SmartCNEvaluation.java
License:Open Source License
private static String segText(String text) {
    StringBuilder result = new StringBuilder();
    try {
        TokenStream tokenStream = SMART_CHINESE_ANALYZER.tokenStream("text", new StringReader(text));
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.append(charTermAttribute.toString()).append(" ");
        }
        tokenStream.end();
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result.toString();
}
From source file:org.apdplat.word.lucene.ChineseWordAnalyzer.java
License:Open Source License
public static void main(String args[]) throws IOException {
    Analyzer analyzer = new ChineseWordAnalyzer();
    tokenize(analyzer, "??APDPlat???");
    tokenize(analyzer, "word????????ysc");
    tokenize(analyzer, "5?");
}

private static void tokenize(Analyzer analyzer, String text) throws IOException {
    TokenStream tokenStream = analyzer.tokenStream("text", text);
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = tokenStream
            .getAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.end();
    tokenStream.close();
}
From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java
License:Open Source License
@Test
public void test1() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??APDPlat???");
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        String expResult = "[??, , apdplat, , ?, ?, ?, , ]";
        if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
            expResult = "[??, , apdplat, , , ?, ?, ?, , ]";
        }
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}
From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java
License:Open Source License
@Test
public void test2() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??");
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        String expResult = "[??, , , , , , ]";
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}
From source file:org.betaconceptframework.astroboa.model.impl.query.xpath.XPathUtils.java
License:Open Source License
private static String analyzeTextToFind(String textToFind) throws IOException {
    // Filter textToFind through the GreekAnalyzer
    TokenStream stream = greekAnalyzer.tokenStream("", new StringReader(textToFind));
    stream.reset();
    StringBuilder analyzedTextTofind = new StringBuilder();
    try {
        while (stream.incrementToken()) {
            String term = stream.getAttribute(TermAttribute.class).term();
            analyzedTextTofind.append(term);
            analyzedTextTofind.append(" ");
        }
    } catch (IOException e) {
        e.printStackTrace();
        analyzedTextTofind.append(textToFind);
    } finally {
        stream.end();
        stream.close();
    }
    String result = analyzedTextTofind.toString().trim();
    if (StringUtils.isBlank(result)) {
        return textToFind;
    }
    return result;
}
From source file:org.chombo.util.BasicUtils.java
License:Apache License
/**
 * Analyzes text and returns the analyzed text
 * @param text the text to analyze
 * @param analyzer the Lucene analyzer to apply
 * @return the analyzed text, with tokens separated by spaces
 * @throws IOException
 */
public static String analyze(String text, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    StringBuilder stBld = new StringBuilder();
    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        String token = termAttribute.toString();
        stBld.append(token).append(" ");
    }
    stream.end();
    stream.close();
    return stBld.toString();
}
From source file:org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.java
License:Apache License
/**
 * NOTE: this method closes the TokenStream, even on exception, which is awkward
 * because really the caller who called {@link Analyzer#tokenStream} should close it,
 * but when trying that there are recursion issues when we try to use the same
 * TokenStream twice in the same recursion...
 */
public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
    int numTokens = 0;
    boolean success = false;
    try {
        stream.reset();
        consumer.reset(stream);
        while (stream.incrementToken()) {
            consumer.nextToken();
            numTokens++;
        }
        consumer.end();
        success = true;
    } finally {
        if (success) {
            stream.close();
        } else {
            IOUtils.closeWhileHandlingException(stream);
        }
    }
    return numTokens;
}
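For context, a hedged sketch of how a caller might drive analyze() with a minimal consumer. It assumes only the TokenConsumer contract visible in the method above (reset(TokenStream), nextToken(), end()), and additionally assumes TokenConsumer is a nested class of DirectCandidateGenerator whose only abstract method is nextToken(); the CountingConsumer class, the countTokens() helper, and the "f" field name are hypothetical:

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGenerator;
    import org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.TokenConsumer;

    public class AnalyzeUsageSketch {
        // Hypothetical consumer: counts the tokens it is fed; relies only on
        // the reset()/nextToken()/end() calls that analyze() makes on it.
        static class CountingConsumer extends TokenConsumer {
            int seen = 0;
            @Override
            public void nextToken() {
                seen++;   // a real consumer would read attributes captured in reset()
            }
        }

        static int countTokens(Analyzer analyzer, String text) throws IOException {
            // analyze() takes ownership of the stream and closes it, even on exception,
            // so the caller does not wrap this in try-with-resources
            TokenStream stream = analyzer.tokenStream("f", text);
            return DirectCandidateGenerator.analyze(stream, new CountingConsumer());
        }
    }

The success flag in analyze() exists so that a close() failure on the happy path propagates to the caller, while a close() failure during exception handling is suppressed (via IOUtils.closeWhileHandlingException) rather than masking the original exception.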
From source file:org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizerTest.java
License:Apache License
@Test
public void testNullSynonyms() throws Exception {
    Analyzer a = new NGramSynonymTokenizerTestAnalyzer(1);
    TokenStream stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,1,1/,1,2,1/,2,3,1/,3,4,1/,4,5,1/,5,6,1");
    stream.close();

    a = new NGramSynonymTokenizerTestAnalyzer(2);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,2,1/,1,3,1/,2,4,1/,3,5,1/,4,6,1");
    stream.close();

    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,1,1");
    stream.close();

    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,2,1");
    stream.close();

    a = new NGramSynonymTokenizerTestAnalyzer(3);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,3,1/,1,4,1/,2,5,1/,3,6,1");
    stream.close();

    a = new NGramSynonymTokenizerTestAnalyzer(4);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,4,1/,1,5,1/,2,6,1");
    stream.close();

    a = new NGramSynonymTokenizerTestAnalyzer(5);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,5,1/,1,6,1");
    stream.close();

    a = new NGramSynonymTokenizerTestAnalyzer(6);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");
    stream.close();

    a = new NGramSynonymTokenizerTestAnalyzer(7);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");
    stream.close();

    a = new NGramSynonymTokenizerTestAnalyzer(8);
    stream = a.tokenStream("f", new StringReader(""));
    stream.reset();
    assertTokenStream(stream, ",0,6,1");
    stream.close();
}