List of usage examples for org.apache.lucene.analysis.TokenStream.reset()
public void reset() throws IOException
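Before the examples, here is a minimal sketch of the reset() contract: a TokenStream must be reset() exactly once before the first call to incrementToken(), then end() and close() after the stream is consumed. This sketch assumes a recent Lucene (4.x or later); StandardAnalyzer and the class, field, and text values are purely illustrative.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetLifecycleExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("body", "a quick demo of the reset contract");
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // advances to the next token; false at end of stream
                System.out.println(termAtt.toString());
            }
            ts.end();                     // records end-of-stream state (e.g. final offset)
            ts.close();                   // releases underlying resources
        }
    }
}

The examples that follow all use this same cycle, though several of the older ones order end() and close() differently or omit them.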
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testGeneral() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer general = analyzerManager.getGeneralAnalyzer();
    TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("the"));
    assertTrue(seen.contains("and"));
    assertTrue(seen.contains("dog"));
}
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testCommon() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer common = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
            fail("Shouldn't have found a numeric");
        }
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("dirty"));
    assertFalse(seen.contains("the"));
}
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testTokenCountFilter() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 101000; i++) {
        sb.append("the ");
    }
    TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
    ts.reset();
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
    }
    ts.end();
    ts.close();

    // The analyzer's token-count filter caps the stream at 100,000 tokens,
    // even though 101,000 were fed in.
    assertEquals(100000, tokens);
}
From source file:org.apache.tika.eval.tokens.TokenCounter.java
License:Apache License
private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;
    TokenStream ts = analyzer.tokenStream(field, content);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, MutableInt> tokenMap = map.get(field);
    if (tokenMap == null) {
        tokenMap = new HashMap<>();
        map.put(field, tokenMap);
    }
    while (ts.incrementToken()) {
        String token = termAtt.toString();
        MutableInt cnt = tokenMap.get(token);
        if (cnt == null) {
            cnt = new MutableInt(1);
            tokenMap.put(token, cnt);
        } else {
            cnt.increment();
        }
        totalTokens++;
    }
    ts.end();   // end() must precede close()
    ts.close();

    int totalUniqueTokens = tokenMap.size();
    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;
    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    SummaryStatistics summaryStatistics = new SummaryStatistics();
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();
        p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < e.getValue().intValue(); i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }
    }
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }
    /*
    Collections.sort(allTokens);
    List<TokenIntPair> topNList = new ArrayList<>(topN);
    for (int i = 0; i < topN && i < allTokens.size(); i++) {
        topNList.add(allTokens.get(i));
    }
    */
    tokenStatistics.put(field,
            new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));
}
From source file:org.apache.tika.eval.tokens.TokenCounterTest.java
License:Apache License
@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog ";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        Integer count = tokens.get(t);
        count = (count == null) ? 0 : count;
        count++;
        tokens.put(t, count);
    }
    ts.end();
    ts.close();

    assertEquals(7, tokens.size());
    assertEquals(Integer.valueOf(1), tokens.get(""));
}
From source file:org.apache.uima.lucas.indexer.analysis.TokenStreamConcatenator.java
License:Apache License
public void reset() throws IOException {
    for (TokenStream tokenStream : tokenStreams) {
        tokenStream.reset();
    }
    tokenStreamIterator = tokenStreams.iterator();
    if (tokenStreamIterator.hasNext()) {
        currentTokenStream = tokenStreamIterator.next();
    } else {
        currentTokenStream = null;
    }
}
From source file:org.apache.uima.lucas.indexer.analysis.TokenStreamMerger.java
License:Apache License
private void init() throws IOException {
    for (TokenStream stream : streams) {
        stream.reset();
        stream.incrementToken();
        sortedStreams.add(stream);
    }
    rebuildSortedTokens();
    initialized = true;
}
From source file:org.apache.uima.lucas.indexer.analysis.TokenStreamMerger.java
License:Apache License
public void reset() throws IOException {
    for (TokenStream stream : streams) {
        stream.reset();
    }
    currentOffset = -1;
    sortedStreams.clear();
    initialized = false;
}
From source file:org.apache.uima.lucas.indexer.FieldBuilder.java
License:Apache License
protected Collection<Field> createStoredFields(String fieldName, TokenStream tokenStream, Store fieldStore,
        String delimiter, Boolean unique, Index fieldIndex, Boolean omitTF) throws FieldBuildingException {
    Collection<Field> fields = new ArrayList<Field>();
    // Only do indexing if we need a unique, indexed AND stored field.
    Index index = unique ? fieldIndex : Field.Index.NO;
    try {
        Field field;
        if (delimiter != null) {
            String value = tokenStreamStringConcatenator.tokenStreamToStringWithDelimiter(tokenStream, delimiter);
            field = new Field(fieldName, value, fieldStore, index);
            if (unique) {
                field.setOmitTermFreqAndPositions(omitTF);
            }
            fields.add(field);
        } else {
            Token nextToken = tokenStream.next(new Token());
            while (nextToken != null) {
                String value = new String(nextToken.termBuffer(), 0, nextToken.termLength());
                field = new Field(fieldName, value, fieldStore, index);
                if (unique) {
                    field.setOmitTermFreqAndPositions(omitTF);
                }
                fields.add(field);
                nextToken = tokenStream.next(nextToken);
            }
        }
        tokenStream.reset();
    } catch (IOException e) {
        throw createException(e);
    }
    return fields;
}
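Note that this example uses the pre-2.9 Token-based consumption API (tokenStream.next(...), termBuffer()), which later Lucene versions removed. As a hedged sketch only, the inner loop could be rewritten against the attribute-based API as below; this assumes Lucene 3.1 through 3.6, where CharTermAttribute coexists with this Field constructor, and the helper method name and parameters are hypothetical, mirroring the example's variables.

// Sketch of an attribute-based replacement for the Token-based loop above.
// Assumes Lucene 3.1-3.6; method name and parameters are illustrative.
private static void addTokenFields(Collection<Field> fields, String fieldName, TokenStream tokenStream,
        Field.Store fieldStore, Field.Index index, boolean unique, boolean omitTF) throws IOException {
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset(); // attribute API: reset before iterating, not after
    while (tokenStream.incrementToken()) {
        Field field = new Field(fieldName, termAtt.toString(), fieldStore, index);
        if (unique) {
            field.setOmitTermFreqAndPositions(omitTF);
        }
        fields.add(field);
    }
    tokenStream.end();
    tokenStream.close();
}

One behavioral difference worth noting: the original snippet calls reset() after consuming the stream, whereas the attribute-based contract requires reset() before the first incrementToken().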
From source file:org.apdplat.evaluation.impl.SmartCNEvaluation.java
License:Open Source License
private static String segText(String text) {
    StringBuilder result = new StringBuilder();
    try {
        TokenStream tokenStream = SMART_CHINESE_ANALYZER.tokenStream("text", new StringReader(text));
        // Fetch the attribute once, outside the loop; it is reused for every token.
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.append(charTermAttribute.toString()).append(" ");
        }
        tokenStream.end();
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result.toString();
}
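For reference, here is a sketch of the same segmentation loop using try-with-resources (Java 7+), which guarantees close() on every path, including when incrementToken() throws. The method name segTextSafely is hypothetical; SMART_CHINESE_ANALYZER is the analyzer from the example above, and this assumes a Lucene version where TokenStream implements Closeable (4.x or later).

// Sketch: try-with-resources variant of segText above (names are illustrative).
private static String segTextSafely(String text) throws IOException {
    StringBuilder result = new StringBuilder();
    try (TokenStream tokenStream = SMART_CHINESE_ANALYZER.tokenStream("text", new StringReader(text))) {
        CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.append(termAtt.toString()).append(' ');
        }
        tokenStream.end(); // close() is handled by try-with-resources
    }
    return result.toString();
}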