List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. The method returns the instance of that attribute contained in the TokenStream's AttributeSource, and throws an IllegalArgumentException if the stream does not contain it (use addAttribute instead to create the attribute on demand).
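A minimal consumption loop for reference — a sketch against the standard Lucene analysis API, where the field name "body" and the sample text are placeholders:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "a quick brown fox")) {
            // The tokenizer registers CharTermAttribute, so getAttribute() will find it;
            // on a stream without it, getAttribute() throws IllegalArgumentException.
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset();                       // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                         // records the final offset state
        }                                     // try-with-resources closes the stream
    }
}

addAttribute is the safer choice when a stream's attribute set is unknown; getAttribute is appropriate when, as in the examples below, the tokenizer is known to provide the attribute.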
From source file:org.apache.tika.eval.tokens.TokenCounter.java
License:Apache License
private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;
    TokenStream ts = analyzer.tokenStream(field, content);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, MutableInt> tokenMap = map.get(field);
    if (tokenMap == null) {
        tokenMap = new HashMap<>();
        map.put(field, tokenMap);
    }
    while (ts.incrementToken()) {
        String token = termAtt.toString();
        MutableInt cnt = tokenMap.get(token);
        if (cnt == null) {
            cnt = new MutableInt(1);
            tokenMap.put(token, cnt);
        } else {
            cnt.increment();
        }
        totalTokens++;
    }
    ts.end();   // per the TokenStream contract, end() must be called before close()
    ts.close();
    int totalUniqueTokens = tokenMap.size();
    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;
    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    SummaryStatistics summaryStatistics = new SummaryStatistics();
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();
        p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < e.getValue().intValue(); i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }
    }
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }
    /*
    Collections.sort(allTokens);
    List<TokenIntPair> topNList = new ArrayList<>(topN);
    for (int i = 0; i < topN && i < allTokens.size(); i++) {
        topNList.add(allTokens.get(i));
    }
    */
    tokenStatistics.put(field, new TokenStatistics(totalUniqueTokens, totalTokens,
            queue.getArray(), ent, summaryStatistics));
}
From source file:org.apache.tika.eval.tokens.TokenCounterTest.java
License:Apache License
@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog ";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        Integer count = tokens.get(t);
        count = (count == null) ? 0 : count;
        count++;
        tokens.put(t, count);
    }
    ts.end();
    ts.close();
    assertEquals(7, tokens.size());
    assertEquals(Integer.valueOf(1), tokens.get(""));
}
From source file:org.apache.uima.lucas.indexer.analysis.TokenStreamMerger.java
License:Apache License
@Override
public boolean incrementToken() throws IOException {
    if (!initialized)
        init();
    if (sortedStreams.size() == 0)
        return false;
    TokenStream currentTokenStream = sortedStreams.pop();
    restoreState(currentTokenStream.captureState());
    OffsetAttribute offsetAttr = currentTokenStream.getAttribute(OffsetAttribute.class);
    if (offsetAttr.startOffset() == currentOffset)
        posIncAtt.setPositionIncrement(0);
    else
        posIncAtt.setPositionIncrement(1);
    currentOffset = offsetAttr.startOffset();
    // advance this token stream to its next token and re-sort the stack
    if (currentTokenStream.incrementToken())
        sortedStreams.add(currentTokenStream);
    rebuildSortedTokens();
    return true;
}
From source file:org.apache.uima.lucas.ProspectiveSearchAE.java
License:Apache License
@Override
public void process(CAS aCAS) throws AnalysisEngineProcessException {
    // First create the index of the document text
    MemoryIndex index = new MemoryIndex();
    List fields = createDocument(aCAS).getFields();
    for (Iterator it = fields.iterator(); it.hasNext();) {
        Field field = (Field) it.next();
        if (field.isIndexed() && field.tokenStreamValue() != null) {
            index.addField(field.name(), field.tokenStreamValue());
        }
    }
    // Search all queries against the one-document index
    for (SearchQuery query : searchQueryProvider.getSearchQueries(aCAS)) {
        float score = index.search(query.query());
        if (score > matchingThreshold) {
            // Add a FS to the CAS with the search result
            FeatureStructure searchResult = aCAS.createFS(searchResultType);
            searchResult.setLongValue(searchResultIdFeature, query.id());
            aCAS.addFsToIndexes(searchResult);
            // Find matching tokens and link their annotations
            // in case the user wants search term highlighting
            if (searchResultMatchingTextFeature != null) {
                fields = createDocument(aCAS).getFields();
                for (Iterator it = fields.iterator(); it.hasNext();) {
                    Field field = (Field) it.next();
                    if (field.isIndexed() && field.tokenStreamValue() != null) {
                        TokenStream tokenStream = field.tokenStreamValue();
                        Collection<AnnotationFS> matchingTextAnnotations = new LinkedList<AnnotationFS>();
                        QueryScorer scorer = new QueryScorer(query.query(), field.name());
                        scorer.startFragment(new TextFragment(new StringBuffer(aCAS.getDocumentText()), 0, 0));
                        try {
                            scorer.init(tokenStream);
                            OffsetAttribute offsetAttr = null;
                            while (tokenStream.incrementToken()) {
                                offsetAttr = tokenStream.getAttribute(OffsetAttribute.class);
                                float tokenScore = scorer.getTokenScore();
                                if (tokenScore > 0) {
                                    AnnotationFS annotation = aCAS.createAnnotation(matchingTextType,
                                            offsetAttr.startOffset(), offsetAttr.endOffset());
                                    matchingTextAnnotations.add(annotation);
                                }
                            }
                        } catch (IOException e) {
                            throw new AnalysisEngineProcessException(e);
                        }
                        ArrayFS matchingTextArray = aCAS.createArrayFS(matchingTextAnnotations.size());
                        int matchingTextArrayIndex = 0;
                        for (AnnotationFS matchingTextAnnotation : matchingTextAnnotations) {
                            matchingTextArray.set(matchingTextArrayIndex++, matchingTextAnnotation);
                        }
                        searchResult.setFeatureValue(searchResultMatchingTextFeature, matchingTextArray);
                    }
                }
            }
        }
    }
}
From source file:org.apache.usergrid.utils.IndexUtils.java
License:Apache License
public static List<String> keywords(String source) {
    // Note: TermAttribute is the pre-Lucene-4.0 API; later versions use CharTermAttribute.
    TokenStream ts = analyzer.tokenStream("keywords", new StringReader(source));
    List<String> keywords = new ArrayList<String>();
    try {
        while (ts.incrementToken()) {
            keywords.add(ts.getAttribute(TermAttribute.class).term());
        }
    } catch (IOException e) {
        LOG.error("Error getting keywords ", e);
    }
    return keywords;
}
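TermAttribute and its term() method were removed in Lucene 4.0 in favor of CharTermAttribute. On newer Lucene versions the equivalent loop might look like this sketch (the analyzer and LOG fields are placeholders standing in for the originals above):

public static List<String> keywords(String source) {
    List<String> keywords = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream("keywords", new StringReader(source))) {
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        ts.reset();                          // required before incrementToken() on modern Lucene
        while (ts.incrementToken()) {
            keywords.add(termAtt.toString());
        }
        ts.end();
    } catch (IOException e) {
        LOG.error("Error getting keywords ", e);
    }
    return keywords;
}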
From source file:org.apdplat.evaluation.impl.SmartCNEvaluation.java
License:Open Source License
private static String segText(String text) {
    StringBuilder result = new StringBuilder();
    try {
        TokenStream tokenStream = SMART_CHINESE_ANALYZER.tokenStream("text", new StringReader(text));
        // the same attribute instance is returned on every call, so fetch it once
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.append(charTermAttribute.toString()).append(" ");
        }
        tokenStream.end();
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result.toString();
}
From source file:org.apdplat.word.elasticsearch.ChineseWordIndicesAnalysisTest.java
License:Open Source License
@Test
public void testChineseWordIndicesAnalysis() throws IOException {
    Index index = new Index("test");
    Injector parentInjector = new ModulesBuilder()
            .add(new SettingsModule(SETTINGS), new EnvironmentModule(new Environment(SETTINGS)),
                    new IndicesAnalysisModule())
            .createInjector();
    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, SETTINGS), new IndexNameModule(index),
                    new AnalysisModule(SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class))
                            .addProcessor(new ChineseWordAnalysisBinderProcessor()))
            .createChildInjector(parentInjector);
    AnalysisService analysisService = injector.getInstance(AnalysisService.class);
    TokenizerFactory tokenizerFactory = analysisService.tokenizer("word");
    boolean match = (tokenizerFactory instanceof ChineseWordTokenizerFactory);
    assertTrue(match);
    Tokenizer tokenizer = tokenizerFactory.create(new StringReader("?"));
    String exp = "[, ?]";
    List<String> result = new ArrayList<>();
    while (tokenizer.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
        result.add(charTermAttribute.toString());
    }
    assertEquals(exp, result.toString());
    Analyzer analyzer = analysisService.analyzer("word").analyzer();
    match = (analyzer instanceof ChineseWordAnalyzer);
    assertTrue(match);
    TokenStream tokenStream = analyzer.tokenStream("text", "??APDPlat???");
    exp = "[??, apdplat, , ??, ?, ]";
    result = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        result.add(charTermAttribute.toString());
    }
    assertEquals(exp, result.toString());
}
From source file:org.apdplat.word.lucene.ChineseWordAnalyzer.java
License:Open Source License
public static void main(String args[]) throws IOException {
    Analyzer analyzer = new ChineseWordAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("text", "??APDPlat???");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();
    tokenStream = analyzer.tokenStream("text", "word????????ysc");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();
    tokenStream = analyzer.tokenStream("text", "5?");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();
}
From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java
License:Open Source License
@Test
public void test1() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??APDPlat???");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[??, , apdplat, , ?, ?, ?, , ]";
        if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
            expResult = "[??, , apdplat, , , ?, ?, ?, , ]";
        }
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}
From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java
License:Open Source License
@Test
public void test2() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[??, , , , , , ]";
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}