List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
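Before the examples, here is a minimal sketch of the standard consumer workflow that incrementToken() belongs to (reset, loop, end, close). The analyzer choice (StandardAnalyzer), field name, and input text are illustrative placeholders, not taken from any of the examples below.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("body", "A quick example sentence.");
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // required before the first incrementToken() call
            while (ts.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            ts.end();                     // records end-of-stream state (e.g. final offsets)
            ts.close();                   // releases underlying resources
        }
    }
}

Every example on this page follows this reset → incrementToken → end → close contract; in modern Lucene, skipping reset() typically triggers an IllegalStateException from the tokenizer.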
From source file:org.apache.solr.spelling.SpellingQueryConverter.java
License:Apache License
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); // overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}
From source file:org.apache.solr.TestTrie.java
License:Apache License
@Test
public void testTokenizer() throws Exception {
    FieldType type = h.getCore().getLatestSchema().getFieldType("tint");
    assertTrue(type instanceof TrieField);

    String value = String.valueOf(random().nextInt());
    TokenStream ts = type.getAnalyzer().tokenStream("dummy", value);
    OffsetAttribute ofsAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
        count++;
        assertEquals(0, ofsAtt.startOffset());
        assertEquals(value.length(), ofsAtt.endOffset());
    }
    final int precStep = ((TrieField) type).getPrecisionStep();
    assertEquals((32 + precStep - 1) / precStep, count);
    ts.end();
    assertEquals(value.length(), ofsAtt.startOffset());
    assertEquals(value.length(), ofsAtt.endOffset());
    ts.close();

    // Test empty one:
    ts = type.getAnalyzer().tokenStream("dummy", "");
    ts.reset();
    assertFalse(ts.incrementToken());
    ts.end();
    assertEquals(0, ofsAtt.startOffset());
    assertEquals(0, ofsAtt.endOffset());
    ts.close();
}
From source file:org.apache.solr.update.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
License:Apache License
@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    final FieldNameSelector srcSelector = getSourceSelector();
    return new UpdateRequestProcessor(next) {
        private final NLPNERTaggerOp nerTaggerOp;
        private Analyzer analyzer = null;
        {
            try {
                nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
                FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
                if (fieldType == null) {
                    throw new SolrException(SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType
                            + "' not found in the schema.");
                }
                analyzer = fieldType.getIndexAnalyzer();
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        @Override
        public void processAdd(AddUpdateCommand cmd) throws IOException {
            final SolrInputDocument doc = cmd.getSolrInputDocument();

            // Destination may be regex replace string, or "{EntityType}" replaced by
            // each entity's type, both of which can cause multiple output fields.
            Map<String, SolrInputField> destMap = new HashMap<>();

            // preserve initial values
            for (final String fname : doc.getFieldNames()) {
                if (!srcSelector.shouldMutate(fname))
                    continue;

                Collection<Object> srcFieldValues = doc.getFieldValues(fname);
                if (srcFieldValues == null || srcFieldValues.isEmpty())
                    continue;

                String resolvedDest = dest;

                if (pattern != null) {
                    Matcher matcher = pattern.matcher(fname);
                    if (matcher.find()) {
                        resolvedDest = matcher.replaceAll(dest);
                    } else {
                        log.debug("srcSelector.shouldMutate(\"{}\") returned true, "
                                + "but replacement pattern did not match, field skipped.", fname);
                        continue;
                    }
                }

                for (Object val : srcFieldValues) {
                    for (Pair<String, String> entity : extractTypedNamedEntities(val)) {
                        SolrInputField destField = null;
                        String entityName = entity.first();
                        String entityType = entity.second();
                        resolvedDest = resolvedDest.replace(ENTITY_TYPE, entityType);
                        if (doc.containsKey(resolvedDest)) {
                            destField = doc.getField(resolvedDest);
                        } else {
                            SolrInputField targetField = destMap.get(resolvedDest);
                            if (targetField == null) {
                                destField = new SolrInputField(resolvedDest);
                            } else {
                                destField = targetField;
                            }
                        }
                        destField.addValue(entityName);
                        // put it in map to avoid concurrent modification...
                        destMap.put(resolvedDest, destField);
                    }
                }
            }

            for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
                doc.put(entry.getKey(), entry.getValue());
            }
            super.processAdd(cmd);
        }

        /** Using configured NER model, extracts (name, type) pairs from the given source field value */
        private List<Pair<String, String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
            List<Pair<String, String>> entitiesWithType = new ArrayList<>();
            List<String> terms = new ArrayList<>();
            List<Integer> startOffsets = new ArrayList<>();
            List<Integer> endOffsets = new ArrayList<>();
            String fullText = srcFieldValue.toString();
            TokenStream tokenStream = analyzer.tokenStream("", fullText);
            CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
            FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
            tokenStream.reset();
            synchronized (nerTaggerOp) {
                while (tokenStream.incrementToken()) {
                    terms.add(termAtt.toString());
                    startOffsets.add(offsetAtt.startOffset());
                    endOffsets.add(offsetAtt.endOffset());
                    boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
                    if (endOfSentence) { // extract named entities one sentence at a time
                        extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                    }
                }
                tokenStream.end();
                tokenStream.close();
                if (!terms.isEmpty()) {
                    // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
                    extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                }
                nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
            }
            return entitiesWithType;
        }

        private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
                List<Integer> endOffsets, List<Pair<String, String>> entitiesWithType) {
            for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
                String text = fullText.substring(startOffsets.get(span.getStart()),
                        endOffsets.get(span.getEnd() - 1));
                entitiesWithType.add(new Pair<>(text, span.getType()));
            }
            terms.clear();
            startOffsets.clear();
            endOffsets.clear();
        }
    };
}
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testGeneral() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer general = analyzerManager.getGeneralAnalyzer();
    TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("the"));
    assertTrue(seen.contains("and"));
    assertTrue(seen.contains("dog"));
}
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testCommon() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer common = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
            fail("Shouldn't have found a numeric");
        }
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("dirty"));
    assertFalse(seen.contains("the"));
}
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testTokenCountFilter() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 101000; i++) {
        sb.append("the ");
    }
    TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
    ts.reset();
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
    }
    ts.end();
    ts.close();
    // The token count filter caps the stream at 100,000 tokens even though 101,000 were supplied.
    assertEquals(100000, tokens);
}
From source file:org.apache.tika.eval.tokens.TokenCounter.java
License:Apache License
private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;

    TokenStream ts = analyzer.tokenStream(field, content);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, MutableInt> tokenMap = map.get(field);
    if (tokenMap == null) {
        tokenMap = new HashMap<>();
        map.put(field, tokenMap);
    }
    while (ts.incrementToken()) {
        String token = termAtt.toString();
        MutableInt cnt = tokenMap.get(token);
        if (cnt == null) {
            cnt = new MutableInt(1);
            tokenMap.put(token, cnt);
        } else {
            cnt.increment();
        }
        totalTokens++;
    }
    ts.end(); // end() must be called before close(), not after it
    ts.close();

    int totalUniqueTokens = tokenMap.size();

    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;

    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    SummaryStatistics summaryStatistics = new SummaryStatistics();
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();
        p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < e.getValue().intValue(); i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }
    }
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }

    tokenStatistics.put(field,
            new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));
}
From source file:org.apache.tika.eval.tokens.TokenCounterTest.java
License:Apache License
@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog ";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        Integer count = tokens.get(t);
        count = (count == null) ? 0 : count;
        count++;
        tokens.put(t, count);
    }
    ts.end();
    ts.close();
    assertEquals(7, tokens.size());
    assertEquals(new Integer(1), tokens.get(""));
}
From source file:org.apache.uima.lucas.indexer.analysis.TokenStreamMerger.java
License:Apache License
private void init() throws IOException {
    for (TokenStream stream : streams) {
        stream.reset();
        stream.incrementToken();
        sortedStreams.add(stream);
    }

    rebuildSortedTokens();
    initialized = true;
}
From source file:org.apache.uima.lucas.indexer.analysis.TokenStreamMerger.java
License:Apache License
@Override
public boolean incrementToken() throws IOException {
    if (!initialized)
        init();

    if (sortedStreams.size() == 0)
        return false;

    TokenStream currentTokenStream = sortedStreams.pop();
    restoreState(currentTokenStream.captureState());

    OffsetAttribute offsetAttr = (OffsetAttribute) currentTokenStream.getAttribute(OffsetAttribute.class);
    if (offsetAttr.startOffset() == currentOffset)
        posIncAtt.setPositionIncrement(0);
    else
        posIncAtt.setPositionIncrement(1);
    currentOffset = offsetAttr.startOffset();

    // advance the token stream to its next token and resort the stack
    if (currentTokenStream.incrementToken())
        sortedStreams.add(currentTokenStream);
    rebuildSortedTokens();

    return true;
}
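The TokenStreamMerger above shows the producer side of the contract: an incrementToken() implementation that feeds tokens to downstream consumers. As a simpler sketch of that side, the following hypothetical TokenFilter (the class name and upper-casing logic are illustrative only, not taken from the sources above) rewrites each token it pulls from its input:

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Illustrative only: a filter that upper-cases each token produced by its input stream.
public final class UpperCaseSketchFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    public UpperCaseSketchFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false; // upstream is exhausted: propagate end of stream
        }
        // Mutate the shared term attribute in place; the consumer reads it
        // after this method returns true. (Naive char-wise upper-casing,
        // not locale- or surrogate-pair-aware.)
        char[] buffer = termAtt.buffer();
        for (int i = 0; i < termAtt.length(); i++) {
            buffer[i] = Character.toUpperCase(buffer[i]);
        }
        return true;
    }
}

The design point visible in both this sketch and TokenStreamMerger is that incrementToken() never returns a token object: it mutates shared attributes in place and returns true, which is what makes the attribute-based API allocation-free per token.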