List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException
This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API).
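Before the project examples below, here is a minimal sketch of where end() fits in the usual consumption sequence (reset, incrementToken loop, end, close). The class name, field name, sample text, and choice of StandardAnalyzer are placeholders for illustration only, assuming a recent Lucene release where Analyzer and TokenStream are closeable; they are not taken from any of the examples on this page.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamEndSketch {
    public static void main(String[] args) throws IOException {
        // Field name and sample text are arbitrary placeholders.
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("body", "a quick example sentence")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                  // must be called before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                    // called once incrementToken() has returned false
        }                                // try-with-resources closes the stream and the analyzer
    }
}

end() records end-of-stream state, such as the final offset, after incrementToken() has returned false; close() then releases the stream's resources, handled here by try-with-resources.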
From source file: org.apache.solr.update.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
License: Apache License
@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    final FieldNameSelector srcSelector = getSourceSelector();
    return new UpdateRequestProcessor(next) {
        private final NLPNERTaggerOp nerTaggerOp;
        private Analyzer analyzer = null;

        {
            try {
                nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
                FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
                if (fieldType == null) {
                    throw new SolrException(SERVER_ERROR,
                            ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType + "' not found in the schema.");
                }
                analyzer = fieldType.getIndexAnalyzer();
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        @Override
        public void processAdd(AddUpdateCommand cmd) throws IOException {
            final SolrInputDocument doc = cmd.getSolrInputDocument();

            // Destination may be regex replace string, or "{EntityType}" replaced by
            // each entity's type, both of which can cause multiple output fields.
            Map<String, SolrInputField> destMap = new HashMap<>();

            // preserve initial values
            for (final String fname : doc.getFieldNames()) {
                if (!srcSelector.shouldMutate(fname))
                    continue;

                Collection<Object> srcFieldValues = doc.getFieldValues(fname);
                if (srcFieldValues == null || srcFieldValues.isEmpty())
                    continue;

                String resolvedDest = dest;

                if (pattern != null) {
                    Matcher matcher = pattern.matcher(fname);
                    if (matcher.find()) {
                        resolvedDest = matcher.replaceAll(dest);
                    } else {
                        log.debug("srcSelector.shouldMutate(\"{}\") returned true, "
                                + "but replacement pattern did not match, field skipped.", fname);
                        continue;
                    }
                }

                for (Object val : srcFieldValues) {
                    for (Pair<String, String> entity : extractTypedNamedEntities(val)) {
                        SolrInputField destField = null;
                        String entityName = entity.first();
                        String entityType = entity.second();
                        resolvedDest = resolvedDest.replace(ENTITY_TYPE, entityType);
                        if (doc.containsKey(resolvedDest)) {
                            destField = doc.getField(resolvedDest);
                        } else {
                            SolrInputField targetField = destMap.get(resolvedDest);
                            if (targetField == null) {
                                destField = new SolrInputField(resolvedDest);
                            } else {
                                destField = targetField;
                            }
                        }
                        destField.addValue(entityName);
                        // put it in map to avoid concurrent modification...
                        destMap.put(resolvedDest, destField);
                    }
                }
            }

            for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
                doc.put(entry.getKey(), entry.getValue());
            }

            super.processAdd(cmd);
        }

        /** Using configured NER model, extracts (name, type) pairs from the given source field value */
        private List<Pair<String, String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
            List<Pair<String, String>> entitiesWithType = new ArrayList<>();
            List<String> terms = new ArrayList<>();
            List<Integer> startOffsets = new ArrayList<>();
            List<Integer> endOffsets = new ArrayList<>();
            String fullText = srcFieldValue.toString();
            TokenStream tokenStream = analyzer.tokenStream("", fullText);
            CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
            FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
            tokenStream.reset();
            synchronized (nerTaggerOp) {
                while (tokenStream.incrementToken()) {
                    terms.add(termAtt.toString());
                    startOffsets.add(offsetAtt.startOffset());
                    endOffsets.add(offsetAtt.endOffset());
                    boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
                    if (endOfSentence) { // extract named entities one sentence at a time
                        extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                    }
                }
                tokenStream.end();
                tokenStream.close();
                if (!terms.isEmpty()) {
                    // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
                    extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                }
                nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
            }
            return entitiesWithType;
        }

        private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
                List<Integer> endOffsets, List<Pair<String, String>> entitiesWithType) {
            for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
                String text = fullText.substring(startOffsets.get(span.getStart()),
                        endOffsets.get(span.getEnd() - 1));
                entitiesWithType.add(new Pair<>(text, span.getType()));
            }
            terms.clear();
            startOffsets.clear();
            endOffsets.clear();
        }
    };
}
From source file: org.apache.tika.eval.AnalyzerManagerTest.java
License: Apache License
@Test
public void testGeneral() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer general = analyzerManager.getGeneralAnalyzer();
    TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("the"));
    assertTrue(seen.contains("and"));
    assertTrue(seen.contains("dog"));
}
From source file: org.apache.tika.eval.AnalyzerManagerTest.java
License: Apache License
@Test
public void testCommon() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer common = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
            fail("Shouldn't have found a numeric");
        }
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("dirty"));
    assertFalse(seen.contains("the"));
}
From source file: org.apache.tika.eval.tokens.TokenCounter.java
License: Apache License
private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;

    TokenStream ts = analyzer.tokenStream(field, content);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, MutableInt> tokenMap = map.get(field);
    if (tokenMap == null) {
        tokenMap = new HashMap<>();
        map.put(field, tokenMap);
    }
    while (ts.incrementToken()) {
        String token = termAtt.toString();
        MutableInt cnt = tokenMap.get(token);
        if (cnt == null) {
            cnt = new MutableInt(1);
            tokenMap.put(token, cnt);
        } else {
            cnt.increment();
        }
        totalTokens++;
    }
    ts.close();
    ts.end();

    int totalUniqueTokens = tokenMap.size();

    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;

    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    SummaryStatistics summaryStatistics = new SummaryStatistics();
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();
        p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < e.getValue().intValue(); i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }
    }
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }

    /*
    Collections.sort(allTokens);
    List<TokenIntPair> topNList = new ArrayList<>(topN);
    for (int i = 0; i < topN && i < allTokens.size(); i++) {
        topNList.add(allTokens.get(i));
    }
    */

    tokenStatistics.put(field,
            new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));
}
From source file: org.apache.tika.eval.tokens.TokenCounterTest.java
License: Apache License
@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog ";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        Integer count = tokens.get(t);
        count = (count == null) ? count = 0 : count;
        count++;
        tokens.put(t, count);
    }
    ts.end();
    ts.close();
    assertEquals(7, tokens.size());
    assertEquals(new Integer(1), tokens.get(""));
}
From source file: org.betaconceptframework.astroboa.model.impl.query.xpath.XPathUtils.java
License: Open Source License
private static String analyzeTextToFind(String textToFind) throws IOException {
    // Filter textToFind through GreekAnalyzer
    TokenStream stream = greekAnalyzer.tokenStream("", new StringReader(textToFind));
    stream.reset();

    StringBuilder analyzedTextTofind = new StringBuilder();

    try {
        while (stream.incrementToken()) {
            String term = stream.getAttribute(TermAttribute.class).term();
            analyzedTextTofind.append(term);
            analyzedTextTofind.append(" ");
        }
    } catch (IOException e) {
        e.printStackTrace();
        analyzedTextTofind.append(textToFind);
    } finally {
        stream.end();
        stream.close();
    }

    String result = analyzedTextTofind.toString().trim();

    if (StringUtils.isBlank(result))
        return textToFind;

    return result;
}
From source file: org.chombo.util.BasicUtils.java
License: Apache License
/**
 * Analyzes text and return analyzed text
 * @param text
 * @return
 * @throws IOException
 */
public static String analyze(String text, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    StringBuilder stBld = new StringBuilder();

    stream.reset();
    CharTermAttribute termAttribute = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        String token = termAttribute.toString();
        stBld.append(token).append(" ");
    }
    stream.end();
    stream.close();
    return stBld.toString();
}
From source file: org.cosmo.common.util.WordUtil.java
License: Apache License
public static void main(String[] args) throws Exception {
    StringReader reader = new StringReader(
            "CNN, CNN news, CNN.com, CNN TV, news, news online, breaking news, U.S. news, world news, weather, business, CNN Money, sports, politics, law, technology, entertainment, education, travel, health, special reports, autos, developing story, news video, CNN Intl");

    /*
    LetterTokenizer tokenizer = new LetterTokenizer(reader);
    AttributeSource filter = new StopFilter(true, tokenizer, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    while (filter.hasAttributes()) {
        Attribute attribute = filter.captureState().
        System.out.println(attribute);
    }
    */

    StopAnalyzer analyzer = new StopAnalyzer(Index.Version);
    Set<String> uniqueTerms = new HashSet();
    TokenStream tokenStream = analyzer.reusableTokenStream("anyting", reader);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        TermAttribute term = tokenStream.getAttribute(TermAttribute.class);
        uniqueTerms.add(term.term());
    }
    tokenStream.end();
    tokenStream.close();

    System.out.println(Arrays.toString(uniqueTerms.toArray()));
}
From source file: org.dbpedia.spotlight.lucene.analysis.NGramAnalyzer.java
License: Apache License
public static void main(String[] args) throws IOException {
    String myString = "cancer";
    Analyzer analyzer = new NGramAnalyzer(3, 3);
    System.out.println("Analyzing: \"" + myString + "\"");
    StringReader reader = new StringReader(myString);
    TokenStream stream = analyzer.tokenStream("field", reader);
    // TokenStream stream = new NGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK, 1, 2);
    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println("token: " + stream);
    }
    stream.end();
    stream.close();
}
From source file: org.dbpedia.spotlight.lucene.analysis.PhoneticAnalyzer.java
License: Apache License
public static void main(String[] args) throws IOException {
    String myString = "cancer";
    Analyzer analyzer = new PhoneticAnalyzer(Version.LUCENE_36, SpotlightConfiguration.DEFAULT_STOPWORDS);
    System.out.println("Analyzing: \"" + myString + "\"");
    StringReader reader = new StringReader(myString);
    TokenStream stream = analyzer.tokenStream("field", reader);
    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println("token: " + stream);
    }
    stream.end();
    stream.close();
}