List of usage examples for org.apache.lucene.analysis.TokenStream#close()
@Override public void close() throws IOException
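Before the individual examples, here is a minimal, self-contained sketch of the consume-then-close lifecycle that all of them follow: obtain the stream from an Analyzer, reset() it, loop over incrementToken(), call end(), and finally close(). This is only an illustrative sketch, not code from any of the projects below; it assumes a recent Lucene where StandardAnalyzer takes no constructor arguments, and the field name "body" and the sample text are placeholders.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseSketch {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new StandardAnalyzer()) {
      TokenStream ts = analyzer.tokenStream("body", "some text to analyze");
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      try {
        ts.reset();                 // mandatory before the first incrementToken()
        while (ts.incrementToken()) {
          System.out.println(termAtt.toString());
        }
        ts.end();                   // records end-of-stream offset/position state
      } finally {
        ts.close();                 // releases resources; required even if iteration fails
      }
    }
  }
}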
From source file:org.apache.solr.analysis.author.TestAdsabsTypeAuthorParsing.java
License:Apache License
@Override
public void setUp() throws Exception {
  super.setUp();

  assertU(adoc(F.ID, "1", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk,"));
  assertU(adoc(F.ID, "2", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, M."));
  assertU(adoc(F.ID, "3", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, Marel"));
  assertU(adoc(F.ID, "4", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, Molja"));
  assertU(adoc(F.ID, "5", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, Molja Karel"));
  assertU(adoc(F.ID, "6", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, M Karel"));
  assertU(adoc(F.ID, "7", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, Molja K"));
  assertU(adoc(F.ID, "8", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, M K"));
  assertU(adoc(F.ID, "9", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, Karel Molja"));
  assertU(adoc(F.ID, "10", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, Karel M"));
  assertU(adoc(F.ID, "11", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, K Molja"));

  assertU(adoc(F.ID, "20", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk,"));
  assertU(adoc(F.ID, "21", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, M."));
  assertU(adoc(F.ID, "22", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, Marel"));
  assertU(adoc(F.ID, "23", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, Molja"));
  assertU(adoc(F.ID, "24", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, Molja Karel"));
  assertU(adoc(F.ID, "25", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, M Karel"));
  assertU(adoc(F.ID, "26", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, Molja K"));
  assertU(adoc(F.ID, "27", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, M K"));
  assertU(adoc(F.ID, "28", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, Karel Molja"));
  assertU(adoc(F.ID, "29", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, Karel M"));
  assertU(adoc(F.ID, "30", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, K Molja"));

  assertU(adoc(F.ID, "40", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk,"));
  assertU(adoc(F.ID, "41", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, M."));
  assertU(adoc(F.ID, "42", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, Marel"));
  assertU(adoc(F.ID, "43", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, Molja"));
  assertU(adoc(F.ID, "44", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, Molja Karel"));
  assertU(adoc(F.ID, "45", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, M Karel"));
  assertU(adoc(F.ID, "46", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, Molja K"));
  assertU(adoc(F.ID, "47", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, M K"));
  assertU(adoc(F.ID, "48", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, Karel Molja"));
  assertU(adoc(F.ID, "49", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, Karel M"));
  assertU(adoc(F.ID, "50", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, K Molja"));

  assertU(adoc(F.ID, "60", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk,"));
  assertU(adoc(F.ID, "61", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, M."));
  assertU(adoc(F.ID, "62", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, Marel"));
  assertU(adoc(F.ID, "63", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, Molja"));
  assertU(adoc(F.ID, "64", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, Molja Karel"));
  assertU(adoc(F.ID, "65", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, M Karel"));
  assertU(adoc(F.ID, "66", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, Molja K"));
  assertU(adoc(F.ID, "67", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, M K"));
  assertU(adoc(F.ID, "68", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, Karel Molja"));
  assertU(adoc(F.ID, "69", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, Karel M"));
  assertU(adoc(F.ID, "70", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, K Molja"));

  assertU(adoc(F.ID, "80", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk,"));
  assertU(adoc(F.ID, "81", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, M."));
  assertU(adoc(F.ID, "82", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, Marel"));
  assertU(adoc(F.ID, "83", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, Molja"));
  assertU(adoc(F.ID, "84", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, Molja Karel"));
  assertU(adoc(F.ID, "85", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, M Karel"));
  assertU(adoc(F.ID, "86", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, Molja K"));
  assertU(adoc(F.ID, "87", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, M K"));
  assertU(adoc(F.ID, "88", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, Karel Molja"));
  assertU(adoc(F.ID, "89", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, Karel M"));
  assertU(adoc(F.ID, "90", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, K Molja"));

  assertU(adoc(F.ID, "100", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Müller, William"));
  assertU(adoc(F.ID, "101", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Mueller, William"));

  assertU(adoc(F.ID, "110", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, Christine"));
  assertU(adoc(F.ID, "111", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, C"));
  assertU(adoc(F.ID, "112", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Forman, Christine"));
  assertU(adoc(F.ID, "113", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Forman, C"));
  assertU(adoc(F.ID, "114", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, Christopher"));
  assertU(adoc(F.ID, "115", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, C"));
  assertU(adoc(F.ID, "116", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Forman, Christopher"));
  assertU(adoc(F.ID, "117", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Forman, C"));

  //"ALLEN, LYNNE;ALLEN, R LYNNE;JONES, LYNNE;JONES, R LYNNE"
  assertU(adoc(F.ID, "120", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Allen, Lynne"));
  assertU(adoc(F.ID, "121", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Allen, L"));
  assertU(adoc(F.ID, "122", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Allen, R Lynne"));
  assertU(adoc(F.ID, "123", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Allen, R L"));
  assertU(adoc(F.ID, "124", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, Lynne"));
  assertU(adoc(F.ID, "125", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, L"));
  assertU(adoc(F.ID, "126", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, R Lynne"));
  assertU(adoc(F.ID, "127", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, R L"));

  assertU(adoc(F.ID, "130", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Author, A", F.AUTHOR, "Author, B", F.AUTHOR, "Author, C"));

  assertU(adoc(F.ID, "200", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Lee, H C"));
  assertU(adoc(F.ID, "201", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Lee, H-C"));
  assertU(adoc(F.ID, "202", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Lee, Harwin-C"));
  assertU(adoc(F.ID, "203", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Lee, Harwin-Costa"));

  assertU(adoc(F.ID, "210", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Pinilla-Alonso")); // just surname
  assertU(adoc(F.ID, "211", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Pinilla-Alonso,"));
  assertU(adoc(F.ID, "212", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Pinilla-Alonso, B"));
  assertU(adoc(F.ID, "213", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Pinilla-Alonso, Brava"));
  assertU(adoc(F.ID, "214", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Pinilla-Alonso, Borat"));
  assertU(adoc(F.ID, "215", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Pinilla-Alonso, Amer"));

  assertU(adoc(F.ID, "220", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "van Dokkum"));
  assertU(adoc(F.ID, "221", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "van Dokkum,"));
  assertU(adoc(F.ID, "222", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "van Dokkum, H"));
  assertU(adoc(F.ID, "223", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "van Dokkum, Hector"));
  assertU(adoc(F.ID, "224", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "van Dokkum, Hiatus"));
  assertU(adoc(F.ID, "225", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "van Dokkum, Romulus"));

  assertU(adoc(F.ID, "230", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Böser", "first_author", "Böser, S"));
  assertU(adoc(F.ID, "231", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Böser, S"));
  assertU(adoc(F.ID, "232", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Boser, S"));
  assertU(adoc(F.ID, "233", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Boser,"));

  assertU(adoc(F.ID, "300", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Gopal-Krishna,"));
  assertU(adoc(F.ID, "301", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Gopal-Krishna, Jewell"));
  assertU(adoc(F.ID, "302", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Gopal-Krishna, J"));

  assertU(commit());

  // persist the transliteration map after new docs were indexed
  // and reload synonym chain harvested during indexing
  Analyzer iAnalyzer = h.getCore().getLatestSchema().getAnalyzer();
  Analyzer qAnalyzer = h.getCore().getLatestSchema().getQueryAnalyzer();

  TokenStream iAuthor = iAnalyzer.tokenStream("author", new StringReader(""));
  TokenStream qAuthor = qAnalyzer.tokenStream("author", new StringReader(""));

  iAuthor.close();
  qAuthor.close();

  // TODO: force reload of the synonym map
  //h.getCoreContainer().reload("collection1");
}
From source file:org.apache.solr.analysis.TestWordDelimiterFilter.java
License:Apache License
private void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[],
    int posIncs[]) throws Exception {
  TokenStream ts = a.tokenStream("dummy", new StringReader(input));
  TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
  OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
  PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);

  for (int i = 0; i < output.length; i++) {
    assertTrue(ts.incrementToken());
    assertEquals(output[i], termAtt.term());
    assertEquals(startOffsets[i], offsetAtt.startOffset());
    assertEquals(endOffsets[i], offsetAtt.endOffset());
    assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
  }

  assertFalse(ts.incrementToken());
  ts.close();
}
From source file:org.apache.solr.handler.ClassifyStream.java
License:Apache License
@Override
public Tuple read() throws IOException {
  if (modelTuple == null) {
    modelTuple = modelStream.read();
    if (modelTuple == null || modelTuple.EOF) {
      throw new IOException("Model tuple not found for classify stream!");
    }
    termToIndex = new HashMap<>();
    List<String> terms = modelTuple.getStrings("terms_ss");
    for (int i = 0; i < terms.size(); i++) {
      termToIndex.put(terms.get(i), i);
    }
    idfs = modelTuple.getDoubles("idfs_ds");
    modelWeights = modelTuple.getDoubles("weights_ds");
  }

  Tuple docTuple = docStream.read();
  if (docTuple.EOF) return docTuple;

  String text = docTuple.getString(field);
  double tfs[] = new double[termToIndex.size()];

  TokenStream tokenStream = analyzer.tokenStream(analyzerField, text);
  CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
  tokenStream.reset();

  int termCount = 0;
  while (tokenStream.incrementToken()) {
    termCount++;
    if (termToIndex.containsKey(termAtt.toString())) {
      tfs[termToIndex.get(termAtt.toString())]++;
    }
  }

  tokenStream.end();
  tokenStream.close();

  List<Double> tfidfs = new ArrayList<>(termToIndex.size());
  tfidfs.add(1.0);
  for (int i = 0; i < tfs.length; i++) {
    if (tfs[i] != 0) {
      tfs[i] = 1 + Math.log(tfs[i]);
    }
    tfidfs.add(this.idfs.get(i) * tfs[i]);
  }

  double total = 0.0;
  for (int i = 0; i < tfidfs.size(); i++) {
    total += tfidfs.get(i) * modelWeights.get(i);
  }

  double score = total * ((float) (1.0 / Math.sqrt(termCount)));
  double positiveProb = sigmoid(total);

  docTuple.put("probability_d", positiveProb);
  docTuple.put("score_d", score);

  return docTuple;
}
From source file:org.apache.solr.handler.component.WordCloudComponent.java
License:Apache License
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrQueryRequest req = rb.req;
  SolrParams params = req.getParams();
  if (!params.getBool(COMPONENT_NAME, true)) {
    return;
  }
  String wcFields = null;
  if ((wcFields = params.get("wordcloud.fl", null)) == null) {
    return;
  }

  Set<String> flds = new HashSet<String>(StrUtils.splitSmart(wcFields, ','));
  DocList ids = rb.getResults().docList;

  SolrIndexSearcher searcher = rb.req.getSearcher();
  IndexSchema schema = rb.req.getCore().getLatestSchema();

  final Analyzer analyzer = rb.req.getCore().getLatestSchema().getAnalyzer();
  final HashMap<String, FieldType> fieldsToLoad = new HashMap<String, FieldType>();

  CharTermAttribute termAtt;
  Map<String, Map<String, Integer>> tokens = new HashMap<String, Map<String, Integer>>();

  for (String f : flds) {
    SchemaField field = schema.getFieldOrNull(f);
    if (field == null || !field.stored()) {
      continue; // ignore this field
    }
    fieldsToLoad.put(f, field.getType());
    tokens.put(f, new HashMap<String, Integer>());
  }

  DocIterator iterator = ids.iterator();
  String w;
  Integer v;
  int sz = ids.size();
  for (int i = 0; i < sz; i++) {
    int id = iterator.nextDoc();

    Document doc = searcher.doc(id, fieldsToLoad.keySet());
    for (Entry<String, FieldType> en : fieldsToLoad.entrySet()) {
      Map<String, Integer> toks = tokens.get(en.getKey());
      String[] vals = doc.getValues(en.getKey());
      FieldType fType = en.getValue();

      if (vals != null) {
        for (String s : vals) {
          TokenStream buffer = analyzer.tokenStream(en.getKey(), new StringReader(fType.indexedToReadable(s)));

          if (!buffer.hasAttribute(CharTermAttribute.class)) {
            continue; // empty stream
          }

          termAtt = buffer.getAttribute(CharTermAttribute.class);
          buffer.reset();

          while (buffer.incrementToken()) {
            w = termAtt.toString();
            v = toks.get(w);
            if (v == null) v = 0;
            toks.put(w, ++v);
          }

          buffer.close();
        }
      }
    }
  }

  // TODO: filter out the tokens (use some sort of a range 0.1-0.9 by frequency)
  AtomicReader reader = searcher.getAtomicReader();
  BytesRef term;
  int df;
  String f;
  Map<String, Map<String, Double>> docFreqs = new HashMap<String, Map<String, Double>>();

  for (Entry<String, Map<String, Integer>> field : tokens.entrySet()) {
    HashMap<String, Double> idfs = new HashMap<String, Double>();
    f = field.getKey();
    docFreqs.put(f, idfs);
    int N = reader.getDocCount(f);

    for (Entry<String, Integer> token : field.getValue().entrySet()) {
      w = token.getKey();
      df = reader.docFreq(new Term(f, new BytesRef(w)));
      if (df != 0) {
        idfs.put(w, Math.log10(N / df));
      }
    }
  }

  HashMap<String, Object> ret = new HashMap<String, Object>();
  for (String fi : fieldsToLoad.keySet()) {
    HashMap<String, Object> va = new HashMap<String, Object>();
    va.put("tf", tokens.get(fi));
    va.put("idf", docFreqs.get(fi));
    ret.put(fi, va);
  }

  rb.rsp.add("wordcloud", ret);
}
From source file:org.apache.solr.legacy.TestLegacyFieldReuse.java
License:Apache License
private void assertNumericContents(int value, TokenStream ts) throws IOException {
  assertTrue(ts instanceof LegacyNumericTokenStream);
  LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
  ts.reset();
  boolean seen = false;
  while (ts.incrementToken()) {
    if (numericAtt.getShift() == 0) {
      assertEquals(value, numericAtt.getRawValue());
      seen = true;
    }
  }
  ts.end();
  ts.close();
  assertTrue(seen);
}
From source file:org.apache.solr.spelling.SpellingQueryConverter.java
License:Apache License
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
  TokenStream stream = analyzer.tokenStream("", text);
  // TODO: support custom attributes
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    Token token = new Token();
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
    token.setFlags(flagsAttValue); // overwriting any flags already set...
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    token.setPositionIncrement(posIncAtt.getPositionIncrement());
    result.add(token);
  }
  stream.end();
  stream.close();
}
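Since TokenStream implements java.io.Closeable, the explicit end()/close() pairs shown in these examples can also be guaranteed with try-with-resources. The sketch below is an illustrative variant of the same analyze-style loop, not code from SpellingQueryConverter; the analyzer variable, field name, and text are assumed placeholders.

// Variant sketch: try-with-resources guarantees close() even if incrementToken() throws.
// "analyzer" is assumed to be any org.apache.lucene.analysis.Analyzer instance.
try (TokenStream stream = analyzer.tokenStream("", text)) {
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    // consume termAtt here
  }
  stream.end();
}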
From source file:org.apache.solr.TestTrie.java
License:Apache License
@Test
public void testTokenizer() throws Exception {
  FieldType type = h.getCore().getLatestSchema().getFieldType("tint");
  assertTrue(type instanceof TrieField);

  String value = String.valueOf(random().nextInt());
  TokenStream ts = type.getAnalyzer().tokenStream("dummy", value);
  OffsetAttribute ofsAtt = ts.addAttribute(OffsetAttribute.class);
  ts.reset();
  int count = 0;
  while (ts.incrementToken()) {
    count++;
    assertEquals(0, ofsAtt.startOffset());
    assertEquals(value.length(), ofsAtt.endOffset());
  }
  final int precStep = ((TrieField) type).getPrecisionStep();
  assertEquals((32 + precStep - 1) / precStep, count);
  ts.end();
  assertEquals(value.length(), ofsAtt.startOffset());
  assertEquals(value.length(), ofsAtt.endOffset());
  ts.close();

  // Test empty one:
  ts = type.getAnalyzer().tokenStream("dummy", "");
  ts.reset();
  assertFalse(ts.incrementToken());
  ts.end();
  assertEquals(0, ofsAtt.startOffset());
  assertEquals(0, ofsAtt.endOffset());
  ts.close();
}
From source file:org.apache.solr.update.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
License:Apache License
@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
    UpdateRequestProcessor next) {
  final FieldNameSelector srcSelector = getSourceSelector();
  return new UpdateRequestProcessor(next) {
    private final NLPNERTaggerOp nerTaggerOp;
    private Analyzer analyzer = null;
    {
      try {
        nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
        FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
        if (fieldType == null) {
          throw new SolrException(SERVER_ERROR,
              ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType + "' not found in the schema.");
        }
        analyzer = fieldType.getIndexAnalyzer();
      } catch (IOException e) {
        throw new IllegalArgumentException(e);
      }
    }

    @Override
    public void processAdd(AddUpdateCommand cmd) throws IOException {
      final SolrInputDocument doc = cmd.getSolrInputDocument();

      // Destination may be regex replace string, or "{EntityType}" replaced by
      // each entity's type, both of which can cause multiple output fields.
      Map<String, SolrInputField> destMap = new HashMap<>();

      // preserve initial values
      for (final String fname : doc.getFieldNames()) {
        if (!srcSelector.shouldMutate(fname)) continue;

        Collection<Object> srcFieldValues = doc.getFieldValues(fname);
        if (srcFieldValues == null || srcFieldValues.isEmpty()) continue;

        String resolvedDest = dest;

        if (pattern != null) {
          Matcher matcher = pattern.matcher(fname);
          if (matcher.find()) {
            resolvedDest = matcher.replaceAll(dest);
          } else {
            log.debug("srcSelector.shouldMutate(\"{}\") returned true, "
                + "but replacement pattern did not match, field skipped.", fname);
            continue;
          }
        }

        for (Object val : srcFieldValues) {
          for (Pair<String, String> entity : extractTypedNamedEntities(val)) {
            SolrInputField destField = null;
            String entityName = entity.first();
            String entityType = entity.second();
            resolvedDest = resolvedDest.replace(ENTITY_TYPE, entityType);
            if (doc.containsKey(resolvedDest)) {
              destField = doc.getField(resolvedDest);
            } else {
              SolrInputField targetField = destMap.get(resolvedDest);
              if (targetField == null) {
                destField = new SolrInputField(resolvedDest);
              } else {
                destField = targetField;
              }
            }
            destField.addValue(entityName);

            // put it in map to avoid concurrent modification...
            destMap.put(resolvedDest, destField);
          }
        }
      }

      for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
        doc.put(entry.getKey(), entry.getValue());
      }

      super.processAdd(cmd);
    }

    /** Using configured NER model, extracts (name, type) pairs from the given source field value */
    private List<Pair<String, String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
      List<Pair<String, String>> entitiesWithType = new ArrayList<>();
      List<String> terms = new ArrayList<>();
      List<Integer> startOffsets = new ArrayList<>();
      List<Integer> endOffsets = new ArrayList<>();
      String fullText = srcFieldValue.toString();
      TokenStream tokenStream = analyzer.tokenStream("", fullText);
      CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
      FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
      tokenStream.reset();
      synchronized (nerTaggerOp) {
        while (tokenStream.incrementToken()) {
          terms.add(termAtt.toString());
          startOffsets.add(offsetAtt.startOffset());
          endOffsets.add(offsetAtt.endOffset());
          boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
          if (endOfSentence) { // extract named entities one sentence at a time
            extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
          }
        }
        tokenStream.end();
        tokenStream.close();
        if (!terms.isEmpty()) {
          // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
          extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
        }
        nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
      }
      return entitiesWithType;
    }

    private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
        List<Integer> endOffsets, List<Pair<String, String>> entitiesWithType) {
      for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
        String text = fullText.substring(startOffsets.get(span.getStart()), endOffsets.get(span.getEnd() - 1));
        entitiesWithType.add(new Pair<>(text, span.getType()));
      }
      terms.clear();
      startOffsets.clear();
      endOffsets.clear();
    }
  };
}
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testGeneral() throws Exception {
  AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
  Analyzer general = analyzerManager.getGeneralAnalyzer();
  TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
  ts.reset();
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  Set<String> seen = new HashSet<>();
  while (ts.incrementToken()) {
    seen.add(termAtt.toString());
  }
  ts.end();
  ts.close();

  assertTrue(seen.contains("the"));
  assertTrue(seen.contains("and"));
  assertTrue(seen.contains("dog"));
}
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testCommon() throws Exception {
  AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
  Analyzer common = analyzerManager.getCommonTokensAnalyzer();
  TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
  ts.reset();
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  Set<String> seen = new HashSet<>();
  while (ts.incrementToken()) {
    String t = termAtt.toString();
    if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
      fail("Shouldn't have found a numeric");
    }
    seen.add(termAtt.toString());
  }
  ts.end();
  ts.close();

  assertTrue(seen.contains("dirty"));
  assertFalse(seen.contains("the"));
}