List of usage examples for org.apache.lucene.analysis.TokenStream.reset()
public void reset() throws IOException
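Before the per-project examples below, here is a minimal sketch of the consume contract that every example follows (this sketch is illustrative and not taken from any of the source files listed here; the analyzer, field name, and class name are placeholders): reset() must be called before the first incrementToken(), end() after the last token has been consumed, and close() to release resources.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetContractSketch {
    // Consumes every token the analyzer produces for the given text.
    static void consume(Analyzer analyzer, String field, String text) throws IOException {
        try (TokenStream ts = analyzer.tokenStream(field, text)) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                      // reset() before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                        // end() finalizes offsets/state after the last token
        }                                    // try-with-resources calls close()
    }
}

With, for example, a WhitespaceAnalyzer, consume(analyzer, "body", "a b c") would print the three whitespace-separated tokens.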
From source file:org.apache.solr.highlight.HighlighterTest.java
License:Apache License
@Test
public void testTermOffsetsTokenStream() throws Exception {
    String[] multivalued = { "a b c d", "e f g", "h", "i j k l m n" };
    Analyzer a1 = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    TokenStream tokenStream = a1.tokenStream("", "a b c d e f g h i j k l m n");
    tokenStream.reset();
    TermOffsetsTokenStream tots = new TermOffsetsTokenStream(tokenStream);
    for (String v : multivalued) {
        TokenStream ts1 = tots.getMultiValuedTokenStream(v.length());
        Analyzer a2 = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
        TokenStream ts2 = a2.tokenStream("", v);
        ts2.reset();
        while (ts1.incrementToken()) {
            assertTrue(ts2.incrementToken());
            assertEquals(ts1, ts2);
        }
        assertFalse(ts2.incrementToken());
    }
}
From source file:org.apache.solr.highlight.ParsedContentSolrHighlighter.java
License:Apache License
/**
 * Generates a list of highlighted query fragments for each item in a list
 * of documents, or returns null if highlighting is disabled.
 *
 * @param docs
 *          query results
 * @param query
 *          the query
 * @param req
 *          the current request
 * @param defaultFields
 *          default list of fields to summarize
 * @return NamedList containing a NamedList for each document, which in
 *         turn contains a set of (field, summary) pairs.
 */
@SuppressWarnings("unchecked")
public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req,
        String[] defaultFields) throws IOException {
    SolrParams params = req.getParams();
    if (!isHighlightingEnabled(params))
        return null;

    SolrIndexSearcher searcher = req.getSearcher();
    IndexSchema schema = searcher.getSchema();
    NamedList fragments = new SimpleOrderedMap();
    String[] fieldNames = getHighlightFields(query, req, defaultFields);
    Document[] readDocs = new Document[docs.size()];
    {
        // pre-fetch documents using the Searcher's doc cache
        Set<String> fset = new HashSet<String>();
        for (String f : fieldNames) {
            fset.add(f);
        }
        // fetch unique key if one exists.
        SchemaField keyField = schema.getUniqueKeyField();
        if (null != keyField)
            fset.add(keyField.getName());
        searcher.readDocs(readDocs, docs, fset);
    }

    // Highlight each document
    DocIterator iterator = docs.iterator();
    for (int i = 0; i < docs.size(); i++) {
        int docId = iterator.nextDoc();
        Document doc = readDocs[i];
        NamedList docSummaries = new SimpleOrderedMap();
        for (String fieldName : fieldNames) {
            fieldName = fieldName.trim();

            // begin
            String[] docTexts = doc.getValues(fieldName);
            // Highlight only the parsed content, instead of all fields
            if (IndexField.DEFAULT_SEARCH_FIELD.equals(fieldName)) {
                docTexts = doc.getValues(IndexField.PARSED_CONTENT_FIELD);
            }
            // IndexFieldServices indexFieldServices = ConstellioSpringUtils.getIndexFieldServices();
            // String collectionName = params.get(ConstellioSolrQueryParams.COLLECTION_NAME);
            // RecordCollectionServices collectionServices = ConstellioSpringUtils.getRecordCollectionServices();
            // RecordCollection collection = collectionServices.get(collectionName);
            // IndexField defaultSearchField = collection.getDefaultSearchIndexField();
            //
            // List<String> defaultSearchFieldDocTextsList = new ArrayList<String>();
            // for (CopyField copyField : defaultSearchField.getCopyFieldsDest()) {
            //     IndexField sourceIndexField = copyField.getIndexFieldSource();
            //     if (sourceIndexField != null) {
            //         String sourceIndexFieldName = sourceIndexField.getName();
            //         String[] copyFieldValues = doc.getValues(sourceIndexFieldName);
            //         if (copyFieldValues != null) {
            //             for (int k = 0; k < copyFieldValues.length; k++) {
            //                 String copyFieldValue = copyFieldValues[k];
            //                 if (!defaultSearchFieldDocTextsList.contains(copyFieldValue)) {
            //                     defaultSearchFieldDocTextsList.add(copyFieldValue);
            //                 }
            //             }
            //         }
            //     }
            // }
            // docTexts = defaultSearchFieldDocTextsList.toArray(new String[0]);
            //
            // if ((docTexts == null || docTexts.length == 0)) {
            //     RecordServices recordServices = ConstellioSpringUtils.getRecordServices();
            //     Long recordId = new Long(doc.getField(IndexField.RECORD_ID_FIELD).stringValue());
            //     Record record;
            //     try {
            //         record = recordServices.get(recordId, collection);
            //     } catch (Exception e) {
            //         record = null;
            //         e.printStackTrace();
            //     }
            //     if (record != null) {
            //         List<Object> fieldValues = indexFieldServices.extractFieldValues(record, defaultSearchField);
            //
            //         List<String> docTextsList = new ArrayList<String>();
            //         for (Object fieldValue : fieldValues) {
            //             String strFieldValue = fieldValue != null ? fieldValue.toString() : null;
            //             if (StringUtils.isNotBlank(strFieldValue)) {
            //                 docTextsList.add(strFieldValue);
            //             }
            //         }
            //
            //         if (!docTextsList.isEmpty()) {
            //             docTexts = docTextsList.toArray(new String[0]);
            //         }
            //     }
            // }
            // end

            if (docTexts == null)
                continue;

            TokenStream tstream = null;
            int numFragments = getMaxSnippets(fieldName, params);
            boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

            String[] summaries = null;
            List<TextFragment> frags = new ArrayList<TextFragment>();
            for (int j = 0; j < docTexts.length; j++) {
                // create TokenStream
                try {
                    // attempt term vectors
                    tstream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
                } catch (IllegalArgumentException e) {
                    // fall back to analyzer
                    tstream = new TokenOrderingFilter(
                            schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[j])), 10);
                }

                Highlighter highlighter;
                if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER))) {
                    // wrap CachingTokenFilter around TokenStream for reuse
                    tstream = new CachingTokenFilter(tstream);
                    // get highlighter
                    highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);
                    // after highlighter initialization, reset tstream since construction of the highlighter
                    // already used it
                    tstream.reset();
                } else {
                    // use "the old way"
                    highlighter = getHighlighter(query, fieldName, req);
                }

                int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS,
                        Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
                if (maxCharsToAnalyze < 0) {
                    highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
                } else {
                    highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
                }

                try {
                    TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, docTexts[j],
                            mergeContiguousFragments, numFragments);
                    for (int k = 0; k < bestTextFragments.length; k++) {
                        if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
                            frags.add(bestTextFragments[k]);
                        }
                    }
                } catch (InvalidTokenOffsetsException e) {
                    throw new RuntimeException(e);
                }
            }

            // sort such that the fragments with the highest score come first
            Collections.sort(frags, new Comparator<TextFragment>() {
                public int compare(TextFragment arg0, TextFragment arg1) {
                    return Math.round(arg1.getScore() - arg0.getScore());
                }
            });

            // convert fragments back into text
            // TODO: we can include score and position information in output as snippet attributes
            if (frags.size() > 0) {
                ArrayList<String> fragTexts = new ArrayList<String>();
                for (TextFragment fragment : frags) {
                    if ((fragment != null) && (fragment.getScore() > 0)) {
                        // fragTexts.add(fragment.toString());
                        fragTexts.add(StringEscapeUtils.escapeHtml(fragment.toString()));
                    }
                    if (fragTexts.size() >= numFragments)
                        break;
                }
                summaries = fragTexts.toArray(new String[0]);
                if (summaries.length > 0)
                    docSummaries.add(fieldName, summaries);
            }

            // no summaries made, copy text from alternate field
            if (summaries == null || summaries.length == 0) {
                String alternateField = req.getParams().getFieldParam(fieldName, HighlightParams.ALTERNATE_FIELD);
                if (alternateField != null && alternateField.length() > 0) {
                    String[] altTexts = doc.getValues(alternateField);
                    if (altTexts != null && altTexts.length > 0) {
                        int alternateFieldLen = req.getParams().getFieldInt(fieldName,
                                HighlightParams.ALTERNATE_FIELD_LENGTH, 0);
                        if (alternateFieldLen <= 0) {
                            docSummaries.add(fieldName, altTexts);
                        } else {
                            List<String> altList = new ArrayList<String>();
                            int len = 0;
                            for (String altText : altTexts) {
                                altList.add(len + altText.length() > alternateFieldLen
                                        ? altText.substring(0, alternateFieldLen - len)
                                        : altText);
                                len += altText.length();
                                if (len >= alternateFieldLen)
                                    break;
                            }
                            docSummaries.add(fieldName, altList);
                        }
                    }
                }
            }
        }
        String printId = schema.printableUniqueKey(doc);
        fragments.add(printId == null ? null : printId, docSummaries);
    }
    return fragments;
}
From source file:org.apache.solr.legacy.TestLegacyFieldReuse.java
License:Apache License
private void assertNumericContents(int value, TokenStream ts) throws IOException {
    assertTrue(ts instanceof LegacyNumericTokenStream);
    LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
    ts.reset();
    boolean seen = false;
    while (ts.incrementToken()) {
        if (numericAtt.getShift() == 0) {
            assertEquals(value, numericAtt.getRawValue());
            seen = true;
        }
    }
    ts.end();
    ts.close();
    assertTrue(seen);
}
From source file:org.apache.solr.schema.CollationField.java
License:Apache License
/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because JDK collators might not be thread safe (when they are,
 * it's just that all methods are synced), this keeps things
 * simple (we already have a threadlocal clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}
From source file:org.apache.solr.schema.EntityTextField.java
License:Apache License
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file:org.apache.solr.schema.ICUCollationField.java
License:Apache License
/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because ICU collators are not thread safe, this keeps things
 * simple (we already have a threadlocal clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}
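The CollationField, EntityTextField, and ICUCollationField examples above share the same "analyze exactly one term" pattern around reset(). Below is a condensed sketch of that pattern against the same Lucene 4.x attribute API those examples use; the class name, method name, and error messages are illustrative and not taken from those files.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;

public class SingleTermAnalysisSketch {
    // Analyzes "part" and expects the analyzer to emit exactly one term.
    public static BytesRef analyzeSingleTerm(Analyzer analyzer, String field, String part) {
        try (TokenStream source = analyzer.tokenStream(field, part)) {
            TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
            BytesRef bytes = termAtt.getBytesRef();
            source.reset();                     // reset() before the first incrementToken()
            if (!source.incrementToken()) {
                throw new IllegalArgumentException("analyzer returned no terms for: " + part);
            }
            termAtt.fillBytesRef();             // Lucene 4.x API, as in the examples above
            if (source.incrementToken()) {
                throw new IllegalArgumentException("analyzer returned more than one term for: " + part);
            }
            source.end();
            return BytesRef.deepCopyOf(bytes);  // deep copy: the attribute's BytesRef is reused by the stream
        } catch (IOException e) {
            throw new RuntimeException("Unable to analyze: " + part, e);
        }
    }
}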
From source file:org.apache.solr.spelling.SimpleQueryConverter.java
License:Apache License
@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> result = new HashSet<Token>();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("", origQuery);
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

        ts.reset();

        while (ts.incrementToken()) {
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.apache.solr.spelling.SpellingQueryConverter.java
License:Apache License
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); // overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}
From source file:org.apache.solr.TestTrie.java
License:Apache License
@Test
public void testTokenizer() throws Exception {
    FieldType type = h.getCore().getLatestSchema().getFieldType("tint");
    assertTrue(type instanceof TrieField);

    String value = String.valueOf(random().nextInt());
    TokenStream ts = type.getAnalyzer().tokenStream("dummy", value);
    OffsetAttribute ofsAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
        count++;
        assertEquals(0, ofsAtt.startOffset());
        assertEquals(value.length(), ofsAtt.endOffset());
    }
    final int precStep = ((TrieField) type).getPrecisionStep();
    assertEquals((32 + precStep - 1) / precStep, count);
    ts.end();
    assertEquals(value.length(), ofsAtt.startOffset());
    assertEquals(value.length(), ofsAtt.endOffset());
    ts.close();

    // Test empty one:
    ts = type.getAnalyzer().tokenStream("dummy", "");
    ts.reset();
    assertFalse(ts.incrementToken());
    ts.end();
    assertEquals(0, ofsAtt.startOffset());
    assertEquals(0, ofsAtt.endOffset());
    ts.close();
}
From source file:org.apache.solr.update.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
License:Apache License
@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    final FieldNameSelector srcSelector = getSourceSelector();
    return new UpdateRequestProcessor(next) {
        private final NLPNERTaggerOp nerTaggerOp;
        private Analyzer analyzer = null;
        {
            try {
                nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
                FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
                if (fieldType == null) {
                    throw new SolrException(SERVER_ERROR,
                            ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType + "' not found in the schema.");
                }
                analyzer = fieldType.getIndexAnalyzer();
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        @Override
        public void processAdd(AddUpdateCommand cmd) throws IOException {
            final SolrInputDocument doc = cmd.getSolrInputDocument();

            // Destination may be regex replace string, or "{EntityType}" replaced by
            // each entity's type, both of which can cause multiple output fields.
            Map<String, SolrInputField> destMap = new HashMap<>();

            // preserve initial values
            for (final String fname : doc.getFieldNames()) {
                if (!srcSelector.shouldMutate(fname))
                    continue;

                Collection<Object> srcFieldValues = doc.getFieldValues(fname);
                if (srcFieldValues == null || srcFieldValues.isEmpty())
                    continue;

                String resolvedDest = dest;

                if (pattern != null) {
                    Matcher matcher = pattern.matcher(fname);
                    if (matcher.find()) {
                        resolvedDest = matcher.replaceAll(dest);
                    } else {
                        log.debug("srcSelector.shouldMutate(\"{}\") returned true, "
                                + "but replacement pattern did not match, field skipped.", fname);
                        continue;
                    }
                }

                for (Object val : srcFieldValues) {
                    for (Pair<String, String> entity : extractTypedNamedEntities(val)) {
                        SolrInputField destField = null;
                        String entityName = entity.first();
                        String entityType = entity.second();
                        resolvedDest = resolvedDest.replace(ENTITY_TYPE, entityType);
                        if (doc.containsKey(resolvedDest)) {
                            destField = doc.getField(resolvedDest);
                        } else {
                            SolrInputField targetField = destMap.get(resolvedDest);
                            if (targetField == null) {
                                destField = new SolrInputField(resolvedDest);
                            } else {
                                destField = targetField;
                            }
                        }
                        destField.addValue(entityName);
                        // put it in map to avoid concurrent modification...
                        destMap.put(resolvedDest, destField);
                    }
                }
            }

            for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
                doc.put(entry.getKey(), entry.getValue());
            }
            super.processAdd(cmd);
        }

        /** Using configured NER model, extracts (name, type) pairs from the given source field value */
        private List<Pair<String, String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
            List<Pair<String, String>> entitiesWithType = new ArrayList<>();
            List<String> terms = new ArrayList<>();
            List<Integer> startOffsets = new ArrayList<>();
            List<Integer> endOffsets = new ArrayList<>();
            String fullText = srcFieldValue.toString();
            TokenStream tokenStream = analyzer.tokenStream("", fullText);
            CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
            FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
            tokenStream.reset();
            synchronized (nerTaggerOp) {
                while (tokenStream.incrementToken()) {
                    terms.add(termAtt.toString());
                    startOffsets.add(offsetAtt.startOffset());
                    endOffsets.add(offsetAtt.endOffset());
                    boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
                    if (endOfSentence) { // extract named entities one sentence at a time
                        extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                    }
                }
                tokenStream.end();
                tokenStream.close();
                if (!terms.isEmpty()) {
                    // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
                    extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                }
                nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
            }
            return entitiesWithType;
        }

        private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
                List<Integer> endOffsets, List<Pair<String, String>> entitiesWithType) {
            for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
                String text = fullText.substring(startOffsets.get(span.getStart()),
                        endOffsets.get(span.getEnd() - 1));
                entitiesWithType.add(new Pair<>(text, span.getType()));
            }
            terms.clear();
            startOffsets.clear();
            endOffsets.clear();
        }
    };
}