Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream.reset().

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using incrementToken().
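
Before the first call to incrementToken(), a consumer adds the attributes it wants to read, then calls reset(); after the token loop it calls end() and close(). Below is a minimal, self-contained sketch of that workflow (the field name, sample text, and WhitespaceAnalyzer are illustrative choices only; on older Lucene versions the analyzer constructor takes a Version argument):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        // try-with-resources closes the stream when done
        try (TokenStream ts = analyzer.tokenStream("body", "hello token stream")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // one token per call
                System.out.println(termAtt.toString());
            }
            ts.end(); // records end-of-stream state, e.g. the final offset
        }
    }
}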

Usage

From source file:org.apache.solr.highlight.HighlighterTest.java

License:Apache License

@Test
public void testTermOffsetsTokenStream() throws Exception {
    String[] multivalued = { "a b c d", "e f g", "h", "i j k l m n" };
    Analyzer a1 = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    TokenStream tokenStream = a1.tokenStream("", "a b c d e f g h i j k l m n");
    tokenStream.reset();

    TermOffsetsTokenStream tots = new TermOffsetsTokenStream(tokenStream);
    for (String v : multivalued) {
        TokenStream ts1 = tots.getMultiValuedTokenStream(v.length());
        Analyzer a2 = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
        TokenStream ts2 = a2.tokenStream("", v);
        ts2.reset();

        while (ts1.incrementToken()) {
            assertTrue(ts2.incrementToken());
            assertEquals(ts1, ts2);
        }
        assertFalse(ts2.incrementToken());
    }
}

From source file:org.apache.solr.highlight.ParsedContentSolrHighlighter.java

License:Apache License

/**
 * Generates a list of Highlighted query fragments for each item in a list
 * of documents, or returns null if highlighting is disabled.
 *
 * @param docs
 *            query results
 * @param query
 *            the query
 * @param req
 *            the current request
 * @param defaultFields
 *            default list of fields to summarize
 * @return NamedList containing a NamedList for each document, which in
 *         turn contains (field, summary) pairs.
 */
@SuppressWarnings("unchecked")
public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields)
        throws IOException {
    SolrParams params = req.getParams();
    if (!isHighlightingEnabled(params))
        return null;

    SolrIndexSearcher searcher = req.getSearcher();
    IndexSchema schema = searcher.getSchema();
    NamedList fragments = new SimpleOrderedMap();
    String[] fieldNames = getHighlightFields(query, req, defaultFields);
    Document[] readDocs = new Document[docs.size()];
    {
        // pre-fetch documents using the Searcher's doc cache
        Set<String> fset = new HashSet<String>();
        for (String f : fieldNames) {
            fset.add(f);
        }
        // fetch unique key if one exists.
        SchemaField keyField = schema.getUniqueKeyField();
        if (null != keyField)
            fset.add(keyField.getName());
        searcher.readDocs(readDocs, docs, fset);
    }

    // Highlight each document
    DocIterator iterator = docs.iterator();
    for (int i = 0; i < docs.size(); i++) {
        int docId = iterator.nextDoc();
        Document doc = readDocs[i];
        NamedList docSummaries = new SimpleOrderedMap();
        for (String fieldName : fieldNames) {
            fieldName = fieldName.trim();

            String[] docTexts = doc.getValues(fieldName);
            //Highlight only the parsed content, instead of all fields
            if (IndexField.DEFAULT_SEARCH_FIELD.equals(fieldName)) {
                docTexts = doc.getValues(IndexField.PARSED_CONTENT_FIELD);
            }

            // (commented-out, application-specific fallback logic for docTexts omitted)

            if (docTexts == null)
                continue;

            TokenStream tstream = null;
            int numFragments = getMaxSnippets(fieldName, params);
            boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

            String[] summaries = null;
            List<TextFragment> frags = new ArrayList<TextFragment>();
            for (int j = 0; j < docTexts.length; j++) {
                // create TokenStream
                try {
                    // attempt term vectors
                    tstream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId,
                            fieldName);
                } catch (IllegalArgumentException e) {
                    // fall back to the analyzer
                    tstream = new TokenOrderingFilter(
                            schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[j])), 10);
                }

                Highlighter highlighter;
                if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER))) {
                    // wrap CachingTokenFilter around TokenStream for reuse
                    tstream = new CachingTokenFilter(tstream);

                    // get highlighter
                    highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

                    // after highlighter initialization, reset tstream since construction of highlighter
                    // already used it
                    tstream.reset();
                } else {
                    // use "the old way"
                    highlighter = getHighlighter(query, fieldName, req);
                }

                int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS,
                        Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
                if (maxCharsToAnalyze < 0) {
                    highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
                } else {
                    highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
                }

                try {
                    TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, docTexts[j],
                            mergeContiguousFragments, numFragments);
                    for (int k = 0; k < bestTextFragments.length; k++) {
                        if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
                            frags.add(bestTextFragments[k]);
                        }
                    }
                } catch (InvalidTokenOffsetsException e) {
                    throw new RuntimeException(e);
                }
            }
            // sort such that the fragments with the highest score come first
            Collections.sort(frags, new Comparator<TextFragment>() {
                public int compare(TextFragment arg0, TextFragment arg1) {
                    // descending by score; Float.compare avoids the precision
                    // loss of rounding the difference to an int
                    return Float.compare(arg1.getScore(), arg0.getScore());
                }
            });

            // convert fragments back into text
            // TODO: we can include score and position information in output as snippet attributes
            if (frags.size() > 0) {
                ArrayList<String> fragTexts = new ArrayList<String>();
                for (TextFragment fragment : frags) {
                    if ((fragment != null) && (fragment.getScore() > 0)) {
                        //                            fragTexts.add(fragment.toString());
                        fragTexts.add(StringEscapeUtils.escapeHtml(fragment.toString()));
                    }
                    if (fragTexts.size() >= numFragments)
                        break;
                }
                summaries = fragTexts.toArray(new String[0]);
                if (summaries.length > 0)
                    docSummaries.add(fieldName, summaries);
            }
            // no summaries were made, copy text from the alternate field
            if (summaries == null || summaries.length == 0) {
                String alternateField = req.getParams().getFieldParam(fieldName,
                        HighlightParams.ALTERNATE_FIELD);
                if (alternateField != null && alternateField.length() > 0) {
                    String[] altTexts = doc.getValues(alternateField);
                    if (altTexts != null && altTexts.length > 0) {
                        int alternateFieldLen = req.getParams().getFieldInt(fieldName,
                                HighlightParams.ALTERNATE_FIELD_LENGTH, 0);
                        if (alternateFieldLen <= 0) {
                            docSummaries.add(fieldName, altTexts);
                        } else {
                            List<String> altList = new ArrayList<String>();
                            int len = 0;
                            for (String altText : altTexts) {
                                altList.add(len + altText.length() > alternateFieldLen
                                        ? altText.substring(0, alternateFieldLen - len)
                                        : altText);
                                len += altText.length();
                                if (len >= alternateFieldLen)
                                    break;
                            }
                            docSummaries.add(fieldName, altList);
                        }
                    }
                }
            }

        }
        String printId = schema.printableUniqueKey(doc);
        fragments.add(printId == null ? null : printId, docSummaries);
    }
    return fragments;
}

From source file:org.apache.solr.legacy.TestLegacyFieldReuse.java

License:Apache License

private void assertNumericContents(int value, TokenStream ts) throws IOException {
    assertTrue(ts instanceof LegacyNumericTokenStream);
    LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
    ts.reset();
    boolean seen = false;
    while (ts.incrementToken()) {
        if (numericAtt.getShift() == 0) {
            assertEquals(value, numericAtt.getRawValue());
            seen = true;
        }
    }
    ts.end();
    ts.close();
    assertTrue(seen);
}

From source file:org.apache.solr.schema.CollationField.java

License:Apache License

/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because JDK collators might not be thread safe (when they are,
 * it's just that all methods are synced), this keeps things
 * simple (we already have a thread-local clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}

From source file:org.apache.solr.schema.EntityTextField.java

License:Apache License

public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}

From source file:org.apache.solr.schema.ICUCollationField.java

License:Apache License

/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because ICU collators are not thread safe, this keeps things
 * simple (we already have a thread-local clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}

From source file:org.apache.solr.spelling.SimpleQueryConverter.java

License:Apache License

@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> result = new HashSet<Token>();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);

    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("", origQuery);
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

        ts.reset();

        while (ts.incrementToken()) {
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}

From source file:org.apache.solr.spelling.SpellingQueryConverter.java

License:Apache License

protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue)
        throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); //overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}

From source file:org.apache.solr.TestTrie.java

License:Apache License

@Test
public void testTokenizer() throws Exception {
    FieldType type = h.getCore().getLatestSchema().getFieldType("tint");
    assertTrue(type instanceof TrieField);

    String value = String.valueOf(random().nextInt());
    TokenStream ts = type.getAnalyzer().tokenStream("dummy", value);
    OffsetAttribute ofsAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
        count++;
        assertEquals(0, ofsAtt.startOffset());
        assertEquals(value.length(), ofsAtt.endOffset());
    }
    final int precStep = ((TrieField) type).getPrecisionStep();
    assertEquals((32 + precStep - 1) / precStep, count);
    ts.end();
    assertEquals(value.length(), ofsAtt.startOffset());
    assertEquals(value.length(), ofsAtt.endOffset());
    ts.close();

    // Test empty one:
    ts = type.getAnalyzer().tokenStream("dummy", "");
    ts.reset();
    assertFalse(ts.incrementToken());
    ts.end();
    assertEquals(0, ofsAtt.startOffset());
    assertEquals(0, ofsAtt.endOffset());
    ts.close();
}

From source file:org.apache.solr.update.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java

License:Apache License

@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    final FieldNameSelector srcSelector = getSourceSelector();
    return new UpdateRequestProcessor(next) {
        private final NLPNERTaggerOp nerTaggerOp;
        private Analyzer analyzer = null;
        {
            try {
                nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
                FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
                if (fieldType == null) {
                    throw new SolrException(SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType
                            + "' not found in the schema.");
                }
                analyzer = fieldType.getIndexAnalyzer();
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        @Override
        public void processAdd(AddUpdateCommand cmd) throws IOException {

            final SolrInputDocument doc = cmd.getSolrInputDocument();

            // Destination may be regex replace string, or "{EntityType}" replaced by
            // each entity's type, both of which can cause multiple output fields.
            Map<String, SolrInputField> destMap = new HashMap<>();

            // preserve initial values
            for (final String fname : doc.getFieldNames()) {
                if (!srcSelector.shouldMutate(fname))
                    continue;

                Collection<Object> srcFieldValues = doc.getFieldValues(fname);
                if (srcFieldValues == null || srcFieldValues.isEmpty())
                    continue;

                String resolvedDest = dest;

                if (pattern != null) {
                    Matcher matcher = pattern.matcher(fname);
                    if (matcher.find()) {
                        resolvedDest = matcher.replaceAll(dest);
                    } else {
                        log.debug("srcSelector.shouldMutate(\"{}\") returned true, "
                                + "but replacement pattern did not match, field skipped.", fname);
                        continue;
                    }
                }

                for (Object val : srcFieldValues) {
                    for (Pair<String, String> entity : extractTypedNamedEntities(val)) {
                        SolrInputField destField = null;
                        String entityName = entity.first();
                        String entityType = entity.second();
                        resolvedDest = resolvedDest.replace(ENTITY_TYPE, entityType);
                        if (doc.containsKey(resolvedDest)) {
                            destField = doc.getField(resolvedDest);
                        } else {
                            SolrInputField targetField = destMap.get(resolvedDest);
                            if (targetField == null) {
                                destField = new SolrInputField(resolvedDest);
                            } else {
                                destField = targetField;
                            }
                        }
                        destField.addValue(entityName);

                        // put it in map to avoid concurrent modification...
                        destMap.put(resolvedDest, destField);
                    }
                }
            }

            for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
                doc.put(entry.getKey(), entry.getValue());
            }
            super.processAdd(cmd);
        }

        /** Using configured NER model, extracts (name, type) pairs from the given source field value */
        private List<Pair<String, String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
            List<Pair<String, String>> entitiesWithType = new ArrayList<>();
            List<String> terms = new ArrayList<>();
            List<Integer> startOffsets = new ArrayList<>();
            List<Integer> endOffsets = new ArrayList<>();
            String fullText = srcFieldValue.toString();
            TokenStream tokenStream = analyzer.tokenStream("", fullText);
            CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
            FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
            tokenStream.reset();
            synchronized (nerTaggerOp) {
                while (tokenStream.incrementToken()) {
                    terms.add(termAtt.toString());
                    startOffsets.add(offsetAtt.startOffset());
                    endOffsets.add(offsetAtt.endOffset());
                    boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
                    if (endOfSentence) { // extract named entities one sentence at a time
                        extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets,
                                entitiesWithType);
                    }
                }
                tokenStream.end();
                tokenStream.close();
                if (!terms.isEmpty()) { // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
                    extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                }
                nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
            }
            return entitiesWithType;
        }

        private void extractEntitiesFromSentence(String fullText, List<String> terms,
                List<Integer> startOffsets, List<Integer> endOffsets,
                List<Pair<String, String>> entitiesWithType) {
            for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
                String text = fullText.substring(startOffsets.get(span.getStart()),
                        endOffsets.get(span.getEnd() - 1));
                entitiesWithType.add(new Pair<>(text, span.getType()));
            }
            terms.clear();
            startOffsets.clear();
            endOffsets.clear();
        }
    };
}