List of usage examples for org.apache.lucene.analysis.TokenStream#reset()
public void reset() throws IOException
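All of the examples below follow the same consumption contract: reset() is called before the first incrementToken(), end() after the last accepted token, and the stream is closed afterwards. As a minimal sketch of that contract (the field name "body" is an illustrative placeholder and the analyzer is passed in, so this is not tied to any of the source files below; it also uses try-with-resources instead of the explicit IOUtils handling seen in several examples):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Canonical consumption loop: reset() before the first incrementToken(),
// end() after the last one, close() (via try-with-resources) when done.
static void printTokens(Analyzer analyzer, String text) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("body", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    ts.end();
  }
}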
From source file:org.apache.solr.analysis.TestBufferedTokenStream.java
License:Apache License
public void testReset() throws Exception {
  final String input = "How now A B brown A cow B like A B thing?";
  Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
  TokenStream ts = new AB_AAB_Stream(tokenizer);
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  assertTrue(ts.incrementToken());
  assertEquals("How", term.toString());
  assertTrue(ts.incrementToken());
  assertEquals("now", term.toString());
  assertTrue(ts.incrementToken());
  assertEquals("A", term.toString());
  // reset back to input,
  // if reset() does not work correctly then previous buffered tokens will remain
  tokenizer.reset(new StringReader(input));
  ts.reset();
  assertTrue(ts.incrementToken());
  assertEquals("How", term.toString());
}
From source file:org.apache.solr.handler.AnalysisRequestHandlerBase.java
License:Apache License
/**
 * Analyzes the given text using the given analyzer and returns the produced tokens.
 *
 * @param query    The query to analyze.
 * @param analyzer The analyzer to use.
 */
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
  TokenStream tokenStream = null;
  try {
    tokenStream = analyzer.tokenStream("", query);
    final Set<BytesRef> tokens = new HashSet<BytesRef>();
    final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
    final BytesRef bytes = bytesAtt.getBytesRef();

    tokenStream.reset();

    while (tokenStream.incrementToken()) {
      bytesAtt.fillBytesRef();
      tokens.add(BytesRef.deepCopyOf(bytes));
    }

    tokenStream.end();
    return tokens;
  } catch (IOException ioe) {
    throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
}
From source file:org.apache.solr.handler.AnalysisRequestHandlerBase.java
License:Apache License
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<AttributeSource>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end();
  } catch (IOException ioe) {
    throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
From source file:org.apache.solr.handler.ClassifyStream.java
License:Apache License
@Override
public Tuple read() throws IOException {
  if (modelTuple == null) {
    modelTuple = modelStream.read();
    if (modelTuple == null || modelTuple.EOF) {
      throw new IOException("Model tuple not found for classify stream!");
    }

    termToIndex = new HashMap<>();

    List<String> terms = modelTuple.getStrings("terms_ss");

    for (int i = 0; i < terms.size(); i++) {
      termToIndex.put(terms.get(i), i);
    }

    idfs = modelTuple.getDoubles("idfs_ds");
    modelWeights = modelTuple.getDoubles("weights_ds");
  }

  Tuple docTuple = docStream.read();
  if (docTuple.EOF) return docTuple;

  String text = docTuple.getString(field);

  double tfs[] = new double[termToIndex.size()];

  TokenStream tokenStream = analyzer.tokenStream(analyzerField, text);
  CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
  tokenStream.reset();

  int termCount = 0;
  while (tokenStream.incrementToken()) {
    termCount++;
    if (termToIndex.containsKey(termAtt.toString())) {
      tfs[termToIndex.get(termAtt.toString())]++;
    }
  }

  tokenStream.end();
  tokenStream.close();

  List<Double> tfidfs = new ArrayList<>(termToIndex.size());
  tfidfs.add(1.0);
  for (int i = 0; i < tfs.length; i++) {
    if (tfs[i] != 0) {
      tfs[i] = 1 + Math.log(tfs[i]);
    }
    tfidfs.add(this.idfs.get(i) * tfs[i]);
  }

  double total = 0.0;
  for (int i = 0; i < tfidfs.size(); i++) {
    total += tfidfs.get(i) * modelWeights.get(i);
  }

  double score = total * ((float) (1.0 / Math.sqrt(termCount)));
  double positiveProb = sigmoid(total);

  docTuple.put("probability_d", positiveProb);
  docTuple.put("score_d", score);

  return docTuple;
}
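The read() method above folds token counting into a larger scoring pipeline. Reduced to just the TokenStream part, the counting pattern looks roughly like the following sketch (the field name, analyzer parameter, and helper name are illustrative assumptions, not code from ClassifyStream):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Count raw term frequencies in a piece of text using the
// reset() / incrementToken() / end() / close() contract.
static Map<String, Integer> termFrequencies(Analyzer analyzer, String fieldName, String text) throws IOException {
  Map<String, Integer> freqs = new HashMap<>();
  TokenStream ts = analyzer.tokenStream(fieldName, text);
  try {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                               // must precede the first incrementToken()
    while (ts.incrementToken()) {
      freqs.merge(termAtt.toString(), 1, Integer::sum);
    }
    ts.end();
  } finally {
    ts.close();                               // release the analyzer's reusable components
  }
  return freqs;
}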
From source file:org.apache.solr.handler.component.QueryElevationComponent.java
License:Apache License
String getAnalyzedQuery(String query) throws IOException {
  if (analyzer == null) {
    return query;
  }
  StringBuilder norm = new StringBuilder();
  TokenStream tokens = analyzer.tokenStream("", query);
  try {
    tokens.reset();

    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
      norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    return norm.toString();
  } finally {
    IOUtils.closeWhileHandlingException(tokens);
  }
}
From source file:org.apache.solr.handler.component.SpellCheckComponent.java
License:Apache License
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
  Collection<Token> result = new ArrayList<Token>();
  assert analyzer != null;
  TokenStream ts = analyzer.tokenStream("", q);
  try {
    ts.reset();
    // TODO: support custom attributes
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      Token token = new Token();
      token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
      token.setType(typeAtt.type());
      token.setFlags(flagsAtt.getFlags());
      token.setPayload(payloadAtt.getPayload());
      token.setPositionIncrement(posIncAtt.getPositionIncrement());
      result.add(token);
    }
    ts.end();
    return result;
  } finally {
    IOUtils.closeWhileHandlingException(ts);
  }
}
From source file:org.apache.solr.handler.component.WordCloudComponent.java
License:Apache License
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrQueryRequest req = rb.req;
  SolrParams params = req.getParams();
  if (!params.getBool(COMPONENT_NAME, true)) {
    return;
  }

  String wcFields = null;
  if ((wcFields = params.get("wordcloud.fl", null)) == null) {
    return;
  }

  Set<String> flds = new HashSet<String>(StrUtils.splitSmart(wcFields, ','));
  DocList ids = rb.getResults().docList;

  SolrIndexSearcher searcher = rb.req.getSearcher();
  IndexSchema schema = rb.req.getCore().getLatestSchema();

  final Analyzer analyzer = rb.req.getCore().getLatestSchema().getAnalyzer();
  final HashMap<String, FieldType> fieldsToLoad = new HashMap<String, FieldType>();
  CharTermAttribute termAtt;
  Map<String, Map<String, Integer>> tokens = new HashMap<String, Map<String, Integer>>();

  for (String f : flds) {
    SchemaField field = schema.getFieldOrNull(f);
    if (field == null || !field.stored()) {
      continue; // ignore this field
    }
    fieldsToLoad.put(f, field.getType());
    tokens.put(f, new HashMap<String, Integer>());
  }

  DocIterator iterator = ids.iterator();
  String w;
  Integer v;
  int sz = ids.size();
  for (int i = 0; i < sz; i++) {
    int id = iterator.nextDoc();

    Document doc = searcher.doc(id, fieldsToLoad.keySet());
    for (Entry<String, FieldType> en : fieldsToLoad.entrySet()) {
      Map<String, Integer> toks = tokens.get(en.getKey());
      String[] vals = doc.getValues(en.getKey());
      FieldType fType = en.getValue();

      if (vals != null) {
        for (String s : vals) {
          TokenStream buffer = analyzer.tokenStream(en.getKey(),
              new StringReader(fType.indexedToReadable(s)));

          if (!buffer.hasAttribute(CharTermAttribute.class)) {
            continue; // empty stream
          }

          termAtt = buffer.getAttribute(CharTermAttribute.class);
          buffer.reset();

          while (buffer.incrementToken()) {
            w = termAtt.toString();
            v = toks.get(w);
            if (v == null)
              v = 0;
            toks.put(w, ++v);
          }

          buffer.close();
        }
      }
    }
  }

  // TODO: filter out the tokens (use some sort of a range 0.1-0.9 by frequency)

  AtomicReader reader = searcher.getAtomicReader();
  BytesRef term;
  int df;
  String f;
  Map<String, Map<String, Double>> docFreqs = new HashMap<String, Map<String, Double>>();
  for (Entry<String, Map<String, Integer>> field : tokens.entrySet()) {
    HashMap<String, Double> idfs = new HashMap<String, Double>();
    f = field.getKey();
    docFreqs.put(f, idfs);
    int N = reader.getDocCount(f);

    for (Entry<String, Integer> token : field.getValue().entrySet()) {
      w = token.getKey();
      df = reader.docFreq(new Term(f, new BytesRef(w)));
      if (df != 0) {
        idfs.put(w, Math.log10(N / df));
      }
    }
  }

  HashMap<String, Object> ret = new HashMap<String, Object>();
  for (String fi : fieldsToLoad.keySet()) {
    HashMap<String, Object> va = new HashMap<String, Object>();
    va.put("tf", tokens.get(fi));
    va.put("idf", docFreqs.get(fi));
    ret.put(fi, va);
  }

  rb.rsp.add("wordcloud", ret);
}
From source file:org.apache.solr.handler.DumpIndexField.java
License:Apache License
private void runSynchronously(RequestQueue queue, SolrQueryRequest req)
    throws MalformedURLException, IOException, InterruptedException {

  SolrCore core = req.getCore();
  IndexSchema schema = req.getSchema();

  RequestData data = queue.pop();

  if (!allowed.matcher(data.sourceField).matches()) {
    data.msg("Export of this field is not allowed: " + data.sourceField);
    queue.registerFailedBatch(data);
    return;
  }
  SchemaField field = core.getSchema().getFieldOrNull(data.sourceField);

  if (field == null || !field.stored()) {
    data.msg("We cannot dump fields that are not stored: " + data.sourceField);
    queue.registerFailedBatch(data);
    return;
  }

  final Analyzer analyzer = core.getSchema().getQueryAnalyzer();
  SchemaField targetField = core.getSchema().getFieldOrNull(data.targetField);

  if (targetField == null) {
    data.msg("We cannot find analyzer for: " + data.targetField);
    queue.registerFailedBatch(data);
    return;
  }

  final String targetAnalyzer = data.targetField;

  DirectoryReader ir = req.getSearcher().getIndexReader();
  SolrIndexSearcher se = req.getSearcher();
  final HashSet<String> fieldsToLoad = new HashSet<String>();
  fieldsToLoad.add(data.sourceField);

  se.search(new MatchAllDocsQuery(), new Collector() {
    private AtomicReader reader;
    private int i = 0;

    @Override
    public boolean acceptsDocsOutOfOrder() {
      return true;
    }

    @Override
    public void collect(int i) {
      Document d;
      try {
        d = reader.document(i, fieldsToLoad);
        for (String f : fieldsToLoad) {
          String[] vals = d.getValues(f);
          for (String s : vals) {
            TokenStream ts = analyzer.tokenStream(targetAnalyzer, new StringReader(s));
            ts.reset();
            while (ts.incrementToken()) {
              // pass
            }
          }
        }
      } catch (IOException e) {
        // pass
      }
    }

    @Override
    public void setNextReader(AtomicReaderContext context) {
      this.reader = context.reader();
    }

    @Override
    public void setScorer(org.apache.lucene.search.Scorer scorer) {
      // Do Nothing
    }
  });

  // persist the data
  TokenStream ts = analyzer.tokenStream(data.targetField, new StringReader("xxx"));
  ts.reset();
  ts.reset();
  ts.reset();
}
From source file:org.apache.solr.highlight.DefaultSolrHighlighter.java
License:Apache License
private void doHighlightingByHighlighter(Query query, SolrQueryRequest req, NamedList docSummaries, int docId,
    Document doc, String fieldName) throws IOException {
  final SolrIndexSearcher searcher = req.getSearcher();
  final IndexSchema schema = searcher.getSchema();

  // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
  // so we disable them until fixed (see LUCENE-3080)!
  // BEGIN: Hack
  final SchemaField schemaField = schema.getFieldOrNull(fieldName);
  if (schemaField != null && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField)
      || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField)))
    return;
  // END: Hack

  SolrParams params = req.getParams();

  // preserve order of values in a multiValued list
  boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);

  List<IndexableField> allFields = doc.getFields();
  if (allFields != null && allFields.size() == 0)
    return; // No explicit contract that getFields returns != null,
            // although currently it can't.

  TokenStream tstream = null;
  int numFragments = getMaxSnippets(fieldName, params);
  boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

  String[] summaries = null;
  List<TextFragment> frags = new ArrayList<TextFragment>();

  TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
  TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
  if (tvStream != null) {
    tots = new TermOffsetsTokenStream(tvStream);
  }
  int mvToExamine = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
      Integer.toString(Integer.MAX_VALUE)));
  int mvToMatch = Integer.parseInt(
      req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.toString(Integer.MAX_VALUE)));

  for (IndexableField thisField : allFields) {
    if (mvToExamine <= 0 || mvToMatch <= 0)
      break;

    if (!thisField.name().equals(fieldName))
      continue; // Is there a better way to do this?

    --mvToExamine;
    String thisText = thisField.stringValue();

    if (tots != null) {
      // if we're using TermOffsets optimization, then get the next
      // field value's TokenStream (i.e. get field j's TokenStream) from tots:
      tstream = tots.getMultiValuedTokenStream(thisText.length());
    } else {
      // fall back to analyzer
      tstream = createAnalyzerTStream(schema, fieldName, thisText);
    }

    int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS,
        Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

    Highlighter highlighter;
    if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
      if (maxCharsToAnalyze < 0) {
        tstream = new CachingTokenFilter(tstream);
      } else {
        tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
      }

      // get highlighter
      highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

      // after highlighter initialization, reset tstream since construction of highlighter already used it
      tstream.reset();
    } else {
      // use "the old way"
      highlighter = getHighlighter(query, fieldName, req);
    }

    if (maxCharsToAnalyze < 0) {
      highlighter.setMaxDocCharsToAnalyze(thisText.length());
    } else {
      highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
    }

    try {
      TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, thisText,
          mergeContiguousFragments, numFragments);
      for (int k = 0; k < bestTextFragments.length; k++) {
        if (preserveMulti) {
          if (bestTextFragments[k] != null) {
            frags.add(bestTextFragments[k]);
            --mvToMatch;
          }
        } else {
          if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
            frags.add(bestTextFragments[k]);
            --mvToMatch;
          }
        }
      }
    } catch (InvalidTokenOffsetsException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    }
  }
  // sort such that the fragments with the highest score come first
  if (!preserveMulti) {
    Collections.sort(frags, new Comparator<TextFragment>() {
      @Override
      public int compare(TextFragment arg0, TextFragment arg1) {
        return Math.round(arg1.getScore() - arg0.getScore());
      }
    });
  }

  // convert fragments back into text
  // TODO: we can include score and position information in output as snippet attributes
  if (frags.size() > 0) {
    ArrayList<String> fragTexts = new ArrayList<String>();
    for (TextFragment fragment : frags) {
      if (preserveMulti) {
        if (fragment != null) {
          fragTexts.add(fragment.toString());
        }
      } else {
        if ((fragment != null) && (fragment.getScore() > 0)) {
          fragTexts.add(fragment.toString());
        }
      }
      if (fragTexts.size() >= numFragments && !preserveMulti)
        break;
    }
    summaries = fragTexts.toArray(new String[0]);
    if (summaries.length > 0)
      docSummaries.add(fieldName, summaries);
  }
  // no summeries made, copy text from alternate field
  if (summaries == null || summaries.length == 0) {
    alternateField(docSummaries, params, doc, fieldName);
  }
}
From source file:org.apache.solr.highlight.DefaultSolrHighlighter.java
License:Apache License
private TokenStream createAnalyzerTStream(IndexSchema schema, String fieldName, String docText) throws IOException {
  TokenStream tstream;
  TokenStream ts = schema.getAnalyzer().tokenStream(fieldName, docText);
  ts.reset();
  tstream = new TokenOrderingFilter(ts, 10);
  return tstream;
}
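The example above wraps the analyzer's stream in a TokenFilter (TokenOrderingFilter is a Solr-internal helper). As a more general sketch of the same wrapping idea, built by hand and driven from the outermost stream, where reset() propagates down the chain: this assumes a Lucene 5.x/6.x-style API (the examples above are mostly 4.x-era, and package and constructor details vary between major Lucene versions), and the method name is illustrative.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Build a small chain by hand: WhitespaceTokenizer -> LowerCaseFilter.
// reset() is called once on the outermost stream and propagates to the tokenizer.
static void printLowercasedTokens(String text) throws IOException {
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(text));
  try (TokenStream chain = new LowerCaseFilter(tokenizer)) {
    CharTermAttribute termAtt = chain.addAttribute(CharTermAttribute.class);
    chain.reset();
    while (chain.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    chain.end();
  } // closing the outer filter also closes the wrapped tokenizer
}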