Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream.incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token. It returns true if a new token was produced and false once the end of the stream has been reached.
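
Before the project examples below, here is a minimal, self-contained sketch of the consumer workflow (reset, incrementToken loop, end, close). It assumes a Lucene release where StandardAnalyzer has a no-argument constructor and uses an illustrative field name "body"; it is not taken from any of the source files listed under Usage.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {

    // Collects the terms the analyzer produces for the given text.
    public static List<String> tokenize(Analyzer analyzer, String text) throws IOException {
        List<String> terms = new ArrayList<>();
        // "body" is an illustrative field name only
        try (TokenStream stream = analyzer.tokenStream("body", text)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                    // required before the first incrementToken() call
            while (stream.incrementToken()) {  // false means the stream is exhausted
                terms.add(termAtt.toString());
            }
            stream.end();                      // records end-of-stream state such as final offsets
        }
        return terms;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(tokenize(new StandardAnalyzer(), "Hello incrementToken world"));
    }
}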

Usage

From source file: org.apache.solr.analysis.TestTrimFilter.java

License: Apache License

public void testTrim() throws Exception {
    char[] a = " a ".toCharArray();
    char[] b = "b   ".toCharArray();
    char[] ccc = "cCc".toCharArray();
    char[] whitespace = "   ".toCharArray();
    char[] empty = "".toCharArray();
    TokenStream ts = new TrimFilter(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
            new Token(b, 0, b.length, 6, 10), new Token(ccc, 0, ccc.length, 11, 15),
            new Token(whitespace, 0, whitespace.length, 16, 20), new Token(empty, 0, empty.length, 21, 21)),
            false);

    TermAttribute token;
    assertTrue(ts.incrementToken());
    token = (TermAttribute) ts.getAttribute(TermAttribute.class);
    assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
    assertTrue(ts.incrementToken());
    assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
    assertTrue(ts.incrementToken());
    assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
    assertTrue(ts.incrementToken());
    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
    assertTrue(ts.incrementToken());
    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
    assertFalse(ts.incrementToken());

    a = " a".toCharArray();
    b = "b ".toCharArray();
    ccc = " c ".toCharArray();
    whitespace = "   ".toCharArray();
    ts = new TrimFilter(
            new IterTokenStream(new Token(a, 0, a.length, 0, 2), new Token(b, 0, b.length, 0, 2),
                    new Token(ccc, 0, ccc.length, 0, 3), new Token(whitespace, 0, whitespace.length, 0, 3)),
            true);

    List<Token> expect = tokens("a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3");
    List<Token> real = getTokens(ts);
    for (Token t : expect) {
        System.out.println("TEST:" + t);
    }
    for (Token t : real) {
        System.out.println("REAL:" + t);
    }
    assertTokEqualOff(expect, real);
}

From source file: org.apache.solr.analysis.TestWordDelimiterFilter.java

License: Apache License

private void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[],
        int posIncs[]) throws Exception {

    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
            .getAttribute(PositionIncrementAttribute.class);
    for (int i = 0; i < output.length; i++) {
        assertTrue(ts.incrementToken());
        assertEquals(output[i], termAtt.term());
        assertEquals(startOffsets[i], offsetAtt.startOffset());
        assertEquals(endOffsets[i], offsetAtt.endOffset());
        assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
    }
    assertFalse(ts.incrementToken());
    ts.close();
}

From source file: org.apache.solr.analysis.ThrowingMockTokenFilterFactory.java

License: Apache License

@Override
public TokenStream create(TokenStream input) {
    return new TokenFilter(input) {
        @Override
        public boolean incrementToken() throws IOException {
            if (input.incrementToken()) {
                try {
                    throw exceptionClass.newInstance();
                } catch (IllegalAccessException iae) {
                    throw new RuntimeException(iae);
                } catch (InstantiationException ie) {
                    throw new RuntimeException(ie);
                }
            }
            return false;
        }
    };
}

From source file: org.apache.solr.handler.AnalysisRequestHandler.java

License: Apache License

static NamedList<NamedList<Object>> getTokens(TokenStream tstream) throws IOException {
    // outer is namedList since order of tokens is important
    NamedList<NamedList<Object>> tokens = new NamedList<NamedList<Object>>();

    while (tstream.incrementToken()) {
        final NamedList<Object> token = new SimpleOrderedMap<Object>();
        tokens.add("token", token);
        tstream.reflectWith(new AttributeReflector() {
            public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
                String k = attClass.getName() + '#' + key;
                // map keys for "standard attributes":
                if (ATTRIBUTE_MAPPING.containsKey(k)) {
                    k = ATTRIBUTE_MAPPING.get(k);
                }
                token.add(k, value);
            }
        });
    }
    return tokens;
}

From source file: org.apache.solr.handler.AnalysisRequestHandlerBase.java

License: Apache License

/**
 * Analyzes the given text using the given analyzer and returns the produced tokens.
 *
 * @param query    The query to analyze.
 * @param analyzer The analyzer to use.
 */
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream("", query);
        final Set<BytesRef> tokens = new HashSet<BytesRef>();
        final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
        final BytesRef bytes = bytesAtt.getBytesRef();

        tokenStream.reset();

        while (tokenStream.incrementToken()) {
            bytesAtt.fillBytesRef();
            tokens.add(BytesRef.deepCopyOf(bytes));
        }

        tokenStream.end();
        return tokens;
    } catch (IOException ioe) {
        throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    } finally {
        IOUtils.closeWhileHandlingException(tokenStream);
    }
}

From source file: org.apache.solr.handler.AnalysisRequestHandlerBase.java

License: Apache License

/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
    final List<AttributeSource> tokens = new ArrayList<AttributeSource>();
    final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
    // for backwards compatibility, add all "common" attributes
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(TypeAttribute.class);
    try {
        tokenStream.reset();
        int position = 0;
        while (tokenStream.incrementToken()) {
            position += posIncrAtt.getPositionIncrement();
            trackerAtt.setActPosition(position);
            tokens.add(tokenStream.cloneAttributes());
        }
        tokenStream.end();
    } catch (IOException ioe) {
        throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    } finally {
        IOUtils.closeWhileHandlingException(tokenStream);
    }

    return tokens;
}

From source file: org.apache.solr.handler.ClassifyStream.java

License: Apache License

@Override
public Tuple read() throws IOException {
    if (modelTuple == null) {

        modelTuple = modelStream.read();
        if (modelTuple == null || modelTuple.EOF) {
            throw new IOException("Model tuple not found for classify stream!");
        }

        termToIndex = new HashMap<>();

        List<String> terms = modelTuple.getStrings("terms_ss");

        for (int i = 0; i < terms.size(); i++) {
            termToIndex.put(terms.get(i), i);
        }

        idfs = modelTuple.getDoubles("idfs_ds");
        modelWeights = modelTuple.getDoubles("weights_ds");
    }

    Tuple docTuple = docStream.read();
    if (docTuple.EOF)
        return docTuple;

    String text = docTuple.getString(field);

    double tfs[] = new double[termToIndex.size()];

    TokenStream tokenStream = analyzer.tokenStream(analyzerField, text);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();

    int termCount = 0;
    while (tokenStream.incrementToken()) {
        termCount++;
        if (termToIndex.containsKey(termAtt.toString())) {
            tfs[termToIndex.get(termAtt.toString())]++;
        }
    }

    tokenStream.end();
    tokenStream.close();

    List<Double> tfidfs = new ArrayList<>(termToIndex.size());
    tfidfs.add(1.0);
    for (int i = 0; i < tfs.length; i++) {
        if (tfs[i] != 0) {
            tfs[i] = 1 + Math.log(tfs[i]);
        }
        tfidfs.add(this.idfs.get(i) * tfs[i]);
    }

    double total = 0.0;
    for (int i = 0; i < tfidfs.size(); i++) {
        total += tfidfs.get(i) * modelWeights.get(i);
    }

    double score = total * ((float) (1.0 / Math.sqrt(termCount)));
    double positiveProb = sigmoid(total);

    docTuple.put("probability_d", positiveProb);
    docTuple.put("score_d", score);

    return docTuple;
}

From source file: org.apache.solr.handler.component.QueryElevationComponent.java

License: Apache License

String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", query);
    try {
        tokens.reset();

        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        while (tokens.incrementToken()) {
            norm.append(termAtt.buffer(), 0, termAtt.length());
        }
        tokens.end();
        return norm.toString();
    } finally {
        IOUtils.closeWhileHandlingException(tokens);
    }
}

From source file: org.apache.solr.handler.component.SpellCheckComponent.java

License: Apache License

private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
    Collection<Token> result = new ArrayList<Token>();
    assert analyzer != null;
    TokenStream ts = analyzer.tokenStream("", q);
    try {
        ts.reset();
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

        while (ts.incrementToken()) {
            Token token = new Token();
            token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            token.setType(typeAtt.type());
            token.setFlags(flagsAtt.getFlags());
            token.setPayload(payloadAtt.getPayload());
            token.setPositionIncrement(posIncAtt.getPositionIncrement());
            result.add(token);
        }
        ts.end();
        return result;
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}

From source file: org.apache.solr.handler.component.WordCloudComponent.java

License: Apache License

@Override
public void process(ResponseBuilder rb) throws IOException {
    SolrQueryRequest req = rb.req;
    SolrParams params = req.getParams();
    if (!params.getBool(COMPONENT_NAME, true)) {
        return;
    }

    String wcFields = null;
    if ((wcFields = params.get("wordcloud.fl", null)) == null) {
        return;
    }

    Set<String> flds = new HashSet<String>(StrUtils.splitSmart(wcFields, ','));
    DocList ids = rb.getResults().docList;

    SolrIndexSearcher searcher = rb.req.getSearcher();
    IndexSchema schema = rb.req.getCore().getLatestSchema();

    final Analyzer analyzer = rb.req.getCore().getLatestSchema().getAnalyzer();
    final HashMap<String, FieldType> fieldsToLoad = new HashMap<String, FieldType>();

    CharTermAttribute termAtt;
    Map<String, Map<String, Integer>> tokens = new HashMap<String, Map<String, Integer>>();

    for (String f : flds) {
        SchemaField field = schema.getFieldOrNull(f);
        if (field == null || !field.stored()) {
            continue; // ignore this field
        }
        fieldsToLoad.put(f, field.getType());
        tokens.put(f, new HashMap<String, Integer>());
    }

    DocIterator iterator = ids.iterator();
    String w;
    Integer v;
    int sz = ids.size();
    for (int i = 0; i < sz; i++) {
        int id = iterator.nextDoc();
        Document doc = searcher.doc(id, fieldsToLoad.keySet());
        for (Entry<String, FieldType> en : fieldsToLoad.entrySet()) {
            Map<String, Integer> toks = tokens.get(en.getKey());
            String[] vals = doc.getValues(en.getKey());
            FieldType fType = en.getValue();

            if (vals != null) {
                for (String s : vals) {
                    TokenStream buffer = analyzer.tokenStream(en.getKey(),
                            new StringReader(fType.indexedToReadable(s)));

                    if (!buffer.hasAttribute(CharTermAttribute.class)) {
                        continue; // empty stream
                    }

                    termAtt = buffer.getAttribute(CharTermAttribute.class);
                    buffer.reset();

                    while (buffer.incrementToken()) {
                        w = termAtt.toString();
                        v = toks.get(w);
                        if (v == null)
                            v = 0;
                        toks.put(w, ++v);
                    }

                    buffer.close();
                }
            }
        }
    }

    // TODO: filter out the tokens (use some sort of a range 0.1-0.9 by frequency)

    AtomicReader reader = searcher.getAtomicReader();
    BytesRef term;
    int df;
    String f;

    Map<String, Map<String, Double>> docFreqs = new HashMap<String, Map<String, Double>>();
    for (Entry<String, Map<String, Integer>> field : tokens.entrySet()) {
        HashMap<String, Double> idfs = new HashMap<String, Double>();
        f = field.getKey();
        docFreqs.put(f, idfs);
        int N = reader.getDocCount(f);

        for (Entry<String, Integer> token : field.getValue().entrySet()) {
            w = token.getKey();
            df = reader.docFreq(new Term(f, new BytesRef(w)));
            if (df != 0) {
                idfs.put(w, Math.log10(N / df));
            }
        }
    }

    HashMap<String, Object> ret = new HashMap<String, Object>();
    for (String fi : fieldsToLoad.keySet()) {
        HashMap<String, Object> va = new HashMap<String, Object>();
        va.put("tf", tokens.get(fi));
        va.put("idf", docFreqs.get(fi));
        ret.put(fi, va);
    }
    rb.rsp.add("wordcloud", ret);

}