Example usage for org.apache.lucene.analysis TokenStream end

Introduction

On this page you can find example usage of org.apache.lucene.analysis.TokenStream#end(), drawn from the Apache Solr code base.

Prototype

public void end() throws IOException 
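
The examples on this page are all consumers of end(); on the producer side, TokenStream and TokenFilter subclasses may override it. An override must call super.end() first so that end-of-stream state set by the wrapped stream (such as the final offset) is preserved. Below is a minimal sketch of such an override, not taken from this page's sources; the class name CountingFilter and the skippedPositions field are hypothetical, but the pattern mirrors what filters like StopFilter do:

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public final class CountingFilter extends TokenFilter {
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private int skippedPositions; // hypothetical state this filter would accumulate

    public CountingFilter(TokenStream in) {
        super(in);
    }

    @Override
    public boolean incrementToken() throws IOException {
        // this sketch passes tokens through unchanged
        return input.incrementToken();
    }

    @Override
    public void end() throws IOException {
        super.end(); // first restore end-of-stream state from the wrapped stream
        // then adjust end-of-stream attributes, e.g. account for skipped positions
        posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + skippedPositions);
    }
}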

Document

This method is called by the consumer after the last token has been consumed, that is, after incrementToken() returned false (using the new TokenStream API). Streams implement it to perform end-of-stream operations, such as setting the final offset of the stream.
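
Every consumer follows the same lifecycle: reset(), incrementToken() in a loop until it returns false, end(), then close(). Here is a minimal self-contained sketch of that pattern, assuming a Lucene version where StandardAnalyzer has a no-argument constructor and TokenStream implements Closeable; the class name TokenStreamEndSketch, the field name "body", and the input text are placeholders:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamEndSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // trailing whitespace shows how the final offset can differ
        // from the end offset of the last token
        try (TokenStream ts = analyzer.tokenStream("body", "hello end of stream  ")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            ts.reset();                      // mandatory before the first incrementToken()
            while (ts.incrementToken()) {    // false once the last token has been consumed
                System.out.println(termAtt.toString());
            }
            ts.end();                        // end-of-stream operations, e.g. set the final offset
            System.out.println("final offset: " + offsetAtt.endOffset());
        }                                    // try-with-resources calls close() after end()
    }
}

Each Solr example below is an instance of this pattern; note that end() is always invoked after the consumption loop and before close().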

Usage

From source file: org.apache.solr.handler.ClassifyStream.java

License: Apache License

@Override
public Tuple read() throws IOException {
    if (modelTuple == null) {

        modelTuple = modelStream.read();
        if (modelTuple == null || modelTuple.EOF) {
            throw new IOException("Model tuple not found for classify stream!");
        }

        termToIndex = new HashMap<>();

        List<String> terms = modelTuple.getStrings("terms_ss");

        for (int i = 0; i < terms.size(); i++) {
            termToIndex.put(terms.get(i), i);
        }

        idfs = modelTuple.getDoubles("idfs_ds");
        modelWeights = modelTuple.getDoubles("weights_ds");
    }

    Tuple docTuple = docStream.read();
    if (docTuple.EOF)
        return docTuple;

    String text = docTuple.getString(field);

    double[] tfs = new double[termToIndex.size()];

    TokenStream tokenStream = analyzer.tokenStream(analyzerField, text);
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();

    int termCount = 0;
    while (tokenStream.incrementToken()) {
        termCount++;
        if (termToIndex.containsKey(termAtt.toString())) {
            tfs[termToIndex.get(termAtt.toString())]++;
        }
    }

    tokenStream.end(); // perform end-of-stream operations (e.g. set the final offset)
    tokenStream.close();

    List<Double> tfidfs = new ArrayList<>(termToIndex.size());
    tfidfs.add(1.0);
    for (int i = 0; i < tfs.length; i++) {
        if (tfs[i] != 0) {
            tfs[i] = 1 + Math.log(tfs[i]);
        }
        tfidfs.add(this.idfs.get(i) * tfs[i]);
    }

    double total = 0.0;
    for (int i = 0; i < tfidfs.size(); i++) {
        total += tfidfs.get(i) * modelWeights.get(i);
    }

    double score = total * ((float) (1.0 / Math.sqrt(termCount)));
    double positiveProb = sigmoid(total);

    docTuple.put("probability_d", positiveProb);
    docTuple.put("score_d", score);

    return docTuple;
}

From source file: org.apache.solr.handler.component.QueryElevationComponent.java

License: Apache License

String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", query);
    try {
        tokens.reset();

        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        while (tokens.incrementToken()) {
            norm.append(termAtt.buffer(), 0, termAtt.length());
        }
        tokens.end();
        return norm.toString();
    } finally {
        IOUtils.closeWhileHandlingException(tokens);
    }
}

From source file: org.apache.solr.handler.component.SpellCheckComponent.java

License: Apache License

private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
    Collection<Token> result = new ArrayList<Token>();
    assert analyzer != null;
    TokenStream ts = analyzer.tokenStream("", q);
    try {
        ts.reset();
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

        while (ts.incrementToken()) {
            Token token = new Token();
            token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            token.setType(typeAtt.type());
            token.setFlags(flagsAtt.getFlags());
            token.setPayload(payloadAtt.getPayload());
            token.setPositionIncrement(posIncAtt.getPositionIncrement());
            result.add(token);
        }
        ts.end();
        return result;
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}

From source file: org.apache.solr.legacy.TestLegacyFieldReuse.java

License: Apache License

private void assertNumericContents(int value, TokenStream ts) throws IOException {
    assertTrue(ts instanceof LegacyNumericTokenStream);
    LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
    ts.reset();
    boolean seen = false;
    while (ts.incrementToken()) {
        if (numericAtt.getShift() == 0) {
            assertEquals(value, numericAtt.getRawValue());
            seen = true;
        }
    }
    ts.end();
    ts.close();
    assertTrue(seen);
}

From source file: org.apache.solr.schema.CollationField.java

License: Apache License

/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because JDK collators might not be thread safe (when they are,
 * it's just that all methods are synchronized), this keeps things
 * simple (we already have a thread-local clone in the reused TokenStream).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}

From source file: org.apache.solr.schema.EntityTextField.java

License: Apache License

public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}

From source file: org.apache.solr.schema.ICUCollationField.java

License: Apache License

/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because ICU collators are not thread safe, this keeps things
 * simple (we already have a thread-local clone in the reused TokenStream).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}

From source file: org.apache.solr.spelling.SimpleQueryConverter.java

License: Apache License

@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> result = new HashSet<Token>();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);

    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("", origQuery);
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

        ts.reset();

        while (ts.incrementToken()) {
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}

From source file: org.apache.solr.spelling.SpellingQueryConverter.java

License: Apache License

protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue)
        throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); //overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}

From source file: org.apache.solr.TestTrie.java

License: Apache License

@Test
public void testTokenizer() throws Exception {
    FieldType type = h.getCore().getLatestSchema().getFieldType("tint");
    assertTrue(type instanceof TrieField);

    String value = String.valueOf(random().nextInt());
    TokenStream ts = type.getAnalyzer().tokenStream("dummy", value);
    OffsetAttribute ofsAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
        count++;
        assertEquals(0, ofsAtt.startOffset());
        assertEquals(value.length(), ofsAtt.endOffset());
    }
    final int precStep = ((TrieField) type).getPrecisionStep();
    assertEquals((32 + precStep - 1) / precStep, count);
    ts.end();
    assertEquals(value.length(), ofsAtt.startOffset());
    assertEquals(value.length(), ofsAtt.endOffset());
    ts.close();

    // Test empty one:
    ts = type.getAnalyzer().tokenStream("dummy", "");
    ts.reset();
    assertFalse(ts.incrementToken());
    ts.end();
    assertEquals(0, ofsAtt.startOffset());
    assertEquals(0, ofsAtt.endOffset());
    ts.close();
}