List of usage examples for org.apache.lucene.analysis.TokenStream#getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. The method returns the instance of the requested attribute already contained in the stream's AttributeSource and throws an IllegalArgumentException if that attribute is not present; use addAttribute(Class) when the attribute should be created on demand.
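Before the project-specific examples, here is a minimal, self-contained consumer sketch. It assumes Lucene 5+ (where StandardAnalyzer has a no-argument constructor); the class name GetAttributeExample, the field name "body", and the input string are illustrative and not taken from the examples below.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        // TokenStream is Closeable, so try-with-resources releases it even on failure.
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello token streams"))) {
            // getAttribute throws IllegalArgumentException if the stream lacks the attribute,
            // so either check hasAttribute first or fall back to addAttribute to create it.
            CharTermAttribute termAtt = ts.hasAttribute(CharTermAttribute.class)
                    ? ts.getAttribute(CharTermAttribute.class)
                    : ts.addAttribute(CharTermAttribute.class);
            ts.reset();                          // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                            // records the final offset state
        }
    }
}

Checking hasAttribute (or using addAttribute) is the defensive pattern the Lucene Javadoc recommends for consumers, since getAttribute assumes the attribute already exists on the stream.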
From source file:org.apache.mahout.classifier.sgd.NewsgroupHelper.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
    Multiset<String> overallCounts) throws IOException {
  TokenStream ts = analyzer.reusableTokenStream("text", in);
  ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    String s = ts.getAttribute(CharTermAttribute.class).toString();
    words.add(s);
  }
  overallCounts.addAll(words);
}
From source file:org.apache.mahout.classifier.sgd.TrainNewsGroups.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
  TokenStream ts = analyzer.tokenStream("text", in);
  ts.addAttribute(CharTermAttribute.class);
  // Note: the stream is not reset() here; Lucene 4+ requires reset() before the first incrementToken().
  while (ts.incrementToken()) {
    String s = ts.getAttribute(CharTermAttribute.class).toString();
    words.add(s);
  }
  // overallCounts is a static Multiset<String> field of the enclosing TrainNewsGroups class.
  overallCounts.addAll(words);
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
private static void validateTokens(String[] expected, TokenStream ts) throws IOException {
  int pos = 0;
  while (ts.incrementToken()) {
    assertTrue("Analyzer produced too many tokens", pos <= expected.length);
    CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
    assertEquals("Unexpected term", expected[pos++], termAttr.toString());
  }
  assertEquals("Analyzer produced too few terms", expected.length, pos);
}
From source file:org.apache.solr.analysis.TestTrimFilter.java
License:Apache License
public void testTrim() throws Exception {
  char[] a = " a ".toCharArray();
  char[] b = "b ".toCharArray();
  char[] ccc = "cCc".toCharArray();
  char[] whitespace = " ".toCharArray();
  char[] empty = "".toCharArray();
  TokenStream ts = new TrimFilter(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
      new Token(b, 0, b.length, 6, 10), new Token(ccc, 0, ccc.length, 11, 15),
      new Token(whitespace, 0, whitespace.length, 16, 20), new Token(empty, 0, empty.length, 21, 21)), false);

  TermAttribute token;
  assertTrue(ts.incrementToken());
  token = (TermAttribute) ts.getAttribute(TermAttribute.class);
  assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
  assertTrue(ts.incrementToken());
  assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
  assertTrue(ts.incrementToken());
  assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
  assertTrue(ts.incrementToken());
  assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
  assertTrue(ts.incrementToken());
  assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
  assertFalse(ts.incrementToken());

  a = " a".toCharArray();
  b = "b ".toCharArray();
  ccc = " c ".toCharArray();
  whitespace = " ".toCharArray();
  ts = new TrimFilter(new IterTokenStream(new Token(a, 0, a.length, 0, 2), new Token(b, 0, b.length, 0, 2),
      new Token(ccc, 0, ccc.length, 0, 3), new Token(whitespace, 0, whitespace.length, 0, 3)), true);

  List<Token> expect = tokens("a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3");
  List<Token> real = getTokens(ts);
  for (Token t : expect) {
    System.out.println("TEST:" + t);
  }
  for (Token t : real) {
    System.out.println("REAL:" + t);
  }
  assertTokEqualOff(expect, real);
}
From source file:org.apache.solr.analysis.TestWordDelimiterFilter.java
License:Apache License
private void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[],
    int posIncs[]) throws Exception {
  TokenStream ts = a.tokenStream("dummy", new StringReader(input));
  TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
  OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
  PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
      .getAttribute(PositionIncrementAttribute.class);
  for (int i = 0; i < output.length; i++) {
    assertTrue(ts.incrementToken());
    assertEquals(output[i], termAtt.term());
    assertEquals(startOffsets[i], offsetAtt.startOffset());
    assertEquals(endOffsets[i], offsetAtt.endOffset());
    assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
  }
  assertFalse(ts.incrementToken());
  ts.close();
}
From source file:org.apache.solr.handler.AnalysisRequestHandlerBase.java
License:Apache License
/**
 * Analyzes the given text using the given analyzer and returns the produced tokens.
 *
 * @param query    The query to analyze.
 * @param analyzer The analyzer to use.
 */
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
  TokenStream tokenStream = null;
  try {
    tokenStream = analyzer.tokenStream("", query);
    final Set<BytesRef> tokens = new HashSet<BytesRef>();
    final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
    final BytesRef bytes = bytesAtt.getBytesRef();

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      bytesAtt.fillBytesRef();
      tokens.add(BytesRef.deepCopyOf(bytes));
    }
    tokenStream.end();
    return tokens;
  } catch (IOException ioe) {
    throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
}
From source file:org.apache.solr.handler.ClassifyStream.java
License:Apache License
@Override
public Tuple read() throws IOException {
  if (modelTuple == null) {
    modelTuple = modelStream.read();
    if (modelTuple == null || modelTuple.EOF) {
      throw new IOException("Model tuple not found for classify stream!");
    }
    termToIndex = new HashMap<>();
    List<String> terms = modelTuple.getStrings("terms_ss");
    for (int i = 0; i < terms.size(); i++) {
      termToIndex.put(terms.get(i), i);
    }
    idfs = modelTuple.getDoubles("idfs_ds");
    modelWeights = modelTuple.getDoubles("weights_ds");
  }

  Tuple docTuple = docStream.read();
  if (docTuple.EOF)
    return docTuple;

  String text = docTuple.getString(field);
  double tfs[] = new double[termToIndex.size()];

  TokenStream tokenStream = analyzer.tokenStream(analyzerField, text);
  CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
  tokenStream.reset();

  int termCount = 0;
  while (tokenStream.incrementToken()) {
    termCount++;
    if (termToIndex.containsKey(termAtt.toString())) {
      tfs[termToIndex.get(termAtt.toString())]++;
    }
  }
  tokenStream.end();
  tokenStream.close();

  List<Double> tfidfs = new ArrayList<>(termToIndex.size());
  tfidfs.add(1.0);
  for (int i = 0; i < tfs.length; i++) {
    if (tfs[i] != 0) {
      tfs[i] = 1 + Math.log(tfs[i]);
    }
    tfidfs.add(this.idfs.get(i) * tfs[i]);
  }

  double total = 0.0;
  for (int i = 0; i < tfidfs.size(); i++) {
    total += tfidfs.get(i) * modelWeights.get(i);
  }

  double score = total * ((float) (1.0 / Math.sqrt(termCount)));
  double positiveProb = sigmoid(total);

  docTuple.put("probability_d", positiveProb);
  docTuple.put("score_d", score);

  return docTuple;
}
From source file:org.apache.solr.handler.component.WordCloudComponent.java
License:Apache License
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrQueryRequest req = rb.req;
  SolrParams params = req.getParams();
  if (!params.getBool(COMPONENT_NAME, true)) {
    return;
  }

  String wcFields = null;
  if ((wcFields = params.get("wordcloud.fl", null)) == null) {
    return;
  }

  Set<String> flds = new HashSet<String>(StrUtils.splitSmart(wcFields, ','));
  DocList ids = rb.getResults().docList;

  SolrIndexSearcher searcher = rb.req.getSearcher();
  IndexSchema schema = rb.req.getCore().getLatestSchema();

  final Analyzer analyzer = rb.req.getCore().getLatestSchema().getAnalyzer();
  final HashMap<String, FieldType> fieldsToLoad = new HashMap<String, FieldType>();

  CharTermAttribute termAtt;
  Map<String, Map<String, Integer>> tokens = new HashMap<String, Map<String, Integer>>();

  for (String f : flds) {
    SchemaField field = schema.getFieldOrNull(f);
    if (field == null || !field.stored()) {
      continue; // ignore this field
    }
    fieldsToLoad.put(f, field.getType());
    tokens.put(f, new HashMap<String, Integer>());
  }

  DocIterator iterator = ids.iterator();
  String w;
  Integer v;
  int sz = ids.size();
  for (int i = 0; i < sz; i++) {
    int id = iterator.nextDoc();
    Document doc = searcher.doc(id, fieldsToLoad.keySet());
    for (Entry<String, FieldType> en : fieldsToLoad.entrySet()) {
      Map<String, Integer> toks = tokens.get(en.getKey());
      String[] vals = doc.getValues(en.getKey());
      FieldType fType = en.getValue();

      if (vals != null) {
        for (String s : vals) {
          TokenStream buffer = analyzer.tokenStream(en.getKey(),
              new StringReader(fType.indexedToReadable(s)));

          if (!buffer.hasAttribute(CharTermAttribute.class)) {
            continue; // empty stream
          }

          termAtt = buffer.getAttribute(CharTermAttribute.class);
          buffer.reset();

          while (buffer.incrementToken()) {
            w = termAtt.toString();
            v = toks.get(w);
            if (v == null)
              v = 0;
            toks.put(w, ++v);
          }

          buffer.close();
        }
      }
    }
  }

  // TODO: filter out the tokens (use some sort of a range 0.1-0.9 by frequency)
  AtomicReader reader = searcher.getAtomicReader();
  BytesRef term;
  int df;
  String f;
  Map<String, Map<String, Double>> docFreqs = new HashMap<String, Map<String, Double>>();
  for (Entry<String, Map<String, Integer>> field : tokens.entrySet()) {
    HashMap<String, Double> idfs = new HashMap<String, Double>();
    f = field.getKey();
    docFreqs.put(f, idfs);
    int N = reader.getDocCount(f);

    for (Entry<String, Integer> token : field.getValue().entrySet()) {
      w = token.getKey();
      df = reader.docFreq(new Term(f, new BytesRef(w)));
      if (df != 0) {
        idfs.put(w, Math.log10(N / df));
      }
    }
  }

  HashMap<String, Object> ret = new HashMap<String, Object>();
  for (String fi : fieldsToLoad.keySet()) {
    HashMap<String, Object> va = new HashMap<String, Object>();
    va.put("tf", tokens.get(fi));
    va.put("idf", docFreqs.get(fi));
    ret.put(fi, va);
  }

  rb.rsp.add("wordcloud", ret);
}
From source file:org.apache.solr.highlight.GapFragmenter.java
License:Apache License
@Override
public void start(String originalText, TokenStream tokenStream) {
  offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
  posIncAtt = tokenStream.getAttribute(PositionIncrementAttribute.class);
  fragOffset = 0;
}
From source file:org.apache.solr.highlight.LuceneRegexFragmenter.java
License:Apache License
@Override
public void start(String originalText, TokenStream tokenStream) {
  currentNumFrags = 1;
  currentOffset = 0;
  addHotSpots(originalText);
  posIncAtt = tokenStream.getAttribute(PositionIncrementAttribute.class);
  offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
}