List of usage examples for org.apache.lucene.analysis.TokenStream.getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. The method returns the instance of the requested attribute if this stream contains it, and throws an IllegalArgumentException otherwise; use hasAttribute(Class) to test for presence, or addAttribute(Class) to create the attribute on demand.
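Before the individual examples, here is a minimal, self-contained sketch of the typical consume loop. The StandardAnalyzer, the field name "body", and the class name are illustrative assumptions, not mandated by the API:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        // StandardAnalyzer is an arbitrary choice here; any Analyzer works the same way
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("body", "Hello attribute world");
            // getAttribute throws if the attribute is absent; addAttribute would add it instead
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset();                  // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                    // records final offset state
            ts.close();                  // releases the stream for reuse
        }
    }
}

Lucene's documentation recommends addAttribute(Class) in generic consumers, since it never throws; getAttribute is appropriate when the attribute is known to exist, as in the examples below.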
From source file: org.apache.solr.legacy.TestLegacyFieldReuse.java
License: Apache License
private void assertNumericContents(int value, TokenStream ts) throws IOException {
    assertTrue(ts instanceof LegacyNumericTokenStream);
    LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
    ts.reset();
    boolean seen = false;
    while (ts.incrementToken()) {
        if (numericAtt.getShift() == 0) {
            assertEquals(value, numericAtt.getRawValue());
            seen = true;
        }
    }
    ts.end();
    ts.close();
    assertTrue(seen);
}
From source file: org.apache.solr.schema.CollationField.java
License: Apache License
/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because JDK collators might not be thread safe (when they are,
 * it is only because all methods are synchronized), this keeps
 * things simple (we already have a threadlocal clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}
From source file: org.apache.solr.schema.EntityTextField.java
License: Apache License
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;
    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file: org.apache.solr.schema.ICUCollationField.java
License: Apache License
/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because ICU collators are not thread safe, this keeps things
 * simple (we already have a threadlocal clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}
From source file: org.apache.solr.schema.JsonPreAnalyzedParser.java
License: Apache License
@Override
public String toFormattedString(Field f) throws IOException {
    Map<String, Object> map = new LinkedHashMap<String, Object>();
    map.put(VERSION_KEY, VERSION);
    if (f.fieldType().stored()) {
        String stringValue = f.stringValue();
        if (stringValue != null) {
            map.put(STRING_KEY, stringValue);
        }
        BytesRef binaryValue = f.binaryValue();
        if (binaryValue != null) {
            map.put(BINARY_KEY,
                    Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        List<Map<String, Object>> tokens = new LinkedList<Map<String, Object>>();
        while (ts.incrementToken()) {
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            Map<String, Object> tok = new TreeMap<String, Object>();
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                if (!ts.hasAttribute(cl)) {
                    continue;
                }
                Attribute att = ts.getAttribute(cl);
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = new String(catt.buffer(), 0, catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    tTerm = tatt.getBytesRef().utf8ToString();
                } else {
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
                        tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.put(TYPE_KEY, ((TypeAttribute) att).type());
                    } else {
                        tok.put(cl.getName(), att.toString());
                    }
                }
            }
            String term = (cTerm != null) ? cTerm : tTerm;
            if (term != null && term.length() > 0) {
                tok.put(TOKEN_KEY, term);
            }
            tokens.add(tok);
        }
        map.put(TOKENS_KEY, tokens);
    }
    return JSONUtil.toJSON(map, -1);
}
From source file: org.apache.solr.schema.SimplePreAnalyzedParser.java
License: Apache License
@Override
public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
        String s = f.stringValue();
        if (s != null) {
            // encode the equals sign
            s = s.replaceAll("=", "\\\\=");
            sb.append('=');
            sb.append(s);
            sb.append('=');
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        StringBuilder tok = new StringBuilder();
        boolean next = false;
        while (ts.incrementToken()) {
            if (next) {
                sb.append(' ');
            } else {
                next = true;
            }
            tok.setLength(0);
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                if (!ts.hasAttribute(cl)) {
                    continue;
                }
                Attribute att = ts.getAttribute(cl);
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = escape(catt.buffer(), catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
                    tTerm = escape(tTermChars, tTermChars.length);
                } else {
                    if (tok.length() > 0)
                        tok.append(',');
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.append("s=" + ((OffsetAttribute) att).startOffset()
                                + ",e=" + ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
                        } else if (tok.length() > 0) {
                            tok.setLength(tok.length() - 1); // remove the last comma
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.append("y=" + escape(((TypeAttribute) att).type()));
                    } else {
                        tok.append(cl.getName() + "=" + escape(att.toString()));
                    }
                }
            }
            String term = (cTerm != null) ? cTerm : tTerm;
            if (term != null && term.length() > 0) {
                if (tok.length() > 0) {
                    tok.insert(0, term + ",");
                } else {
                    tok.insert(0, term);
                }
            }
            sb.append(tok);
        }
    }
    return sb.toString();
}
From source file: org.apache.solr.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License: Apache License
/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 *
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {
    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(getString()));
    SortedMap<Integer, SortedSet<TextInQuery>> startPosToTextsInQuery =
            new TreeMap<Integer, SortedSet<TextInQuery>>();
    try {
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well
                TextInQuery textInQuery = new TextInQuery(term.toString(),
                        offsetAttribute.startOffset(), offsetAttribute.endOffset());

                // brain-dead multimap logic... man, I wish we had Google Guava here
                SortedSet<TextInQuery> existingList = startPosToTextsInQuery.get(offsetAttribute.startOffset());
                if (existingList == null) {
                    existingList = new TreeSet<TextInQuery>();
                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), existingList);
                }
                existingList.add(textInQuery);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    }

    List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<List<TextInQuery>>(
            startPosToTextsInQuery.values().size());
    for (SortedSet<TextInQuery> sortedSet : startPosToTextsInQuery.values()) {
        sortedTextsInQuery.add(new ArrayList<TextInQuery>(sortedSet));
    }

    // have to use the start positions and end positions to figure out all possible combinations
    List<String> alternateQueries = buildUpAlternateQueries(sortedTextsInQuery);

    return createSynonymQueries(solrParams, alternateQueries);
}
From source file: org.apache.tika.eval.AnalyzerManagerTest.java
License: Apache License
@Test
public void testGeneral() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer general = analyzerManager.getGeneralAnalyzer();
    TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("the"));
    assertTrue(seen.contains("and"));
    assertTrue(seen.contains("dog"));
}
From source file: org.apache.tika.eval.AnalyzerManagerTest.java
License: Apache License
@Test
public void testCommon() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer common = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
            fail("Shouldn't have found a numeric");
        }
        seen.add(t);
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("dirty"));
    assertFalse(seen.contains("the"));
}
From source file: org.apache.tika.eval.AnalyzerManagerTest.java
License: Apache License
@Test
public void testTokenCountFilter() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 101000; i++) {
        sb.append("the ");
    }
    TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
    ts.reset();
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
    }
    ts.end();
    ts.close();
    // the general analyzer's token-count filter caps the stream at 100,000 tokens
    assertEquals(100000, tokens);
}