Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usages of the org.apache.lucene.analysis TokenStream.incrementToken() method.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
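
The standard consumer workflow documented on TokenStream is: obtain the stream, call reset(), loop on incrementToken() until it returns false, then call end() and close(). Below is a minimal sketch of that contract, assuming a Lucene version where WhitespaceAnalyzer has a no-argument constructor; the field name "body" and the sample text are placeholders chosen for illustration only.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        // "body" is an arbitrary field name used only for this sketch
        try (TokenStream ts = analyzer.tokenStream("body", "hello token stream world")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                    // required before the first incrementToken()
            while (ts.incrementToken()) {  // advances to the next token; false means exhausted
                System.out.println(termAtt.toString());
            }
            ts.end();                      // records end-of-stream state (e.g. final offset)
        }                                  // try-with-resources closes the stream
        analyzer.close();
    }
}

As several of the examples below show, forgetting reset() before the first incrementToken() is a common source of "TokenStream contract violation" errors in recent Lucene versions, and end()/close() should be called even when the tokens themselves are discarded.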

Usage

From source file: org.apache.solr.handler.DumpIndexField.java

License: Apache License

private void runSynchronously(RequestQueue queue, SolrQueryRequest req)
        throws MalformedURLException, IOException, InterruptedException {

    SolrCore core = req.getCore();
    IndexSchema schema = req.getSchema();

    RequestData data = queue.pop();

    if (!allowed.matcher(data.sourceField).matches()) {
        data.msg("Export of this field is not allowed: " + data.sourceField);
        queue.registerFailedBatch(data);
        return;
    }
    SchemaField field = core.getSchema().getFieldOrNull(data.sourceField);

    if (field == null || !field.stored()) {
        data.msg("We cannot dump fields that are not stored: " + data.sourceField);
        queue.registerFailedBatch(data);
        return;
    }

    final Analyzer analyzer = core.getSchema().getQueryAnalyzer();

    SchemaField targetField = core.getSchema().getFieldOrNull(data.targetField);

    if (targetField == null) {
        data.msg("We cannot find analyzer for: " + data.targetField);
        queue.registerFailedBatch(data);
        return;
    }

    final String targetAnalyzer = data.targetField;

    DirectoryReader ir = req.getSearcher().getIndexReader();
    SolrIndexSearcher se = req.getSearcher();

    final HashSet<String> fieldsToLoad = new HashSet<String>();
    fieldsToLoad.add(data.sourceField);

    se.search(new MatchAllDocsQuery(), new Collector() {
        private AtomicReader reader;
        private int i = 0;

        @Override
        public boolean acceptsDocsOutOfOrder() {
            return true;
        }

        @Override
        public void collect(int i) {
            Document d;
            try {
                d = reader.document(i, fieldsToLoad);
                for (String f : fieldsToLoad) {
                    String[] vals = d.getValues(f);
                    for (String s : vals) {
                        TokenStream ts = analyzer.tokenStream(targetAnalyzer, new StringReader(s));
                        ts.reset();
                        while (ts.incrementToken()) {
                            //pass
                        }
                    }
                }
            } catch (IOException e) {
                // pass
            }
        }

        @Override
        public void setNextReader(AtomicReaderContext context) {
            this.reader = context.reader();
        }

        @Override
        public void setScorer(org.apache.lucene.search.Scorer scorer) {
            // Do Nothing
        }
    });

    // persist the data
    TokenStream ts = analyzer.tokenStream(data.targetField, new StringReader("xxx"));
    ts.reset();

}

From source file: org.apache.solr.highlight.HighlighterTest.java

License: Apache License

@Test
public void testTermOffsetsTokenStream() throws Exception {
    String[] multivalued = { "a b c d", "e f g", "h", "i j k l m n" };
    Analyzer a1 = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    TokenStream tokenStream = a1.tokenStream("", "a b c d e f g h i j k l m n");
    tokenStream.reset();

    TermOffsetsTokenStream tots = new TermOffsetsTokenStream(tokenStream);
    for (String v : multivalued) {
        TokenStream ts1 = tots.getMultiValuedTokenStream(v.length());
        Analyzer a2 = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
        TokenStream ts2 = a2.tokenStream("", v);
        ts2.reset();

        while (ts1.incrementToken()) {
            assertTrue(ts2.incrementToken());
            assertEquals(ts1, ts2);
        }
        assertFalse(ts2.incrementToken());
    }
}

From source file: org.apache.solr.legacy.TestLegacyFieldReuse.java

License: Apache License

private void assertNumericContents(int value, TokenStream ts) throws IOException {
    assertTrue(ts instanceof LegacyNumericTokenStream);
    LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
    ts.reset();
    boolean seen = false;
    while (ts.incrementToken()) {
        if (numericAtt.getShift() == 0) {
            assertEquals(value, numericAtt.getRawValue());
            seen = true;
        }
    }
    ts.end();
    ts.close();
    assertTrue(seen);
}

From source file: org.apache.solr.schema.CollationField.java

License: Apache License

/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because JDK collators might not be thread safe (when they are,
 * it's just that all methods are synchronized), this keeps things
 * simple (we already have a threadlocal clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}

From source file: org.apache.solr.schema.EntityTextField.java

License: Apache License

public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}

From source file: org.apache.solr.schema.ICUCollationField.java

License: Apache License

/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because ICU collators are not thread safe, this keeps things
 * simple (we already have a threadlocal clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}

From source file: org.apache.solr.schema.JsonPreAnalyzedParser.java

License: Apache License

@Override
public String toFormattedString(Field f) throws IOException {
    Map<String, Object> map = new LinkedHashMap<String, Object>();
    map.put(VERSION_KEY, VERSION);
    if (f.fieldType().stored()) {
        String stringValue = f.stringValue();
        if (stringValue != null) {
            map.put(STRING_KEY, stringValue);
        }
        BytesRef binaryValue = f.binaryValue();
        if (binaryValue != null) {
            map.put(BINARY_KEY,
                    Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        List<Map<String, Object>> tokens = new LinkedList<Map<String, Object>>();
        while (ts.incrementToken()) {
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            Map<String, Object> tok = new TreeMap<String, Object>();
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                if (!ts.hasAttribute(cl)) {
                    continue;
                }
                Attribute att = ts.getAttribute(cl);
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = new String(catt.buffer(), 0, catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    tTerm = tatt.getBytesRef().utf8ToString();
                } else {
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
                        tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.put(TYPE_KEY, ((TypeAttribute) att).type());
                    } else {
                        tok.put(cl.getName(), att.toString());
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                tok.put(TOKEN_KEY, term);
            }
            tokens.add(tok);
        }
        map.put(TOKENS_KEY, tokens);
    }
    return JSONUtil.toJSON(map, -1);
}

From source file: org.apache.solr.schema.SimplePreAnalyzedParser.java

License: Apache License

@Override
public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
        String s = f.stringValue();
        if (s != null) {
            // encode the equals sign
            s = s.replaceAll("=", "\\=");
            sb.append('=');
            sb.append(s);
            sb.append('=');
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        StringBuilder tok = new StringBuilder();
        boolean next = false;
        while (ts.incrementToken()) {
            if (next) {
                sb.append(' ');
            } else {
                next = true;
            }
            tok.setLength(0);
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                if (!ts.hasAttribute(cl)) {
                    continue;
                }
                Attribute att = ts.getAttribute(cl);
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = escape(catt.buffer(), catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
                    tTerm = escape(tTermChars, tTermChars.length);
                } else {
                    if (tok.length() > 0)
                        tok.append(',');
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e="
                                + ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
                        } else if (tok.length() > 0) {
                            tok.setLength(tok.length() - 1); // remove the last comma
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.append("y=" + escape(((TypeAttribute) att).type()));
                    } else {

                        tok.append(cl.getName() + "=" + escape(att.toString()));
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                if (tok.length() > 0) {
                    tok.insert(0, term + ",");
                } else {
                    tok.insert(0, term);
                }
            }
            sb.append(tok);
        }
    }
    return sb.toString();
}

From source file: org.apache.solr.search.SynonymExpandingExtendedDismaxQParserPlugin.java

License: Apache License

/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(getString()));

    SortedMap<Integer, SortedSet<TextInQuery>> startPosToTextsInQuery = new TreeMap<Integer, SortedSet<TextInQuery>>();

    try {
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                TextInQuery textInQuery = new TextInQuery(term.toString(), offsetAttribute.startOffset(),
                        offsetAttribute.endOffset());

                // brain-dead multimap logic... man, I wish we had Google Guava here
                SortedSet<TextInQuery> existingList = startPosToTextsInQuery.get(offsetAttribute.startOffset());
                if (existingList == null) {
                    existingList = new TreeSet<TextInQuery>();
                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), existingList);
                }
                existingList.add(textInQuery);
            }
        }

    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    }

    List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<List<TextInQuery>>(
            startPosToTextsInQuery.values().size());
    for (SortedSet<TextInQuery> sortedSet : startPosToTextsInQuery.values()) {
        sortedTextsInQuery.add(new ArrayList<TextInQuery>(sortedSet));
    }

    // have to use the start positions and end positions to figure out all possible combinations
    List<String> alternateQueries = buildUpAlternateQueries(sortedTextsInQuery);

    return createSynonymQueries(solrParams, alternateQueries);
}

From source file: org.apache.solr.spelling.SimpleQueryConverter.java

License: Apache License

@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> result = new HashSet<Token>();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);

    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("", origQuery);
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

        ts.reset();

        while (ts.incrementToken()) {
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}