Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usages of the org.apache.lucene.analysis TokenStream.incrementToken() method.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
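
The standard consumer workflow documented on TokenStream is: obtain the stream, call reset(), loop on incrementToken() until it returns false, then call end() and close(). Below is a minimal sketch of that contract, assuming a Lucene version where WhitespaceAnalyzer has a no-argument constructor; the field name "body" and the sample text are placeholders chosen for illustration only.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        // "body" is an arbitrary field name used only for this sketch
        try (TokenStream ts = analyzer.tokenStream("body", "hello token stream world")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                    // required before the first incrementToken()
            while (ts.incrementToken()) {  // advances to the next token; false means exhausted
                System.out.println(termAtt.toString());
            }
            ts.end();                      // records end-of-stream state (e.g. final offset)
        }                                  // try-with-resources closes the stream
        analyzer.close();
    }
}

As several of the examples below show, forgetting reset() before the first incrementToken() is a common source of "TokenStream contract violation" errors in recent Lucene versions, and end()/close() should be called even when the tokens themselves are discarded.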

Usage

From source file: org.apache.solr.handler.DumpIndexField.java

License: Apache License

private void runSynchronously(RequestQueue queue, SolrQueryRequest req)
        throws MalformedURLException, IOException, InterruptedException {

    SolrCore core = req.getCore();
    IndexSchema schema = req.getSchema();

    RequestData data = queue.pop();

    if (!allowed.matcher(data.sourceField).matches()) {
        data.msg("Export of this field is not allowed: " + data.sourceField);
        queue.registerFailedBatch(data);
        return;
    }
    SchemaField field = core.getSchema().getFieldOrNull(data.sourceField);

    if (field == null || !field.stored()) {
        data.msg("We cannot dump fields that are not stored: " + data.sourceField);
        queue.registerFailedBatch(data);
        return;
    }

    final Analyzer analyzer = core.getSchema().getQueryAnalyzer();

    SchemaField targetField = core.getSchema().getFieldOrNull(data.targetField);

    if (targetField == null) {
        data.msg("We cannot find analyzer for: " + data.targetField);
        queue.registerFailedBatch(data);
        return;
    }

    final String targetAnalyzer = data.targetField;

    DirectoryReader ir = req.getSearcher().getIndexReader();
    SolrIndexSearcher se = req.getSearcher();

    final HashSet<String> fieldsToLoad = new HashSet<String>();
    fieldsToLoad.add(data.sourceField);

    se.search(new MatchAllDocsQuery(), new Collector() {
        private AtomicReader reader;
        private int i = 0;

        @Override
        public boolean acceptsDocsOutOfOrder() {
            return true;
        }

        @Override
        public void collect(int i) {
            Document d;
            try {
                d = reader.document(i, fieldsToLoad);
                for (String f : fieldsToLoad) {
                    String[] vals = d.getValues(f);
                    for (String s : vals) {
                        TokenStream ts = analyzer.tokenStream(targetAnalyzer, new StringReader(s));
                        ts.reset();
                        while (ts.incrementToken()) {
                            //pass
                        }
                    }
                }
            } catch (IOException e) {
                // pass
            }
        }

        @Override
        public void setNextReader(AtomicReaderContext context) {
            this.reader = context.reader();
        }

        @Override
        public void setScorer(org.apache.lucene.search.Scorer scorer) {
            // Do Nothing
        }
    });

    // persist the data
    TokenStream ts = analyzer.tokenStream(data.targetField, new StringReader("xxx"));
    ts.reset();

}

From source file: org.apache.solr.highlight.HighlighterTest.java

License: Apache License

@Test
public void testTermOffsetsTokenStream() throws Exception {
    String[] multivalued = { "a b c d", "e f g", "h", "i j k l m n" };
    Analyzer a1 = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    TokenStream tokenStream = a1.tokenStream("", "a b c d e f g h i j k l m n");
    tokenStream.reset();

    TermOffsetsTokenStream tots = new TermOffsetsTokenStream(tokenStream);
    for (String v : multivalued) {
        TokenStream ts1 = tots.getMultiValuedTokenStream(v.length());
        Analyzer a2 = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
        TokenStream ts2 = a2.tokenStream("", v);
        ts2.reset();

        while (ts1.incrementToken()) {
            assertTrue(ts2.incrementToken());
            assertEquals(ts1, ts2);
        }
        assertFalse(ts2.incrementToken());
    }
}

From source file: org.apache.solr.legacy.TestLegacyFieldReuse.java

License: Apache License

private void assertNumericContents(int value, TokenStream ts) throws IOException {
    assertTrue(ts instanceof LegacyNumericTokenStream);
    LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
    ts.reset();
    boolean seen = false;
    while (ts.incrementToken()) {
        if (numericAtt.getShift() == 0) {
            assertEquals(value, numericAtt.getRawValue());
            seen = true;
        }
    }
    ts.end();
    ts.close();
    assertTrue(seen);
}

From source file: org.apache.solr.schema.CollationField.java

License: Apache License

/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because JDK collators might not be thread safe (when they are,
 * it's just that all methods are synchronized), this keeps things
 * simple (we already have a threadlocal clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}

From source file: org.apache.solr.schema.EntityTextField.java

License: Apache License

public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null)
        return null;

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "analyzer returned too many terms for multiTerm term: " + part);

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}

From source file: org.apache.solr.schema.ICUCollationField.java

License: Apache License

/**
 * Analyze the range with the analyzer, instead of the collator.
 * Because ICU collators are not thread safe, this keeps things
 * simple (we already have a threadlocal clone in the reused TS).
 */
private BytesRef analyzeRangePart(String field, String part) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
        termAtt.fillBytesRef();
        assert !source.incrementToken();

        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Unable analyze range part: " + part, e);
    } finally {
        IOUtils.closeQuietly(source);
    }
}

From source file: org.apache.solr.schema.JsonPreAnalyzedParser.java

License: Apache License

@Override
public String toFormattedString(Field f) throws IOException {
    Map<String, Object> map = new LinkedHashMap<String, Object>();
    map.put(VERSION_KEY, VERSION);
    if (f.fieldType().stored()) {
        String stringValue = f.stringValue();
        if (stringValue != null) {
            map.put(STRING_KEY, stringValue);
        }
        BytesRef binaryValue = f.binaryValue();
        if (binaryValue != null) {
            map.put(BINARY_KEY,
                    Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        List<Map<String, Object>> tokens = new LinkedList<Map<String, Object>>();
        while (ts.incrementToken()) {
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            Map<String, Object> tok = new TreeMap<String, Object>();
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                if (!ts.hasAttribute(cl)) {
                    continue;
                }
                Attribute att = ts.getAttribute(cl);
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = new String(catt.buffer(), 0, catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    tTerm = tatt.getBytesRef().utf8ToString();
                } else {
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
                        tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.put(TYPE_KEY, ((TypeAttribute) att).type());
                    } else {
                        tok.put(cl.getName(), att.toString());
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                tok.put(TOKEN_KEY, term);
            }
            tokens.add(tok);
        }
        map.put(TOKENS_KEY, tokens);
    }
    return JSONUtil.toJSON(map, -1);
}

From source file: org.apache.solr.schema.SimplePreAnalyzedParser.java

License: Apache License

@Override
public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
        String s = f.stringValue();
        if (s != null) {
            // encode the equals sign
            s = s.replaceAll("=", "\\=");
            sb.append('=');
            sb.append(s);
            sb.append('=');
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        StringBuilder tok = new StringBuilder();
        boolean next = false;
        while (ts.incrementToken()) {
            if (next) {
                sb.append(' ');
            } else {
                next = true;
            }
            tok.setLength(0);
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                if (!ts.hasAttribute(cl)) {
                    continue;
                }
                Attribute att = ts.getAttribute(cl);
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = escape(catt.buffer(), catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
                    tTerm = escape(tTermChars, tTermChars.length);
                } else {
                    if (tok.length() > 0)
                        tok.append(',');
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e="
                                + ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
                        } else if (tok.length() > 0) {
                            tok.setLength(tok.length() - 1); // remove the last comma
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.append("y=" + escape(((TypeAttribute) att).type()));
                    } else {

                        tok.append(cl.getName() + "=" + escape(att.toString()));
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                if (tok.length() > 0) {
                    tok.insert(0, term + ",");
                } else {
                    tok.insert(0, term);
                }
            }
            sb.append(tok);
        }
    }
    return sb.toString();
}

From source file: org.apache.solr.search.SynonymExpandingExtendedDismaxQParserPlugin.java

License: Apache License

/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(getString()));

    SortedMap<Integer, SortedSet<TextInQuery>> startPosToTextsInQuery = new TreeMap<Integer, SortedSet<TextInQuery>>();

    try {
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                TextInQuery textInQuery = new TextInQuery(term.toString(), offsetAttribute.startOffset(),
                        offsetAttribute.endOffset());

                // brain-dead multimap logic... man, I wish we had Google Guava here
                SortedSet<TextInQuery> existingList = startPosToTextsInQuery.get(offsetAttribute.startOffset());
                if (existingList == null) {
                    existingList = new TreeSet<TextInQuery>();
                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), existingList);
                }
                existingList.add(textInQuery);
            }
        }

    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    }

    List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<List<TextInQuery>>(
            startPosToTextsInQuery.values().size());
    for (SortedSet<TextInQuery> sortedSet : startPosToTextsInQuery.values()) {
        sortedTextsInQuery.add(new ArrayList<TextInQuery>(sortedSet));
    }

    // have to use the start positions and end positions to figure out all possible combinations
    List<String> alternateQueries = buildUpAlternateQueries(sortedTextsInQuery);

    return createSynonymQueries(solrParams, alternateQueries);
}

From source file: org.apache.solr.spelling.SimpleQueryConverter.java

License: Apache License

@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> result = new HashSet<Token>();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);

    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("", origQuery);
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

        ts.reset();

        while (ts.incrementToken()) {
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}