List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
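Before the examples, here is a minimal sketch of the standard consumer workflow that incrementToken() belongs to (reset, loop, end, close). The analyzer choice (StandardAnalyzer), field name, and input text are illustrative placeholders, not taken from any of the examples below.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("body", "A quick example sentence.");
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // required before the first incrementToken() call
            while (ts.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            ts.end();                     // records end-of-stream state (e.g. final offsets)
            ts.close();                   // releases underlying resources
        }
    }
}

Every example on this page follows this reset → incrementToken → end → close contract; in modern Lucene, skipping reset() typically triggers an IllegalStateException from the tokenizer.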
From source file:org.apache.solr.spelling.SpellingQueryConverter.java
License:Apache License
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); // overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}
From source file:org.apache.solr.TestTrie.java
License:Apache License
@Test
public void testTokenizer() throws Exception {
    FieldType type = h.getCore().getLatestSchema().getFieldType("tint");
    assertTrue(type instanceof TrieField);

    String value = String.valueOf(random().nextInt());
    TokenStream ts = type.getAnalyzer().tokenStream("dummy", value);
    OffsetAttribute ofsAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
        count++;
        assertEquals(0, ofsAtt.startOffset());
        assertEquals(value.length(), ofsAtt.endOffset());
    }
    final int precStep = ((TrieField) type).getPrecisionStep();
    assertEquals((32 + precStep - 1) / precStep, count);
    ts.end();
    assertEquals(value.length(), ofsAtt.startOffset());
    assertEquals(value.length(), ofsAtt.endOffset());
    ts.close();

    // Test empty one:
    ts = type.getAnalyzer().tokenStream("dummy", "");
    ts.reset();
    assertFalse(ts.incrementToken());
    ts.end();
    assertEquals(0, ofsAtt.startOffset());
    assertEquals(0, ofsAtt.endOffset());
    ts.close();
}
From source file:org.apache.solr.update.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
License:Apache License
@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    final FieldNameSelector srcSelector = getSourceSelector();
    return new UpdateRequestProcessor(next) {
        private final NLPNERTaggerOp nerTaggerOp;
        private Analyzer analyzer = null;
        {
            try {
                nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
                FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
                if (fieldType == null) {
                    throw new SolrException(SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType
                            + "' not found in the schema.");
                }
                analyzer = fieldType.getIndexAnalyzer();
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        @Override
        public void processAdd(AddUpdateCommand cmd) throws IOException {
            final SolrInputDocument doc = cmd.getSolrInputDocument();

            // Destination may be regex replace string, or "{EntityType}" replaced by
            // each entity's type, both of which can cause multiple output fields.
            Map<String, SolrInputField> destMap = new HashMap<>();

            // preserve initial values
            for (final String fname : doc.getFieldNames()) {
                if (!srcSelector.shouldMutate(fname))
                    continue;

                Collection<Object> srcFieldValues = doc.getFieldValues(fname);
                if (srcFieldValues == null || srcFieldValues.isEmpty())
                    continue;

                String resolvedDest = dest;

                if (pattern != null) {
                    Matcher matcher = pattern.matcher(fname);
                    if (matcher.find()) {
                        resolvedDest = matcher.replaceAll(dest);
                    } else {
                        log.debug("srcSelector.shouldMutate(\"{}\") returned true, "
                                + "but replacement pattern did not match, field skipped.", fname);
                        continue;
                    }
                }

                for (Object val : srcFieldValues) {
                    for (Pair<String, String> entity : extractTypedNamedEntities(val)) {
                        SolrInputField destField = null;
                        String entityName = entity.first();
                        String entityType = entity.second();
                        resolvedDest = resolvedDest.replace(ENTITY_TYPE, entityType);
                        if (doc.containsKey(resolvedDest)) {
                            destField = doc.getField(resolvedDest);
                        } else {
                            SolrInputField targetField = destMap.get(resolvedDest);
                            if (targetField == null) {
                                destField = new SolrInputField(resolvedDest);
                            } else {
                                destField = targetField;
                            }
                        }
                        destField.addValue(entityName);
                        // put it in map to avoid concurrent modification...
                        destMap.put(resolvedDest, destField);
                    }
                }
            }

            for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
                doc.put(entry.getKey(), entry.getValue());
            }
            super.processAdd(cmd);
        }

        /** Using configured NER model, extracts (name, type) pairs from the given source field value */
        private List<Pair<String, String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
            List<Pair<String, String>> entitiesWithType = new ArrayList<>();
            List<String> terms = new ArrayList<>();
            List<Integer> startOffsets = new ArrayList<>();
            List<Integer> endOffsets = new ArrayList<>();
            String fullText = srcFieldValue.toString();
            TokenStream tokenStream = analyzer.tokenStream("", fullText);
            CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
            FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
            tokenStream.reset();
            synchronized (nerTaggerOp) {
                while (tokenStream.incrementToken()) {
                    terms.add(termAtt.toString());
                    startOffsets.add(offsetAtt.startOffset());
                    endOffsets.add(offsetAtt.endOffset());
                    boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
                    if (endOfSentence) { // extract named entities one sentence at a time
                        extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                    }
                }
                tokenStream.end();
                tokenStream.close();
                if (!terms.isEmpty()) {
                    // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
                    extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                }
                nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
            }
            return entitiesWithType;
        }

        private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
                List<Integer> endOffsets, List<Pair<String, String>> entitiesWithType) {
            for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
                String text = fullText.substring(startOffsets.get(span.getStart()),
                        endOffsets.get(span.getEnd() - 1));
                entitiesWithType.add(new Pair<>(text, span.getType()));
            }
            terms.clear();
            startOffsets.clear();
            endOffsets.clear();
        }
    };
}
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testGeneral() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer general = analyzerManager.getGeneralAnalyzer();
    TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("the"));
    assertTrue(seen.contains("and"));
    assertTrue(seen.contains("dog"));
}
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testCommon() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer common = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
            fail("Shouldn't have found a numeric");
        }
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("dirty"));
    assertFalse(seen.contains("the"));
}
From source file:org.apache.tika.eval.AnalyzerManagerTest.java
License:Apache License
@Test
public void testTokenCountFilter() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 101000; i++) {
        sb.append("the ");
    }
    TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
    ts.reset();
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
    }
    ts.end();
    ts.close();
    // The token count filter caps the stream at 100,000 tokens even though 101,000 were supplied.
    assertEquals(100000, tokens);
}
From source file:org.apache.tika.eval.tokens.TokenCounter.java
License:Apache License
private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;

    TokenStream ts = analyzer.tokenStream(field, content);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, MutableInt> tokenMap = map.get(field);
    if (tokenMap == null) {
        tokenMap = new HashMap<>();
        map.put(field, tokenMap);
    }
    while (ts.incrementToken()) {
        String token = termAtt.toString();
        MutableInt cnt = tokenMap.get(token);
        if (cnt == null) {
            cnt = new MutableInt(1);
            tokenMap.put(token, cnt);
        } else {
            cnt.increment();
        }
        totalTokens++;
    }
    ts.end(); // end() must be called before close(), not after it
    ts.close();

    int totalUniqueTokens = tokenMap.size();

    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;

    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    SummaryStatistics summaryStatistics = new SummaryStatistics();
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();
        p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < e.getValue().intValue(); i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }
    }
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }

    tokenStatistics.put(field,
            new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));
}
From source file:org.apache.tika.eval.tokens.TokenCounterTest.java
License:Apache License
@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog ";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        Integer count = tokens.get(t);
        count = (count == null) ? 0 : count;
        count++;
        tokens.put(t, count);
    }
    ts.end();
    ts.close();
    assertEquals(7, tokens.size());
    assertEquals(new Integer(1), tokens.get(""));
}
From source file:org.apache.uima.lucas.indexer.analysis.TokenStreamMerger.java
License:Apache License
private void init() throws IOException {
    for (TokenStream stream : streams) {
        stream.reset();
        stream.incrementToken();
        sortedStreams.add(stream);
    }

    rebuildSortedTokens();
    initialized = true;
}
From source file:org.apache.uima.lucas.indexer.analysis.TokenStreamMerger.java
License:Apache License
@Override
public boolean incrementToken() throws IOException {
    if (!initialized)
        init();

    if (sortedStreams.size() == 0)
        return false;

    TokenStream currentTokenStream = sortedStreams.pop();
    restoreState(currentTokenStream.captureState());

    OffsetAttribute offsetAttr = (OffsetAttribute) currentTokenStream.getAttribute(OffsetAttribute.class);
    if (offsetAttr.startOffset() == currentOffset)
        posIncAtt.setPositionIncrement(0);
    else
        posIncAtt.setPositionIncrement(1);
    currentOffset = offsetAttr.startOffset();

    // advance the token stream to its next token and resort the stack
    if (currentTokenStream.incrementToken())
        sortedStreams.add(currentTokenStream);
    rebuildSortedTokens();

    return true;
}
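The TokenStreamMerger above shows the producer side of the contract: an incrementToken() implementation that feeds tokens to downstream consumers. As a simpler sketch of that side, the following hypothetical TokenFilter (the class name and upper-casing logic are illustrative only, not taken from the sources above) rewrites each token it pulls from its input:

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Illustrative only: a filter that upper-cases each token produced by its input stream.
public final class UpperCaseSketchFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    public UpperCaseSketchFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false; // upstream is exhausted: propagate end of stream
        }
        // Mutate the shared term attribute in place; the consumer reads it
        // after this method returns true. (Naive char-wise upper-casing,
        // not locale- or surrogate-pair-aware.)
        char[] buffer = termAtt.buffer();
        for (int i = 0; i < termAtt.length(); i++) {
            buffer[i] = Character.toUpperCase(buffer[i]);
        }
        return true;
    }
}

The design point visible in both this sketch and TokenStreamMerger is that incrementToken() never returns a token object: it mutates shared attributes in place and returns true, which is what makes the attribute-based API allocation-free per token.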