Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
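
Below is a minimal, self-contained sketch of the consumer workflow around incrementToken(). The class name, analyzer choice (StandardAnalyzer), field name, and sample text are illustrative assumptions; the reset/incrementToken/end/close sequence follows the TokenStream contract used throughout the examples on this page.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(); // illustrative analyzer; any Analyzer works
        try (TokenStream stream = analyzer.tokenStream("body", "The quick brown fox")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                       // must be called before the first incrementToken()
            while (stream.incrementToken()) {     // advances the stream; returns false when exhausted
                System.out.println(termAtt.toString());
            }
            stream.end();                         // records end-of-stream state (e.g. final offset)
        }                                         // try-with-resources closes the stream
        analyzer.close();
    }
}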

Usage

From source file: org.apache.solr.spelling.SpellingQueryConverter.java

License: Apache License

protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue)
        throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); //overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}

From source file: org.apache.solr.TestTrie.java

License: Apache License

@Test
public void testTokenizer() throws Exception {
    FieldType type = h.getCore().getLatestSchema().getFieldType("tint");
    assertTrue(type instanceof TrieField);

    String value = String.valueOf(random().nextInt());
    TokenStream ts = type.getAnalyzer().tokenStream("dummy", value);
    OffsetAttribute ofsAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
        count++;
        assertEquals(0, ofsAtt.startOffset());
        assertEquals(value.length(), ofsAtt.endOffset());
    }
    final int precStep = ((TrieField) type).getPrecisionStep();
    assertEquals((32 + precStep - 1) / precStep, count);
    ts.end();
    assertEquals(value.length(), ofsAtt.startOffset());
    assertEquals(value.length(), ofsAtt.endOffset());
    ts.close();

    // Test empty one:
    ts = type.getAnalyzer().tokenStream("dummy", "");
    ts.reset();
    assertFalse(ts.incrementToken());
    ts.end();
    assertEquals(0, ofsAtt.startOffset());
    assertEquals(0, ofsAtt.endOffset());
    ts.close();
}

From source file: org.apache.solr.update.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java

License: Apache License

@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    final FieldNameSelector srcSelector = getSourceSelector();
    return new UpdateRequestProcessor(next) {
        private final NLPNERTaggerOp nerTaggerOp;
        private Analyzer analyzer = null;
        {
            try {
                nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
                FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
                if (fieldType == null) {
                    throw new SolrException(SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType
                            + "' not found in the schema.");
                }
                analyzer = fieldType.getIndexAnalyzer();
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        @Override
        public void processAdd(AddUpdateCommand cmd) throws IOException {

            final SolrInputDocument doc = cmd.getSolrInputDocument();

            // Destination may be regex replace string, or "{EntityType}" replaced by
            // each entity's type, both of which can cause multiple output fields.
            Map<String, SolrInputField> destMap = new HashMap<>();

            // preserve initial values
            for (final String fname : doc.getFieldNames()) {
                if (!srcSelector.shouldMutate(fname))
                    continue;

                Collection<Object> srcFieldValues = doc.getFieldValues(fname);
                if (srcFieldValues == null || srcFieldValues.isEmpty())
                    continue;

                String resolvedDest = dest;

                if (pattern != null) {
                    Matcher matcher = pattern.matcher(fname);
                    if (matcher.find()) {
                        resolvedDest = matcher.replaceAll(dest);
                    } else {
                        log.debug("srcSelector.shouldMutate(\"{}\") returned true, "
                                + "but replacement pattern did not match, field skipped.", fname);
                        continue;
                    }
                }

                for (Object val : srcFieldValues) {
                    for (Pair<String, String> entity : extractTypedNamedEntities(val)) {
                        SolrInputField destField = null;
                        String entityName = entity.first();
                        String entityType = entity.second();
                        resolvedDest = resolvedDest.replace(ENTITY_TYPE, entityType);
                        if (doc.containsKey(resolvedDest)) {
                            destField = doc.getField(resolvedDest);
                        } else {
                            SolrInputField targetField = destMap.get(resolvedDest);
                            if (targetField == null) {
                                destField = new SolrInputField(resolvedDest);
                            } else {
                                destField = targetField;
                            }
                        }
                        destField.addValue(entityName);

                        // put it in map to avoid concurrent modification...
                        destMap.put(resolvedDest, destField);
                    }
                }
            }

            for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
                doc.put(entry.getKey(), entry.getValue());
            }
            super.processAdd(cmd);
        }

        /** Using configured NER model, extracts (name, type) pairs from the given source field value */
        private List<Pair<String, String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
            List<Pair<String, String>> entitiesWithType = new ArrayList<>();
            List<String> terms = new ArrayList<>();
            List<Integer> startOffsets = new ArrayList<>();
            List<Integer> endOffsets = new ArrayList<>();
            String fullText = srcFieldValue.toString();
            TokenStream tokenStream = analyzer.tokenStream("", fullText);
            CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
            FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
            tokenStream.reset();
            synchronized (nerTaggerOp) {
                while (tokenStream.incrementToken()) {
                    terms.add(termAtt.toString());
                    startOffsets.add(offsetAtt.startOffset());
                    endOffsets.add(offsetAtt.endOffset());
                    boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
                    if (endOfSentence) { // extract named entities one sentence at a time
                        extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets,
                                entitiesWithType);
                    }
                }
                tokenStream.end();
                tokenStream.close();
                if (!terms.isEmpty()) { // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
                    extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                }
                nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
            }
            return entitiesWithType;
        }

        private void extractEntitiesFromSentence(String fullText, List<String> terms,
                List<Integer> startOffsets, List<Integer> endOffsets,
                List<Pair<String, String>> entitiesWithType) {
            for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
                String text = fullText.substring(startOffsets.get(span.getStart()),
                        endOffsets.get(span.getEnd() - 1));
                entitiesWithType.add(new Pair<>(text, span.getType()));
            }
            terms.clear();
            startOffsets.clear();
            endOffsets.clear();
        }
    };
}

From source file: org.apache.tika.eval.AnalyzerManagerTest.java

License: Apache License

@Test
public void testGeneral() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer general = analyzerManager.getGeneralAnalyzer();
    TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
    ts.reset();

    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("the"));
    assertTrue(seen.contains("and"));
    assertTrue(seen.contains("dog"));

}

From source file: org.apache.tika.eval.AnalyzerManagerTest.java

License: Apache License

@Test
public void testCommon() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer common = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
            fail("Shouldn't have found a numeric");
        }
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("dirty"));
    assertFalse(seen.contains("the"));

}

From source file: org.apache.tika.eval.AnalyzerManagerTest.java

License: Apache License

@Test
public void testTokenCountFilter() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 101000; i++) {
        sb.append("the ");
    }
    TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
    }

    assertEquals(100000, tokens);

}

From source file: org.apache.tika.eval.tokens.TokenCounter.java

License: Apache License

private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;

    TokenStream ts = analyzer.tokenStream(field, content);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, MutableInt> tokenMap = map.get(field);
    if (tokenMap == null) {
        tokenMap = new HashMap<>();
        map.put(field, tokenMap);
    }
    while (ts.incrementToken()) {
        String token = termAtt.toString();
        MutableInt cnt = tokenMap.get(token);
        if (cnt == null) {
            cnt = new MutableInt(1);
            tokenMap.put(token, cnt);
        } else {
            cnt.increment();
        }
        totalTokens++;
    }
    ts.end();   // end() must be called before close()
    ts.close();

    int totalUniqueTokens = tokenMap.size();

    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;

    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);

    SummaryStatistics summaryStatistics = new SummaryStatistics();
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();

        p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < e.getValue().intValue(); i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }

    }
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }

    /*            Collections.sort(allTokens);
    List<TokenIntPair> topNList = new ArrayList<>(topN);
    for (int i = 0; i < topN && i < allTokens.size(); i++) {
        topNList.add(allTokens.get(i));
    }*/

    tokenStatistics.put(field,
            new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));

}

From source file: org.apache.tika.eval.tokens.TokenCounterTest.java

License: Apache License

@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog ";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        Integer count = tokens.get(t);
        count = (count == null) ? 0 : count;
        count++;
        tokens.put(t, count);
    }
    ts.end();
    ts.close();
    assertEquals(7, tokens.size());
    assertEquals(new Integer(1), tokens.get(""));
}

From source file: org.apache.uima.lucas.indexer.analysis.TokenStreamMerger.java

License: Apache License

private void init() throws IOException {
    for (TokenStream stream : streams) {
        stream.reset();
        stream.incrementToken();
        sortedStreams.add(stream);
    }
    rebuildSortedTokens();
    initialized = true;
}

From source file: org.apache.uima.lucas.indexer.analysis.TokenStreamMerger.java

License: Apache License

@Override
public boolean incrementToken() throws IOException {
    if (!initialized)
        init();

    if (sortedStreams.size() == 0)
        return false;

    TokenStream currentTokenStream = sortedStreams.pop();

    restoreState(currentTokenStream.captureState());

    OffsetAttribute offsetAttr = (OffsetAttribute) currentTokenStream.getAttribute(OffsetAttribute.class);
    if (offsetAttr.startOffset() == currentOffset)
        posIncAtt.setPositionIncrement(0);
    else
        posIncAtt.setPositionIncrement(1);

    currentOffset = offsetAttr.startOffset();

    // advance the token stream to its next token and re-sort the stack
    if (currentTokenStream.incrementToken())
        sortedStreams.add(currentTokenStream);
    rebuildSortedTokens();

    return true;
}