List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException
This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API).
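Before the project examples below, here is a minimal sketch of where end() fits in the usual consumption sequence (reset, incrementToken loop, end, close). The class name, field name, sample text, and choice of StandardAnalyzer are placeholders for illustration only, assuming a recent Lucene release where Analyzer and TokenStream are closeable; they are not taken from any of the examples on this page.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamEndSketch {
    public static void main(String[] args) throws IOException {
        // Field name and sample text are arbitrary placeholders.
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("body", "a quick example sentence")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                  // must be called before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                    // called once incrementToken() has returned false
        }                                // try-with-resources closes the stream and the analyzer
    }
}

end() records end-of-stream state, such as the final offset, after incrementToken() has returned false; close() then releases the stream's resources, handled here by try-with-resources.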
From source file: org.apache.solr.update.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
License: Apache License
@Override
public final UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    final FieldNameSelector srcSelector = getSourceSelector();
    return new UpdateRequestProcessor(next) {
        private final NLPNERTaggerOp nerTaggerOp;
        private Analyzer analyzer = null;

        {
            try {
                nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
                FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
                if (fieldType == null) {
                    throw new SolrException(SERVER_ERROR,
                            ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType + "' not found in the schema.");
                }
                analyzer = fieldType.getIndexAnalyzer();
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        @Override
        public void processAdd(AddUpdateCommand cmd) throws IOException {
            final SolrInputDocument doc = cmd.getSolrInputDocument();

            // Destination may be regex replace string, or "{EntityType}" replaced by
            // each entity's type, both of which can cause multiple output fields.
            Map<String, SolrInputField> destMap = new HashMap<>();

            // preserve initial values
            for (final String fname : doc.getFieldNames()) {
                if (!srcSelector.shouldMutate(fname))
                    continue;

                Collection<Object> srcFieldValues = doc.getFieldValues(fname);
                if (srcFieldValues == null || srcFieldValues.isEmpty())
                    continue;

                String resolvedDest = dest;

                if (pattern != null) {
                    Matcher matcher = pattern.matcher(fname);
                    if (matcher.find()) {
                        resolvedDest = matcher.replaceAll(dest);
                    } else {
                        log.debug("srcSelector.shouldMutate(\"{}\") returned true, "
                                + "but replacement pattern did not match, field skipped.", fname);
                        continue;
                    }
                }

                for (Object val : srcFieldValues) {
                    for (Pair<String, String> entity : extractTypedNamedEntities(val)) {
                        SolrInputField destField = null;
                        String entityName = entity.first();
                        String entityType = entity.second();
                        resolvedDest = resolvedDest.replace(ENTITY_TYPE, entityType);
                        if (doc.containsKey(resolvedDest)) {
                            destField = doc.getField(resolvedDest);
                        } else {
                            SolrInputField targetField = destMap.get(resolvedDest);
                            if (targetField == null) {
                                destField = new SolrInputField(resolvedDest);
                            } else {
                                destField = targetField;
                            }
                        }
                        destField.addValue(entityName);
                        // put it in map to avoid concurrent modification...
                        destMap.put(resolvedDest, destField);
                    }
                }
            }

            for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
                doc.put(entry.getKey(), entry.getValue());
            }

            super.processAdd(cmd);
        }

        /** Using configured NER model, extracts (name, type) pairs from the given source field value */
        private List<Pair<String, String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
            List<Pair<String, String>> entitiesWithType = new ArrayList<>();
            List<String> terms = new ArrayList<>();
            List<Integer> startOffsets = new ArrayList<>();
            List<Integer> endOffsets = new ArrayList<>();
            String fullText = srcFieldValue.toString();
            TokenStream tokenStream = analyzer.tokenStream("", fullText);
            CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
            FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
            tokenStream.reset();
            synchronized (nerTaggerOp) {
                while (tokenStream.incrementToken()) {
                    terms.add(termAtt.toString());
                    startOffsets.add(offsetAtt.startOffset());
                    endOffsets.add(offsetAtt.endOffset());
                    boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
                    if (endOfSentence) { // extract named entities one sentence at a time
                        extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                    }
                }
                tokenStream.end();
                tokenStream.close();
                if (!terms.isEmpty()) {
                    // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
                    extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
                }
                nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
            }
            return entitiesWithType;
        }

        private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
                List<Integer> endOffsets, List<Pair<String, String>> entitiesWithType) {
            for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
                String text = fullText.substring(startOffsets.get(span.getStart()),
                        endOffsets.get(span.getEnd() - 1));
                entitiesWithType.add(new Pair<>(text, span.getType()));
            }
            terms.clear();
            startOffsets.clear();
            endOffsets.clear();
        }
    };
}
From source file: org.apache.tika.eval.AnalyzerManagerTest.java
License: Apache License
@Test
public void testGeneral() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer general = analyzerManager.getGeneralAnalyzer();
    TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("the"));
    assertTrue(seen.contains("and"));
    assertTrue(seen.contains("dog"));
}
From source file: org.apache.tika.eval.AnalyzerManagerTest.java
License: Apache License
@Test
public void testCommon() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
    Analyzer common = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) {
            fail("Shouldn't have found a numeric");
        }
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();

    assertTrue(seen.contains("dirty"));
    assertFalse(seen.contains("the"));
}
From source file: org.apache.tika.eval.tokens.TokenCounter.java
License: Apache License
private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;

    TokenStream ts = analyzer.tokenStream(field, content);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, MutableInt> tokenMap = map.get(field);
    if (tokenMap == null) {
        tokenMap = new HashMap<>();
        map.put(field, tokenMap);
    }
    while (ts.incrementToken()) {
        String token = termAtt.toString();
        MutableInt cnt = tokenMap.get(token);
        if (cnt == null) {
            cnt = new MutableInt(1);
            tokenMap.put(token, cnt);
        } else {
            cnt.increment();
        }
        totalTokens++;
    }
    ts.close();
    ts.end();

    int totalUniqueTokens = tokenMap.size();

    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;

    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    SummaryStatistics summaryStatistics = new SummaryStatistics();
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();
        p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < e.getValue().intValue(); i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }
    }
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }

    /*
    Collections.sort(allTokens);
    List<TokenIntPair> topNList = new ArrayList<>(topN);
    for (int i = 0; i < topN && i < allTokens.size(); i++) {
        topNList.add(allTokens.get(i));
    }
    */

    tokenStatistics.put(field,
            new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));
}
From source file: org.apache.tika.eval.tokens.TokenCounterTest.java
License: Apache License
@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog ";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        Integer count = tokens.get(t);
        count = (count == null) ? count = 0 : count;
        count++;
        tokens.put(t, count);
    }
    ts.end();
    ts.close();
    assertEquals(7, tokens.size());
    assertEquals(new Integer(1), tokens.get(""));
}
From source file: org.betaconceptframework.astroboa.model.impl.query.xpath.XPathUtils.java
License: Open Source License
private static String analyzeTextToFind(String textToFind) throws IOException {
    // Filter textToFind through GreekAnalyzer
    TokenStream stream = greekAnalyzer.tokenStream("", new StringReader(textToFind));
    stream.reset();

    StringBuilder analyzedTextTofind = new StringBuilder();

    try {
        while (stream.incrementToken()) {
            String term = stream.getAttribute(TermAttribute.class).term();
            analyzedTextTofind.append(term);
            analyzedTextTofind.append(" ");
        }
    } catch (IOException e) {
        e.printStackTrace();
        analyzedTextTofind.append(textToFind);
    } finally {
        stream.end();
        stream.close();
    }

    String result = analyzedTextTofind.toString().trim();

    if (StringUtils.isBlank(result))
        return textToFind;

    return result;
}
From source file: org.chombo.util.BasicUtils.java
License: Apache License
/**
 * Analyzes text and return analyzed text
 * @param text
 * @return
 * @throws IOException
 */
public static String analyze(String text, Analyzer analyzer) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    StringBuilder stBld = new StringBuilder();

    stream.reset();
    CharTermAttribute termAttribute = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        String token = termAttribute.toString();
        stBld.append(token).append(" ");
    }
    stream.end();
    stream.close();
    return stBld.toString();
}
From source file: org.cosmo.common.util.WordUtil.java
License: Apache License
public static void main(String[] args) throws Exception {
    StringReader reader = new StringReader(
            "CNN, CNN news, CNN.com, CNN TV, news, news online, breaking news, U.S. news, world news, weather, business, CNN Money, sports, politics, law, technology, entertainment, education, travel, health, special reports, autos, developing story, news video, CNN Intl");

    /*
    LetterTokenizer tokenizer = new LetterTokenizer(reader);
    AttributeSource filter = new StopFilter(true, tokenizer, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    while (filter.hasAttributes()) {
        Attribute attribute = filter.captureState().
        System.out.println(attribute);
    }
    */

    StopAnalyzer analyzer = new StopAnalyzer(Index.Version);
    Set<String> uniqueTerms = new HashSet();
    TokenStream tokenStream = analyzer.reusableTokenStream("anyting", reader);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        TermAttribute term = tokenStream.getAttribute(TermAttribute.class);
        uniqueTerms.add(term.term());
    }
    tokenStream.end();
    tokenStream.close();

    System.out.println(Arrays.toString(uniqueTerms.toArray()));
}
From source file: org.dbpedia.spotlight.lucene.analysis.NGramAnalyzer.java
License: Apache License
public static void main(String[] args) throws IOException {
    String myString = "cancer";
    Analyzer analyzer = new NGramAnalyzer(3, 3);
    System.out.println("Analyzing: \"" + myString + "\"");
    StringReader reader = new StringReader(myString);
    TokenStream stream = analyzer.tokenStream("field", reader);
    // TokenStream stream = new NGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK, 1, 2);
    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println("token: " + stream);
    }
    stream.end();
    stream.close();
}
From source file: org.dbpedia.spotlight.lucene.analysis.PhoneticAnalyzer.java
License: Apache License
public static void main(String[] args) throws IOException {
    String myString = "cancer";
    Analyzer analyzer = new PhoneticAnalyzer(Version.LUCENE_36, SpotlightConfiguration.DEFAULT_STOPWORDS);
    System.out.println("Analyzing: \"" + myString + "\"");
    StringReader reader = new StringReader(myString);
    TokenStream stream = analyzer.tokenStream("field", reader);
    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println("token: " + stream);
    }
    stream.end();
    stream.close();
}