Example usage for org.apache.lucene.analysis TokenStream close

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream#close().

Prototype

@Override
public void close() throws IOException 

Document

Releases resources associated with this stream.
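
close() is the last step of the normal TokenStream consumption workflow: obtain the stream from an Analyzer, reset() it, consume tokens with incrementToken(), call end(), then close(). Below is a minimal sketch against a recent Lucene version (5.x or later), where TokenStream implements Closeable and can be used with try-with-resources; the StandardAnalyzer and the "field" name are placeholders, not taken from the examples that follow.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseExample {
    // Collects the terms produced by the analyzer for the given text.
    static List<String> terms(Analyzer analyzer, String text) throws IOException {
        List<String> terms = new ArrayList<>();
        // try-with-resources calls close() even if incrementToken() throws
        try (TokenStream ts = analyzer.tokenStream("field", text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                      // required before the first incrementToken()
            while (ts.incrementToken()) {
                terms.add(term.toString());
            }
            ts.end();                        // records the final offset state
        }                                    // close() releases resources held by the stream
        return terms;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(terms(new StandardAnalyzer(), "Releases resources associated with this stream"));
    }
}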

Usage

From source file:org.apache.jackrabbit.core.query.lucene.MoreLikeThis.java

License:Apache License

/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    int tokenCount = 0;
    // for every token
    while (ts.incrementToken()) {
        TermAttribute term = ts.getAttribute(TermAttribute.class);
        String word = term.term();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }

        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}
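
In the example above, end() and close() are only reached when the loop completes normally. A minimal sketch of the same frequency-counting loop with close() made exception-safe by a finally block, using the same Lucene 3.x-era API; this is not the Jackrabbit implementation, and a plain Map<String, Integer> replaces its internal Int helper.

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

final class TermFrequenciesSketch {
    // Counts how often each term produced by the analyzer occurs in the text.
    static Map<String, Integer> termFrequencies(Analyzer analyzer, String fieldName, Reader r)
            throws IOException {
        Map<String, Integer> freq = new HashMap<String, Integer>();
        TokenStream ts = analyzer.tokenStream(fieldName, r);
        try {
            TermAttribute term = ts.addAttribute(TermAttribute.class);
            while (ts.incrementToken()) {
                String word = term.term();
                Integer cnt = freq.get(word);
                freq.put(word, cnt == null ? 1 : cnt + 1);
            }
            ts.end();
        } finally {
            ts.close(); // releases the stream even if incrementToken() throws
        }
        return freq;
    }
}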

From source file:org.apache.jackrabbit.core.query.lucene.SearchIndex.java

License:Apache License

/**
 * Merges the fulltext indexed fields of the aggregated node states into
 * <code>doc</code>.
 *
 * @param state the node state on which <code>doc</code> was created.
 * @param doc the lucene document with index fields from <code>state</code>.
 * @param ifv the current index format version.
 */
protected void mergeAggregatedNodeIndexes(NodeState state, Document doc, IndexFormatVersion ifv) {
    if (indexingConfig != null) {
        AggregateRule[] aggregateRules = indexingConfig.getAggregateRules();
        if (aggregateRules == null) {
            return;
        }
        try {
            ItemStateManager ism = getContext().getItemStateManager();
            for (AggregateRule aggregateRule : aggregateRules) {
                boolean ruleMatched = false;
                // node includes
                NodeState[] aggregates = aggregateRule.getAggregatedNodeStates(state);
                if (aggregates != null) {
                    ruleMatched = true;
                    for (NodeState aggregate : aggregates) {
                        Document aDoc = createDocument(aggregate, getNamespaceMappings(), ifv);
                        // transfer fields to doc if there are any
                        Fieldable[] fulltextFields = aDoc.getFieldables(FieldNames.FULLTEXT);
                        if (fulltextFields != null) {
                            for (Fieldable fulltextField : fulltextFields) {
                                doc.add(fulltextField);
                            }
                            doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                    aggregate.getNodeId().toString(), Field.Store.NO,
                                    Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                        }
                    }
                    // make sure that fulltext fields are aligned properly
                    // first all stored fields, then remaining
                    Fieldable[] fulltextFields = doc.getFieldables(FieldNames.FULLTEXT);
                    doc.removeFields(FieldNames.FULLTEXT);
                    Arrays.sort(fulltextFields, FIELDS_COMPARATOR_STORED);
                    for (Fieldable f : fulltextFields) {
                        doc.add(f);
                    }
                }
                // property includes
                PropertyState[] propStates = aggregateRule.getAggregatedPropertyStates(state);
                if (propStates != null) {
                    ruleMatched = true;
                    for (PropertyState propState : propStates) {
                        String namePrefix = FieldNames.createNamedValue(
                                getNamespaceMappings().translateName(propState.getName()), "");
                        NodeState parent = (NodeState) ism.getItemState(propState.getParentId());
                        Document aDoc = createDocument(parent, getNamespaceMappings(), ifv);
                        try {
                            // find the right fields to transfer
                            Fieldable[] fields = aDoc.getFieldables(FieldNames.PROPERTIES);
                            for (Fieldable field : fields) {

                                // assume properties fields use SingleTokenStream
                                TokenStream tokenStream = field.tokenStreamValue();
                                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                                PayloadAttribute payloadAttribute = tokenStream
                                        .addAttribute(PayloadAttribute.class);
                                tokenStream.incrementToken();
                                tokenStream.end();
                                tokenStream.close();

                                String value = new String(termAttribute.termBuffer(), 0,
                                        termAttribute.termLength());
                                if (value.startsWith(namePrefix)) {
                                    // extract value
                                    String rawValue = value.substring(namePrefix.length());
                                    // create new named value
                                    Path p = getRelativePath(state, propState);
                                    String path = getNamespaceMappings().translatePath(p);
                                    value = FieldNames.createNamedValue(path, rawValue);
                                    termAttribute.setTermBuffer(value);
                                    PropertyMetaData pdm = PropertyMetaData
                                            .fromByteArray(payloadAttribute.getPayload().getData());
                                    doc.add(new Field(field.name(),
                                            new SingletonTokenStream(value, pdm.getPropertyType())));
                                    doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                            parent.getNodeId().toString(), Field.Store.NO,
                                            Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                                    if (pdm.getPropertyType() == PropertyType.STRING) {
                                        // add to fulltext index
                                        Field ft = new Field(FieldNames.FULLTEXT, false, rawValue,
                                                Field.Store.YES, Field.Index.ANALYZED_NO_NORMS,
                                                Field.TermVector.NO);
                                        doc.add(ft);
                                    }
                                }
                            }
                        } finally {
                            Util.disposeDocument(aDoc);
                        }
                    }
                }

                // only use first aggregate definition that matches
                if (ruleMatched) {
                    break;
                }
            }
        } catch (NoSuchItemStateException e) {
            // do not fail if aggregate cannot be created
            log.info("Exception while building indexing aggregate for {}. Node is not available {}.",
                    state.getNodeId(), e.getMessage());
        } catch (Exception e) {
            // do not fail if aggregate cannot be created
            log.warn("Exception while building indexing aggregate for " + state.getNodeId(), e);
        }
    }
}

From source file:org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex.java

License:Apache License

/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?').
 *
 * @param text the text to tokenize
 * @param analyzer the analyzer used to tokenize the text
 * @return the list of tokens, with wildcard characters merged back into
 *         the neighbouring terms
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        stream.reset();

        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        LOG.error("Building fulltext query failed", e);
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return tokens;
}
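
A hypothetical call, to illustrate the merging behaviour; the analyzer and the exact contents of fulltextTokens are assumptions, with '*' and '?' as the relevant wildcard characters.

// Hypothetical call (from within the same package; the analyzer is whatever the index uses).
// A standard-style analyzer splits "jack*bbit doc?" into "jack", "bbit" and "doc";
// tokenize() merges the wildcard characters back into the neighbouring terms,
// so the expected result here is ["jack*bbit", "doc?"].
List<String> parts = tokenize("jack*bbit doc?", analyzer);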

From source file:org.apache.mahout.text.MailArchivesClusteringAnalyzerTest.java

License:Apache License

@Test
public void testAnalysis() throws Exception {
    Analyzer analyzer = new MailArchivesClusteringAnalyzer();

    String text = "A test message\n" + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
            + "Mahout is a scalable, machine-learning LIBRARY\n"
            + "we've added some additional stopwords such as html, mailto, regards\t"
            + "apache_hadoop provides the foundation for scalability\n"
            + "www.nabble.com general-help@incubator.apache.org\n" + "public void int protected package";
    Reader reader = new StringReader(text);

    // if you change the text above, then you may need to change this as well
    // order matters too
    String[] expectedTokens = { "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
            "stopword", "apache_hadoop", "provid", "foundat", "scalabl" };

    TokenStream tokenStream = analyzer.tokenStream("test", reader);
    assertNotNull(tokenStream);
    tokenStream.reset();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    int e = 0;
    while (tokenStream.incrementToken() && e < expectedTokens.length) {
        assertEquals(expectedTokens[e++], termAtt.toString());
    }
    assertEquals(expectedTokens.length, e);
    tokenStream.end();
    tokenStream.close();
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** normal case, unfiltered analyzer */
@Test
public void testAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    validateTokens(allTokens, ts);
    ts.end();
    ts.close();
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** filtered analyzer */
@Test
public void testNonKeepdAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
    validateTokens(expectedNonKeepTokens, f);
    ts.end();
    ts.close();
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** keep analyzer */
@Test
public void testKeepAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
    validateTokens(expectedKeepTokens, f);
    ts.end();
    ts.close();
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** shingles, keep those matching whitelist */
@Test
public void testShingleFilteredAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    ShingleFilter sf = new ShingleFilter(ts, 3);
    TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
    validateTokens(expectedShingleTokens, f);
    ts.end();
    ts.close();
}

From source file:org.apache.maven.index.DefaultQueryCreator.java

License:Apache License

protected int countTerms(final IndexerField indexerField, final String query) {
    try {
        TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query));
        ts.reset();

        int result = 0;

        while (ts.incrementToken()) {
            result++;
        }

        ts.end();
        ts.close();

        return result;
    } catch (IOException e) {
        // will not happen
        return 1;
    }
}

From source file:org.apache.nutch.summary.basic.BasicSummarizer.java

License:Apache License

private Token[] getTokens(String text) {
    ArrayList<Token> result = new ArrayList<Token>();
    TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
    Token token = null;
    while (result.size() < token_deep) {
        try {
            token = ts.next();
        } catch (IOException e) {
            token = null;
        }
        if (token == null) {
            break;
        }
        result.add(token);
    }
    try {
        ts.close();
    } catch (IOException e) {
        // ignore
    }
    return result.toArray(new Token[result.size()]);
}
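
This summarizer uses the pre-3.0 Token-based API (TokenStream.next()). With the attribute-based API that replaced it, roughly the same loop would look like the following sketch; it is not part of the Nutch source, and the "content" field name and the token cap simply mirror the example above.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Sketch of the same token-collection loop using the attribute-based API (Lucene 3.1+).
final class AttributeApiSummarizerSketch {
    static List<String> getTerms(Analyzer analyzer, String text, int tokenDeep) throws IOException {
        List<String> result = new ArrayList<String>();
        TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
        try {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (result.size() < tokenDeep && ts.incrementToken()) {
                result.add(term.toString());
            }
            ts.end();
        } finally {
            ts.close(); // release the stream even if tokenization fails part-way
        }
        return result;
    }
}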