List of usage examples for org.apache.lucene.analysis TokenStream close
@Override public void close() throws IOException
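Most of the snippets below follow the same consume-then-close lifecycle. As a point of reference, here is a minimal sketch of that lifecycle (an illustration only, assuming Lucene 4.6 and a WhitespaceAnalyzer, as in the Mahout tests further down):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class TokenStreamLifecycle {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
        TokenStream ts = analyzer.tokenStream("content", new StringReader("close token streams when done"));
        try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                      // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                        // records end-of-stream state (e.g. final offset)
        } finally {
            ts.close();                      // releases resources held by the stream
        }
    }
}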
From source file:org.apache.jackrabbit.core.query.lucene.MoreLikeThis.java
License:Apache License
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    int tokenCount = 0;
    // for every token
    while (ts.incrementToken()) {
        TermAttribute term = ts.getAttribute(TermAttribute.class);
        String word = term.term();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }
        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}
From source file:org.apache.jackrabbit.core.query.lucene.SearchIndex.java
License:Apache License
/**
 * Merges the fulltext indexed fields of the aggregated node states into
 * <code>doc</code>.
 *
 * @param state the node state on which <code>doc</code> was created.
 * @param doc   the lucene document with index fields from <code>state</code>.
 * @param ifv   the current index format version.
 */
protected void mergeAggregatedNodeIndexes(NodeState state, Document doc, IndexFormatVersion ifv) {
    if (indexingConfig != null) {
        AggregateRule[] aggregateRules = indexingConfig.getAggregateRules();
        if (aggregateRules == null) {
            return;
        }
        try {
            ItemStateManager ism = getContext().getItemStateManager();
            for (AggregateRule aggregateRule : aggregateRules) {
                boolean ruleMatched = false;
                // node includes
                NodeState[] aggregates = aggregateRule.getAggregatedNodeStates(state);
                if (aggregates != null) {
                    ruleMatched = true;
                    for (NodeState aggregate : aggregates) {
                        Document aDoc = createDocument(aggregate, getNamespaceMappings(), ifv);
                        // transfer fields to doc if there are any
                        Fieldable[] fulltextFields = aDoc.getFieldables(FieldNames.FULLTEXT);
                        if (fulltextFields != null) {
                            for (Fieldable fulltextField : fulltextFields) {
                                doc.add(fulltextField);
                            }
                            doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                    aggregate.getNodeId().toString(), Field.Store.NO,
                                    Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                        }
                    }
                    // make sure that fulltext fields are aligned properly
                    // first all stored fields, then remaining
                    Fieldable[] fulltextFields = doc.getFieldables(FieldNames.FULLTEXT);
                    doc.removeFields(FieldNames.FULLTEXT);
                    Arrays.sort(fulltextFields, FIELDS_COMPARATOR_STORED);
                    for (Fieldable f : fulltextFields) {
                        doc.add(f);
                    }
                }
                // property includes
                PropertyState[] propStates = aggregateRule.getAggregatedPropertyStates(state);
                if (propStates != null) {
                    ruleMatched = true;
                    for (PropertyState propState : propStates) {
                        String namePrefix = FieldNames.createNamedValue(
                                getNamespaceMappings().translateName(propState.getName()), "");
                        NodeState parent = (NodeState) ism.getItemState(propState.getParentId());
                        Document aDoc = createDocument(parent, getNamespaceMappings(), ifv);
                        try {
                            // find the right fields to transfer
                            Fieldable[] fields = aDoc.getFieldables(FieldNames.PROPERTIES);
                            for (Fieldable field : fields) {
                                // assume properties fields use SingleTokenStream
                                TokenStream tokenStream = field.tokenStreamValue();
                                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                                PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
                                tokenStream.incrementToken();
                                tokenStream.end();
                                tokenStream.close();
                                String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
                                if (value.startsWith(namePrefix)) {
                                    // extract value
                                    String rawValue = value.substring(namePrefix.length());
                                    // create new named value
                                    Path p = getRelativePath(state, propState);
                                    String path = getNamespaceMappings().translatePath(p);
                                    value = FieldNames.createNamedValue(path, rawValue);
                                    termAttribute.setTermBuffer(value);
                                    PropertyMetaData pdm = PropertyMetaData
                                            .fromByteArray(payloadAttribute.getPayload().getData());
                                    doc.add(new Field(field.name(),
                                            new SingletonTokenStream(value, pdm.getPropertyType())));
                                    doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                            parent.getNodeId().toString(), Field.Store.NO,
                                            Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                                    if (pdm.getPropertyType() == PropertyType.STRING) {
                                        // add to fulltext index
                                        Field ft = new Field(FieldNames.FULLTEXT, false, rawValue,
                                                Field.Store.YES, Field.Index.ANALYZED_NO_NORMS,
                                                Field.TermVector.NO);
                                        doc.add(ft);
                                    }
                                }
                            }
                        } finally {
                            Util.disposeDocument(aDoc);
                        }
                    }
                }
                // only use first aggregate definition that matches
                if (ruleMatched) {
                    break;
                }
            }
        } catch (NoSuchItemStateException e) {
            // do not fail if aggregate cannot be created
            log.info("Exception while building indexing aggregate for {}. Node is not available {}.",
                    state.getNodeId(), e.getMessage());
        } catch (Exception e) {
            // do not fail if aggregate cannot be created
            log.warn("Exception while building indexing aggregate for " + state.getNodeId(), e);
        }
    }
}
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex.java
License:Apache License
/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?')
 *
 * @param text
 * @param analyzer
 * @return
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        LOG.error("Building fulltext query failed", e.getMessage());
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return tokens;
}
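As a rough illustration of the intent (hypothetical input and output, assuming the index analyzer splits the raw text at the wildcard characters):

// Hypothetical usage: the analyzer splits "foo*bar" into "foo" and "bar",
// and tokenize() re-attaches the '*' so the wildcard survives into the query.
List<String> tokens = LuceneIndex.tokenize("foo*bar baz", analyzer);
// expected (roughly): ["foo*bar", "baz"]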
From source file:org.apache.mahout.text.MailArchivesClusteringAnalyzerTest.java
License:Apache License
@Test
public void testAnalysis() throws Exception {
    Analyzer analyzer = new MailArchivesClusteringAnalyzer();
    String text = "A test message\n"
            + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
            + "Mahout is a scalable, machine-learning LIBRARY\n"
            + "we've added some additional stopwords such as html, mailto, regards\t"
            + "apache_hadoop provides the foundation for scalability\n"
            + "www.nabble.com general-help@incubator.apache.org\n"
            + "public void int protected package";
    Reader reader = new StringReader(text);

    // if you change the text above, then you may need to change this as well
    // order matters too
    String[] expectedTokens = { "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
            "stopword", "apache_hadoop", "provid", "foundat", "scalabl" };

    TokenStream tokenStream = analyzer.tokenStream("test", reader);
    assertNotNull(tokenStream);
    tokenStream.reset();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    int e = 0;
    while (tokenStream.incrementToken() && e < expectedTokens.length) {
        assertEquals(expectedTokens[e++], termAtt.toString());
    }
    assertEquals(e, expectedTokens.length);
    tokenStream.end();
    tokenStream.close();
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
/** normal case, unfiltered analyzer */
@Test
public void testAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    validateTokens(allTokens, ts);
    ts.end();
    ts.close();
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
/** filtered analyzer */
@Test
public void testNonKeepdAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
    validateTokens(expectedNonKeepTokens, f);
    ts.end();
    ts.close();
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
/** keep analyzer */
@Test
public void testKeepAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
    validateTokens(expectedKeepTokens, f);
    ts.end();
    ts.close();
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
/** shingles, keep those matching whitelist */
@Test
public void testShingleFilteredAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    ShingleFilter sf = new ShingleFilter(ts, 3);
    TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
    validateTokens(expectedShingleTokens, f);
    ts.end();
    ts.close();
}
From source file:org.apache.maven.index.DefaultQueryCreator.java
License:Apache License
protected int countTerms(final IndexerField indexerField, final String query) {
    try {
        TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query));
        ts.reset();
        int result = 0;
        while (ts.incrementToken()) {
            result++;
        }
        ts.end();
        ts.close();
        return result;
    } catch (IOException e) {
        // will not happen
        return 1;
    }
}
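One design note: ts.close() here sits inside the try block, so an IOException thrown mid-stream skips it. A slightly more defensive variant of the same counting loop (a sketch, not the Maven Indexer code) would release the stream in a finally block:

protected int countTerms(final IndexerField indexerField, final String query) {
    TokenStream ts = null;
    try {
        ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query));
        ts.reset();
        int result = 0;
        while (ts.incrementToken()) {
            result++;
        }
        ts.end();
        return result;
    } catch (IOException e) {
        // analyzing a plain query string should not fail; fall back to a single term
        return 1;
    } finally {
        if (ts != null) {
            try {
                ts.close();   // always release the stream, even after an analysis error
            } catch (IOException e) {
                // ignore close failures
            }
        }
    }
}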
From source file:org.apache.nutch.summary.basic.BasicSummarizer.java
License:Apache License
private Token[] getTokens(String text) {
    ArrayList result = new ArrayList();
    TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
    Token token = null;
    while (result.size() < token_deep) {
        try {
            token = ts.next();
        } catch (IOException e) {
            token = null;
        }
        if (token == null) {
            break;
        }
        result.add(token);
    }
    try {
        ts.close();
    } catch (IOException e) {
        // ignore
    }
    return (Token[]) result.toArray(new Token[result.size()]);
}
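The Nutch snippet above uses the token-based TokenStream.next() API from Lucene 2.x, which was removed in later releases. A rough attribute-based equivalent (a sketch only; analyzer and token_deep are taken from the surrounding class, and the result is simplified to the term strings) might look like:

private List<String> getTokenTerms(String text) {
    List<String> result = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();
        // stop at token_deep terms, mirroring the original loop's limit
        while (result.size() < token_deep && ts.incrementToken()) {
            result.add(termAtt.toString());
        }
        ts.end();
    } catch (IOException e) {
        // ignore and return whatever was collected, as the original does
    } finally {
        try {
            ts.close();
        } catch (IOException e) {
            // ignore
        }
    }
    return result;
}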