List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
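Before the per-project examples, a minimal usage sketch of the method: addAttribute registers the requested attribute class with the stream (or returns the instance already registered) so its value can be read after each incrementToken() call. The sketch below assumes a Lucene 4.x StandardAnalyzer and a made-up field name and input string; it is not taken from any of the source files listed further down.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class AddAttributeSketch {
        public static void main(String[] args) throws IOException {
            // assumption: Lucene 4.x; field name "text" and the input string are hypothetical
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
            TokenStream stream = analyzer.tokenStream("text", new StringReader("The quick brown fox"));
            // addAttribute returns the stream's CharTermAttribute, creating it if necessary;
            // repeated calls with the same class return the same instance
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            try {
                stream.reset();                              // required before the first incrementToken()
                while (stream.incrementToken()) {
                    System.out.println(termAtt.toString());  // current token text
                }
                stream.end();                                // signal that the input is fully consumed
            } finally {
                stream.close();
            }
        }
    }

The reset()/incrementToken()/end()/close() sequence is the consumer contract the examples below follow (older pre-4.0 examples use TermAttribute instead of CharTermAttribute and sometimes omit reset()).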
From source file:org.apache.jackrabbit.core.query.lucene.SearchIndex.java
License:Apache License
/**
 * Merges the fulltext indexed fields of the aggregated node states into
 * <code>doc</code>.
 *
 * @param state the node state on which <code>doc</code> was created.
 * @param doc   the lucene document with index fields from <code>state</code>.
 * @param ifv   the current index format version.
 */
protected void mergeAggregatedNodeIndexes(NodeState state, Document doc, IndexFormatVersion ifv) {
    if (indexingConfig != null) {
        AggregateRule[] aggregateRules = indexingConfig.getAggregateRules();
        if (aggregateRules == null) {
            return;
        }
        try {
            ItemStateManager ism = getContext().getItemStateManager();
            for (AggregateRule aggregateRule : aggregateRules) {
                boolean ruleMatched = false;
                // node includes
                NodeState[] aggregates = aggregateRule.getAggregatedNodeStates(state);
                if (aggregates != null) {
                    ruleMatched = true;
                    for (NodeState aggregate : aggregates) {
                        Document aDoc = createDocument(aggregate, getNamespaceMappings(), ifv);
                        // transfer fields to doc if there are any
                        Fieldable[] fulltextFields = aDoc.getFieldables(FieldNames.FULLTEXT);
                        if (fulltextFields != null) {
                            for (Fieldable fulltextField : fulltextFields) {
                                doc.add(fulltextField);
                            }
                            doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                    aggregate.getNodeId().toString(), Field.Store.NO,
                                    Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                        }
                    }
                    // make sure that fulltext fields are aligned properly
                    // first all stored fields, then remaining
                    Fieldable[] fulltextFields = doc.getFieldables(FieldNames.FULLTEXT);
                    doc.removeFields(FieldNames.FULLTEXT);
                    Arrays.sort(fulltextFields, FIELDS_COMPARATOR_STORED);
                    for (Fieldable f : fulltextFields) {
                        doc.add(f);
                    }
                }
                // property includes
                PropertyState[] propStates = aggregateRule.getAggregatedPropertyStates(state);
                if (propStates != null) {
                    ruleMatched = true;
                    for (PropertyState propState : propStates) {
                        String namePrefix = FieldNames.createNamedValue(
                                getNamespaceMappings().translateName(propState.getName()), "");
                        NodeState parent = (NodeState) ism.getItemState(propState.getParentId());
                        Document aDoc = createDocument(parent, getNamespaceMappings(), ifv);
                        try {
                            // find the right fields to transfer
                            Fieldable[] fields = aDoc.getFieldables(FieldNames.PROPERTIES);
                            for (Fieldable field : fields) {
                                // assume properties fields use SingleTokenStream
                                TokenStream tokenStream = field.tokenStreamValue();
                                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                                PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
                                tokenStream.incrementToken();
                                tokenStream.end();
                                tokenStream.close();
                                String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
                                if (value.startsWith(namePrefix)) {
                                    // extract value
                                    String rawValue = value.substring(namePrefix.length());
                                    // create new named value
                                    Path p = getRelativePath(state, propState);
                                    String path = getNamespaceMappings().translatePath(p);
                                    value = FieldNames.createNamedValue(path, rawValue);
                                    termAttribute.setTermBuffer(value);
                                    PropertyMetaData pdm = PropertyMetaData
                                            .fromByteArray(payloadAttribute.getPayload().getData());
                                    doc.add(new Field(field.name(),
                                            new SingletonTokenStream(value, pdm.getPropertyType())));
                                    doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                            parent.getNodeId().toString(), Field.Store.NO,
                                            Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                                    if (pdm.getPropertyType() == PropertyType.STRING) {
                                        // add to fulltext index
                                        Field ft = new Field(FieldNames.FULLTEXT, false, rawValue,
                                                Field.Store.YES, Field.Index.ANALYZED_NO_NORMS,
                                                Field.TermVector.NO);
                                        doc.add(ft);
                                    }
                                }
                            }
                        } finally {
                            Util.disposeDocument(aDoc);
                        }
                    }
                }
                // only use first aggregate definition that matches
                if (ruleMatched) {
                    break;
                }
            }
        } catch (NoSuchItemStateException e) {
            // do not fail if aggregate cannot be created
            log.info("Exception while building indexing aggregate for {}. Node is not available {}.",
                    state.getNodeId(), e.getMessage());
        } catch (Exception e) {
            // do not fail if aggregate cannot be created
            log.warn("Exception while building indexing aggregate for " + state.getNodeId(), e);
        }
    }
}
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex.java
License:Apache License
/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?').
 *
 * @param text
 * @param analyzer
 * @return
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        LOG.error("Building fulltext query failed", e.getMessage());
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return tokens;
}
From source file:org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    StringBuilder contents = new StringBuilder();
    String document = value.toString();
    String catMatch = findMatchingCategory(document);
    if (!"Unknown".equals(catMatch)) {
        document = StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN
                .matcher(WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst(""))
                .replaceAll(""));
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
            contents.append(termAtt.termBuffer(), 0, termAtt.termLength()).append(' ');
        }
        context.write(
                new Text(WikipediaDatasetCreatorMapper.SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
    }
}
From source file:org.apache.mahout.classifier.BayesFileFormatter.java
License:Apache License
/**
 * Write the tokens and the label from the Reader to the writer
 *
 * @param label    The label
 * @param analyzer The analyzer to use
 * @param inFile   the file to read and whose contents are passed to the analyzer
 * @param charset  character encoding to assume when reading the input file
 * @param writer   The Writer, is not closed by this method
 * @throws java.io.IOException if there was a problem w/ the reader
 */
private static void writeFile(String label, Analyzer analyzer, File inFile, Charset charset, Writer writer)
        throws IOException {
    Reader reader = new InputStreamReader(new FileInputStream(inFile), charset);
    try {
        TokenStream ts = analyzer.tokenStream(label, reader);
        writer.write(label);
        writer.write('\t'); // edit: in order to match Hadoop standard TextInputFormat
        TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            char[] termBuffer = termAtt.termBuffer();
            int termLen = termAtt.termLength();
            writer.write(termBuffer, 0, termLen);
            writer.write(' ');
        }
    } finally {
        IOUtils.quietClose(reader);
    }
}
From source file:org.apache.mahout.classifier.BayesFileFormatter.java
License:Apache License
/**
 * Convert a Reader to a vector
 *
 * @param analyzer The Analyzer to use
 * @param reader   The reader to feed to the Analyzer
 * @return An array of unique tokens
 */
public static String[] readerToDocument(Analyzer analyzer, Reader reader) throws IOException {
    TokenStream ts = analyzer.tokenStream("", reader);
    List<String> coll = new ArrayList<String>();
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        char[] termBuffer = termAtt.termBuffer();
        int termLen = termAtt.termLength();
        String val = new String(termBuffer, 0, termLen);
        coll.add(val);
    }
    return coll.toArray(new String[coll.size()]);
}
From source file:org.apache.mahout.classifier.NewsgroupHelper.java
License:Apache License
public static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
    ts.end();
    Closeables.close(ts, true);
}
From source file:org.apache.mahout.classifier.sgd.NewsgroupHelper.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.reusableTokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
}
From source file:org.apache.mahout.classifier.sgd.TrainNewsGroups.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
}
From source file:org.apache.mahout.text.MailArchivesClusteringAnalyzerTest.java
License:Apache License
@Test
public void testAnalysis() throws Exception {
    Analyzer analyzer = new MailArchivesClusteringAnalyzer();
    String text = "A test message\n"
            + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
            + "Mahout is a scalable, machine-learning LIBRARY\n"
            + "we've added some additional stopwords such as html, mailto, regards\t"
            + "apache_hadoop provides the foundation for scalability\n"
            + "www.nabble.com general-help@incubator.apache.org\n"
            + "public void int protected package";
    Reader reader = new StringReader(text);

    // if you change the text above, then you may need to change this as well
    // order matters too
    String[] expectedTokens = { "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
            "stopword", "apache_hadoop", "provid", "foundat", "scalabl" };

    TokenStream tokenStream = analyzer.tokenStream("test", reader);
    assertNotNull(tokenStream);
    tokenStream.reset();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    int e = 0;
    while (tokenStream.incrementToken() && e < expectedTokens.length) {
        assertEquals(expectedTokens[e++], termAtt.toString());
    }
    assertEquals(e, expectedTokens.length);
    tokenStream.end();
    tokenStream.close();
}
From source file:org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String document = value.toString();
    document = StringEscapeUtils.unescapeHtml4(CLOSE_TEXT_TAG_PATTERN
            .matcher(OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
    String catMatch = findMatchingCategory(document);
    if (!"Unknown".equals(catMatch)) {
        StringBuilder contents = new StringBuilder(1000);
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
        }
        context.write(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
        stream.end();
        Closeables.close(stream, true);
    }
}