List of usage examples for org.apache.lucene.analysis.TokenStream#incrementToken
public abstract boolean incrementToken() throws IOException;
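All of the examples below follow the same consumer contract: reset() before the first call, incrementToken() in a loop until it returns false, then end() and close(). As a baseline, here is a minimal self-contained sketch against the Lucene 5.x+ API; the StandardAnalyzer and the "body" field name are placeholder choices, not taken from any example below.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {
    public static List<String> tokens(String text) throws IOException {
        List<String> result = new ArrayList<>();
        Analyzer analyzer = new StandardAnalyzer();
        // the field name is arbitrary here; analyzers may vary behavior per field
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader(text))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // returns false once the stream is exhausted
                result.add(termAtt.toString());
            }
            ts.end();                     // records end-of-stream offset state
        }                                 // try-with-resources closes the stream
        analyzer.close();
        return result;
    }
}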
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex.java
License:Apache License
/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?').
 *
 * @param text the raw query text
 * @param analyzer the analyzer used to split the text
 * @return the tokens, with wildcard characters re-attached, or null on failure
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            // any wildcard characters the analyzer dropped between the previous
            // token and this one are glued onto the current token
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        // log the exception itself (with stack trace), not just its message
        LOG.error("Building fulltext query failed", e);
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return tokens;
}
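For intuition on what the merge loop achieves, here is a hypothetical call, not taken from the Oak sources, assuming an analyzer that splits on and drops wildcard characters and that fulltextTokens contains '*' and '?':

// the analyzer turns "hello wor*" into the terms [hello] [wor], dropping '*';
// the offset gap tells tokenize() where the wildcard sat, so it is re-attached
List<String> t = tokenize("hello wor*", analyzer); // => ["hello", "wor*"]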
From source file:org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    StringBuilder contents = new StringBuilder();
    String document = value.toString();
    String catMatch = findMatchingCategory(document);
    if (!"Unknown".equals(catMatch)) {
        // strip the surrounding <text ...>...</text> markup, then unescape HTML entities
        document = StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN
                .matcher(WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst(""))
                .replaceAll(""));
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
            contents.append(termAtt.termBuffer(), 0, termAtt.termLength()).append(' ');
        }
        context.write(
                new Text(WikipediaDatasetCreatorMapper.SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
    }
}
From source file:org.apache.mahout.classifier.BayesFileFormatter.java
License:Apache License
/**
 * Write the tokens and the label from the Reader to the writer
 *
 * @param label
 *          The label
 * @param analyzer
 *          The analyzer to use
 * @param inFile
 *          the file to read and whose contents are passed to the analyzer
 * @param charset
 *          character encoding to assume when reading the input file
 * @param writer
 *          The Writer; it is not closed by this method
 * @throws java.io.IOException
 *           if there was a problem with the reader
 */
private static void writeFile(String label, Analyzer analyzer, File inFile, Charset charset, Writer writer)
        throws IOException {
    Reader reader = new InputStreamReader(new FileInputStream(inFile), charset);
    try {
        TokenStream ts = analyzer.tokenStream(label, reader);
        writer.write(label);
        writer.write('\t'); // tab separator, to match Hadoop's standard TextInputFormat
        TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            char[] termBuffer = termAtt.termBuffer();
            int termLen = termAtt.termLength();
            writer.write(termBuffer, 0, termLen);
            writer.write(' ');
        }
    } finally {
        IOUtils.quietClose(reader);
    }
}
From source file:org.apache.mahout.classifier.BayesFileFormatter.java
License:Apache License
/**
 * Convert a Reader to a vector
 *
 * @param analyzer
 *          The Analyzer to use
 * @param reader
 *          The reader to feed to the Analyzer
 * @return An array of unique tokens
 */
public static String[] readerToDocument(Analyzer analyzer, Reader reader) throws IOException {
    TokenStream ts = analyzer.tokenStream("", reader);
    List<String> coll = new ArrayList<String>();
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        char[] termBuffer = termAtt.termBuffer();
        int termLen = termAtt.termLength();
        coll.add(new String(termBuffer, 0, termLen));
    }
    return coll.toArray(new String[coll.size()]);
}
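The three Mahout examples above target Lucene 2.9/3.0: TermAttribute was superseded by CharTermAttribute (around Lucene 3.1) and removed in 4.0, and reset() was not yet enforced before consumption. As a sketch of the same loop ported to the modern attribute API, a drop-in for the body of readerToDocument might look like this:

// modern replacement for the TermAttribute loop above (Lucene 4.x+)
TokenStream ts = analyzer.tokenStream("", reader);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();                           // now mandatory before consuming
while (ts.incrementToken()) {
    coll.add(termAtt.toString());     // copies buffer()[0..length()) into a String
}
ts.end();
ts.close();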
From source file:org.apache.mahout.classifier.NewsgroupHelper.java
License:Apache License
public static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        words.add(termAtt.toString());
    }
    overallCounts.addAll(words);
    ts.end();
    Closeables.close(ts, true); // Guava: close, swallowing any IOException
}
From source file:org.apache.mahout.classifier.sgd.NewsgroupHelper.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.reusableTokenStream("text", in); // Lucene 3.x API, see note below
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        words.add(termAtt.toString());
    }
    overallCounts.addAll(words);
}
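Note: Analyzer.reusableTokenStream was the Lucene 3.x mechanism for recycling a per-thread stream; it was removed in Lucene 4, where tokenStream() performs the reuse itself. Under Lucene 4+ the first line would simply become the following (an assumed port, not a variant found in Mahout):

TokenStream ts = analyzer.tokenStream("text", in); // reuses components internally in Lucene 4+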
From source file:org.apache.mahout.classifier.sgd.TrainNewsGroups.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // not in the original Mahout excerpt; Lucene 4+ requires it before incrementToken()
    while (ts.incrementToken()) {
        words.add(termAtt.toString());
    }
    overallCounts.addAll(words);
}
From source file:org.apache.mahout.text.MailArchivesClusteringAnalyzerTest.java
License:Apache License
@Test
public void testAnalysis() throws Exception {
    Analyzer analyzer = new MailArchivesClusteringAnalyzer();
    String text = "A test message\n"
            + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
            + "Mahout is a scalable, machine-learning LIBRARY\n"
            + "we've added some additional stopwords such as html, mailto, regards\t"
            + "apache_hadoop provides the foundation for scalability\n"
            + "www.nabble.com general-help@incubator.apache.org\n"
            + "public void int protected package";
    Reader reader = new StringReader(text);

    // if you change the text above, then you may need to change this as well;
    // order matters too
    String[] expectedTokens = { "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
            "stopword", "apache_hadoop", "provid", "foundat", "scalabl" };

    TokenStream tokenStream = analyzer.tokenStream("test", reader);
    assertNotNull(tokenStream);
    tokenStream.reset();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    int e = 0;
    while (tokenStream.incrementToken() && e < expectedTokens.length) {
        assertEquals(expectedTokens[e++], termAtt.toString());
    }
    assertEquals(expectedTokens.length, e); // expected value first, per JUnit convention
    tokenStream.end();
    tokenStream.close();
}
From source file:org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String document = value.toString();
    document = StringEscapeUtils.unescapeHtml4(CLOSE_TEXT_TAG_PATTERN
            .matcher(OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
    String catMatch = findMatchingCategory(document);
    if (!"Unknown".equals(catMatch)) {
        StringBuilder contents = new StringBuilder(1000);
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
        }
        context.write(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
        stream.end();
        Closeables.close(stream, true);
    }
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java
License:Apache License
private static void validateTokens(String[] expected, TokenStream ts) throws IOException {
    int pos = 0;
    while (ts.incrementToken()) {
        // strict bound: pos == expected.length would overrun the array below
        assertTrue("Analyzer produced too many tokens", pos < expected.length);
        CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
        assertEquals("Unexpected term", expected[pos++], termAttr.toString());
    }
    assertEquals("Analyzer produced too few terms", expected.length, pos);
}