Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage of org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
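
A typical consumer loop, following the modern (Lucene 4.x+) contract, looks like the minimal sketch below. The field name "body" and the analyzer are placeholders, not taken from any of the examples that follow; several of the examples below were written against older Lucene releases (e.g. TermAttribute, Analyzer.reusableTokenStream) and therefore omit some of these calls.

TokenStream stream = analyzer.tokenStream("body", new StringReader(text));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
try {
    stream.reset();                    // must be called before the first incrementToken()
    while (stream.incrementToken()) {  // returns false once the stream is exhausted
        String term = termAtt.toString();
        // consume the term here
    }
    stream.end();                      // records the final offset state
} finally {
    stream.close();                    // releases resources held by the stream
}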

Usage

From source file:org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex.java

License:Apache License

/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?').
 *
 * @param text the text to tokenize
 * @param analyzer the analyzer used to split the text into tokens
 * @return the list of tokens, with wildcard-split fragments merged back together
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        stream.reset();

        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        LOG.error("Building fulltext query failed", e.getMessage());
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return tokens;
}
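
As a hypothetical illustration of the merging behaviour described in the Javadoc above (assuming the analyzer splits "wor*ld" into the terms "wor" and "ld" at their original offsets, as a typical fulltext analyzer would):

List<String> tokens = tokenize("hello wor*ld", analyzer);
// the '*' lying between the offsets of "wor" and "ld" is detected as a fulltext
// wildcard, so the two fragments are glued back together:
// tokens == ["hello", "wor*ld"]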

From source file:org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorMapper.java

License:Apache License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    StringBuilder contents = new StringBuilder();
    String document = value.toString();
    String catMatch = findMatchingCategory(document);

    if (!"Unknown".equals(catMatch)) {
        document = StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN
                .matcher(WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst(""))
                .replaceAll(""));
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
            contents.append(termAtt.termBuffer(), 0, termAtt.termLength()).append(' ');
        }
        context.write(new Text(
                WikipediaDatasetCreatorMapper.SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
    }
}

From source file:org.apache.mahout.classifier.BayesFileFormatter.java

License:Apache License

/**
 * Write the tokens and the label from the Reader to the writer
 *
 * @param label
 *          The label
 * @param analyzer
 *          The analyzer to use
 * @param inFile
 *          the file to read and whose contents are passed to the analyzer
 * @param charset
 *          character encoding to assume when reading the input file
 * @param writer
 *          The Writer, is not closed by this method
 * @throws java.io.IOException
 *           if there was a problem with the reader
 */
private static void writeFile(String label, Analyzer analyzer, File inFile, Charset charset, Writer writer)
        throws IOException {
    Reader reader = new InputStreamReader(new FileInputStream(inFile), charset);
    try {
        TokenStream ts = analyzer.tokenStream(label, reader);
        writer.write(label);
        writer.write('\t'); // in order to match Hadoop's standard TextInputFormat
        TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            char[] termBuffer = termAtt.termBuffer();
            int termLen = termAtt.termLength();
            writer.write(termBuffer, 0, termLen);
            writer.write(' ');
        }
    } finally {
        IOUtils.quietClose(reader);
    }
}

From source file:org.apache.mahout.classifier.BayesFileFormatter.java

License:Apache License

/**
 * Convert a Reader to a vector.
 * 
 * @param analyzer
 *          The Analyzer to use
 * @param reader
 *          The reader to feed to the Analyzer
 * @return An array of tokens
 */
public static String[] readerToDocument(Analyzer analyzer, Reader reader) throws IOException {
    TokenStream ts = analyzer.tokenStream("", reader);

    List<String> coll = new ArrayList<String>();
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        char[] termBuffer = termAtt.termBuffer();
        int termLen = termAtt.termLength();
        String val = new String(termBuffer, 0, termLen);
        coll.add(val);
    }
    return coll.toArray(new String[coll.size()]);
}

From source file:org.apache.mahout.classifier.NewsgroupHelper.java

License:Apache License

public static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
    ts.end();
    Closeables.close(ts, true);
}

From source file:org.apache.mahout.classifier.sgd.NewsgroupHelper.java

License:Apache License

private static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.reusableTokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
}

From source file:org.apache.mahout.classifier.sgd.TrainNewsGroups.java

License:Apache License

private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
}

From source file:org.apache.mahout.text.MailArchivesClusteringAnalyzerTest.java

License:Apache License

@Test
public void testAnalysis() throws Exception {
    Analyzer analyzer = new MailArchivesClusteringAnalyzer();

    String text = "A test message\n" + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
            + "Mahout is a scalable, machine-learning LIBRARY\n"
            + "we've added some additional stopwords such as html, mailto, regards\t"
            + "apache_hadoop provides the foundation for scalability\n"
            + "www.nabble.com general-help@incubator.apache.org\n" + "public void int protected package";
    Reader reader = new StringReader(text);

    // if you change the text above, then you may need to change this as well
    // order matters too
    String[] expectedTokens = { "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
            "stopword", "apache_hadoop", "provid", "foundat", "scalabl" };

    TokenStream tokenStream = analyzer.tokenStream("test", reader);
    assertNotNull(tokenStream);
    tokenStream.reset();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    int e = 0;
    while (tokenStream.incrementToken() && e < expectedTokens.length) {
        assertEquals(expectedTokens[e++], termAtt.toString());
    }
    assertEquals(e, expectedTokens.length);
    tokenStream.end();
    tokenStream.close();
}

From source file:org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorMapper.java

License:Apache License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String document = value.toString();
    document = StringEscapeUtils.unescapeHtml4(CLOSE_TEXT_TAG_PATTERN
            .matcher(OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
    String catMatch = findMatchingCategory(document);
    if (!"Unknown".equals(catMatch)) {
        StringBuilder contents = new StringBuilder(1000);
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
        }
        context.write(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
        stream.end();
        Closeables.close(stream, true);
    }
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

private static void validateTokens(String[] expected, TokenStream ts) throws IOException {
    int pos = 0;
    while (ts.incrementToken()) {
        assertTrue("Analyzer produced too many tokens", pos <= expected.length);
        CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
        assertEquals("Unexpected term", expected[pos++], termAttr.toString());
    }
    assertEquals("Analyzer produced too few terms", expected.length, pos);
}