List of usage examples for org.apache.lucene.analysis.TokenStream.incrementToken()
public abstract boolean incrementToken() throws IOException;
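incrementToken() advances the stream to the next token and returns false once the stream is exhausted; attribute instances obtained from the stream are updated in place on each call, so the consume loop reads the same attribute object repeatedly. Every example below follows that loop. As a point of reference, here is a minimal self-contained sketch of the full consumer contract (reset, incrementToken, end, close), assuming Lucene 5+ where StandardAnalyzer takes no Version argument and TokenStream implements Closeable; the field name "body" and the sample text are illustrative only:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello token stream world"))) {
            // the attribute object is registered once and refilled on every incrementToken()
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                    // mandatory before the first incrementToken()
            while (ts.incrementToken()) {  // false when no tokens remain
                System.out.println(termAtt.toString());
            }
            ts.end();                      // records end-of-stream offset state
        }
    }
}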
From source file:com.chriscx.stem.Stem.java
public String evaluate(BufferedReader input) {
    if (input == null) {
        return null;
    }
    CharArraySet stopWordsSet = new CharArraySet(Version.LUCENE_46, 10000, true);
    String stopWords = "a afin ai ainsi après attendu au aujourd auquel aussi "
            + "autre autres aux auxquelles auxquels avait avant avec c car ce "
            + "ceci cela celle celles celui cependant certain certaine certaines "
            + "certains ces cet cette ceux chez ci combien comme comment "
            + "concernant contre d dans de debout dedans dehors delà depuis "
            + "derrière des désormais desquelles desquels devers devra doit "
            + "donc dont du duquel durant dès elle elles en entre environ est"
            + " et etc eu eux excepté hormis hors hélas hui il ils j je jusqu "
            + "jusque l la laquelle le lequel les lesquelles lesquels leur leurs "
            + "lorsque lui là ma mais malgré me merci mes mien mienne miennes "
            + "miens moins mon moyennant même mêmes n ne ni non nos notre nous "
            + "néanmoins nôtre nôtres on ont ou outre où par parmi partant pas "
            + "passé pendant plein plus plusieurs pour pourquoi proche près "
            + "puisque qu quand que quel quelle quelles quels qui quoi quoique"
            + " revoici revoilà s sa sans sauf se selon seront ses si sien "
            + "sienne siennes siens sinon soi soit son sont sous suivant sur "
            + "ta te tes tien tienne tiennes tiens ton tous tout toute toutes"
            + " tu un une va vers voici voilà vos votre vous vu vôtre vôtres y "
            + "à été être";
    String[] stopWordsTab = stopWords.split(" ");
    for (String word : stopWordsTab) {
        stopWordsSet.add(word);
    }
    Analyzer analyzer = new FrenchAnalyzer(Version.LUCENE_46, stopWordsSet);
    result = "";
    try {
        String line = input.readLine();
        while (line != null) {
            // strip e-mail addresses, phone numbers and other digits,
            // underscore/dash runs, stray line breaks, and most punctuation
            // (the original applied these only to the first line; they must
            // run on every line read)
            line = line.replaceAll("(\\S)+@(\\S)+\\.(\\S)+", "");
            line = line.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
            line = line.replaceAll("(_|-)+", "");
            line = line.replaceAll("(\\n|\\r|\\t)+", "");
            line = line.replaceAll("(?![\\._])\\p{P}", "");
            TokenStream stream = analyzer.tokenStream(null, line);
            stream.reset();
            while (stream.incrementToken()) {
                String wordset = stream.getAttribute(CharTermAttribute.class).toString();
                wordset = wordset.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
                result += wordset + " ";
            }
            result += "\n";
            stream.end();
            stream.close();
            line = input.readLine();
        }
        input.close();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
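Note that evaluate() only closes the stream on the success path, so an exception thrown between reset() and close() would leave the analyzer's reusable components unreleased. One possible tightening of the inner loop, sketched under the assumption of Lucene 4+ where TokenStream implements Closeable (token post-processing elided):

try (TokenStream stream = analyzer.tokenStream(null, line)) {
    stream.reset();
    while (stream.incrementToken()) {
        result += stream.getAttribute(CharTermAttribute.class).toString() + " ";
    }
    stream.end(); // close() is handled by try-with-resources, even on failure
}
result += "\n";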
From source file:com.cloudera.knittingboar.records.Test20NewsgroupsBookParsing.java
License:Apache License
/**
 * Counts words.
 *
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    System.out.println("> ----- countWords ------");
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() on recent Lucene versions
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        System.out.print(" " + s);
        words.add(s);
    }
    System.out.println("\n<");
}
From source file:com.cloudera.knittingboar.records.TwentyNewsgroupsRecordFactory.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() on recent Lucene versions
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
}
From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLR_Train20Newsgroups.java
License:Apache License
/**
 * Counts words.
 *
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() on recent Lucene versions
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
}
From source file:com.cloudera.knittingboar.utils.DatasetConverter.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() on recent Lucene versions
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
}
From source file:com.cloudera.knittingboar.utils.DatasetConverter.java
License:Apache License
public static String ReadFullFile(Analyzer analyzer, String newsgroup_name, String file) throws IOException {
    // accumulate into a StringBuilder rather than concatenating Strings in the loop
    StringBuilder out = new StringBuilder(newsgroup_name).append("\t");
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(file));
        TokenStream ts = analyzer.tokenStream("text", reader);
        ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // required before the first incrementToken() on recent Lucene versions
        // for each word in the stream, minus non-word stuff, append it to the output
        while (ts.incrementToken()) {
            String s = ts.getAttribute(CharTermAttribute.class).toString();
            out.append(s).append(" ");
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
    return out.append("\n").toString();
}
From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java
public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();
            tokenized.put(new Text(key), document);
        }
        // write the sequence file
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
From source file:com.dhamacher.sentimentanalysis4tweets.sentiment.Tokenizer.java
License:Apache License
/**
 * Retrieve the tokens in a String. Behaves like getTokens, but operates on
 * a string instead of a tweet object.
 *
 * @param text The text to tokenize.
 * @return The tokens in the text.
 */
public static LinkedList<String> getTokens(String text) throws IOException {
    LinkedList<String> tokens = new LinkedList<String>();
    TokenStream ts = new StandardTokenizer(new StringReader(text));
    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
    ts.reset(); // required before the first incrementToken()
    while (ts.incrementToken()) {
        tokens.add(termAtt.term());
    }
    return tokens;
}
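This snippet relies on TermAttribute, which was deprecated in Lucene 3.1 and removed in 4.0 in favor of CharTermAttribute. A hedged port of getTokens, assuming a late Lucene 4.x release where StandardTokenizer accepts a bare Reader and TokenStream implements Closeable:

public static LinkedList<String> getTokens(String text) throws IOException {
    LinkedList<String> tokens = new LinkedList<String>();
    try (TokenStream ts = new StandardTokenizer(new StringReader(text))) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(termAtt.toString()); // replaces TermAttribute.term()
        }
        ts.end();
    }
    return tokens;
}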
From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    String sContent = value.getText();
    if (sContent == null) {
        // no text available? skip
        context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
        return;
    }
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(sContent));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    context.write(key, document);
}
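Analyzer.reusableTokenStream was removed in Lucene 4.0; tokenStream() now handles per-thread reuse internally, and consumers are expected to call end() and close() when done. A sketch of the same loop ported under that assumption:

TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(sContent));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
StringTuple document = new StringTuple();
stream.reset();
while (stream.incrementToken()) {
    if (termAtt.length() > 0) {
        document.add(new String(termAtt.buffer(), 0, termAtt.length()));
    }
}
stream.end();
stream.close(); // releases the stream back to the analyzer's reuse strategy
context.write(key, document);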
From source file:com.digitalpebble.classification.example.TwentyNewsgroups.java
License:Apache License
private List<String> analyseField(String content) throws IOException {
    if (content == null)
        return null;
    List<String> tokens = new ArrayList<String>();
    StringReader sr = new StringReader(content);
    TokenStream ts = analyzer.tokenStream("dummyValue", sr);
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    ts.reset(); // required before the first incrementToken()
    while (ts.incrementToken()) {
        tokens.add(term.term());
    }
    return tokens;
}