Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream.incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token. It returns true while another token is available and false once the end of the stream has been reached.
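
Below is a minimal consumer sketch, assuming a recent Lucene (4.10 or later, where StandardAnalyzer has a no-argument constructor); the field name "field" and the sample text are placeholders. The contract is: reset(), then incrementToken() until it returns false, then end() and close().

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream ts = analyzer.tokenStream("field", "Lucene in Action")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken() call
            while (ts.incrementToken()) { // false signals end of stream
                System.out.println(termAtt.toString());
            }
            ts.end();                     // records the final offset state
        }                                 // try-with-resources closes the stream and analyzer
    }
}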

Usage

From source file:com.chriscx.stem.Stem.java

public String evaluate(BufferedReader input) {
    if (input == null) {
        return null;
    }

    CharArraySet stopWordsSet = new CharArraySet(Version.LUCENE_46, 10000, true);
    String stopWords = "a afin ai ainsi après attendu au aujourd auquel aussi "
            + "autre autres aux auxquelles auxquels avait avant avec c car ce "
            + "ceci cela celle celles celui cependant certain certaine certaines "
            + "certains ces cet cette ceux chez ci combien comme comment "
            + "concernant contre d dans de debout dedans dehors delà depuis "
            + "derrière des désormais desquelles desquels devers devra doit "
            + "donc dont du duquel durant dès elle elles en entre environ est"
            + " et etc eu eux excepté hormis hors hélas hui il ils j je jusqu "
            + "jusque là la laquelle le lequel les lesquelles lesquels leur leurs "
            + "lorsque lui l ma mais malgré me merci mes mien mienne miennes "
            + "miens moins mon moyennant même mêmes n ne ni non nos notre nous "
            + "néanmoins nôtre nôtres on ont ou outre où par parmi partant pas "
            + "passé pendant plein plus plusieurs pour pourquoi proche près "
            + "puisque qu quand que quel quelle quelles quels qui quoi quoique"
            + " revoici revoilà s sa sans sauf se selon seront ses si sien "
            + "sienne siennes siens sinon soi soit son sont sous suivant sur "
            + "ta te tes tien tienne tiennes tiens ton tous tout toute toutes"
            + " tu un une va vers voici voilà vos votre vous vu vôtre vôtres y été être";
    String[] stopWordsTab = stopWords.split(" ");
    for (String word : stopWordsTab) {
        stopWordsSet.add(word);
    }

    Analyzer analyzer = new FrenchAnalyzer(Version.LUCENE_46, stopWordsSet);

    result = "";
    try {
        String line = input.readLine();
        while (line != null) {
            // strip e-mail addresses, phone numbers and stray digits,
            // underscores/dashes, line breaks, and most punctuation
            line = line.replaceAll("(\\S)+@(\\S)+\\.(\\S)+", "");
            line = line.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
            line = line.replaceAll("(_|-)+", "");
            line = line.replaceAll("(\\n|\\r|\\t)+", "");
            line = line.replaceAll("(?![\\._])\\p{P}", "");

            TokenStream stream = analyzer.tokenStream(null, line);
            stream.reset();
            while (stream.incrementToken()) {
                String wordset = stream.getAttribute(CharTermAttribute.class).toString();
                wordset = wordset.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
                result += wordset + " ";
            }
            result += "\n";
            stream.end(); // finalize end-of-stream state before closing
            stream.close();
            line = input.readLine();
        }

        input.close();
        return result;
    } catch (IOException e) {
        // the BufferedReader can fail; rethrow as unchecked
        throw new RuntimeException(e);
    }
}

From source file:com.cloudera.knittingboar.records.Test20NewsgroupsBookParsing.java

License:Apache License

/**
 *
 * Counts words
 * 
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    System.out.println("> ----- countWords ------");

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() call

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = termAtt.toString();
        System.out.print(" " + s);
        words.add(s);
    }
    ts.end();
    ts.close();

    System.out.println("\n<");

    /*overallCounts.addAll(words);*/
}

From source file:com.cloudera.knittingboar.records.TwentyNewsgroupsRecordFactory.java

License:Apache License

private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() call

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        words.add(termAtt.toString());
    }
    ts.end();
    ts.close();

}

From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLR_Train20Newsgroups.java

License:Apache License

/**
 *
 * Counts words
 * 
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    //System.out.println( "> ----- countWords ------" );

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() call

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = termAtt.toString();
        //System.out.print( " " + s );
        words.add(s);
    }
    ts.end();
    ts.close();

    //System.out.println( "\n<" );

    /*overallCounts.addAll(words);*/
}

From source file:com.cloudera.knittingboar.utils.DatasetConverter.java

License:Apache License

private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() call

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        words.add(termAtt.toString());
    }
    ts.end();
    ts.close();

}

From source file:com.cloudera.knittingboar.utils.DatasetConverter.java

License:Apache License

public static String ReadFullFile(Analyzer analyzer, String newsgroup_name, String file) throws IOException {

    String out = newsgroup_name + "\t";
    BufferedReader reader = null;

    try {
        reader = new BufferedReader(new FileReader(file));

        TokenStream ts = analyzer.tokenStream("text", reader);
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // required before the first incrementToken() call

        // for each word in the stream, minus non-word stuff, append the
        // word to the output line
        while (ts.incrementToken()) {
            out += termAtt.toString() + " ";
        }
        ts.end();
        ts.close();

    } finally {
        if (reader != null) {
            reader.close();
        }
    }

    return out + "\n";

}

From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java

public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();

            tokenized.put(new Text(key), document);
        }
        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            // no explicit close needed: try-with-resources closes the writer
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}

From source file:com.dhamacher.sentimentanalysis4tweets.sentiment.Tokenizer.java

License:Apache License

/**
 *  Retrieve the tokens in a String. Behaves like getTokens, but operates on
 *  a string instead of a tweet object.
 * 
 *  @param  text    The text to tokenize.
 *  @return         The tokens in the text.
 */

// Version 1
/*public LinkedList<String> getTokens (String text) {
LinkedList<String> tokens   = new LinkedList();
String[] words              = text.split(" ");
tokens.addAll(Arrays.asList(words));
return tokens;
}*/

// Version 2
public static LinkedList<String> getTokens(String text) throws IOException {
    LinkedList<String> tokens = new LinkedList<String>();
    TokenStream ts = new StandardTokenizer(new StringReader(text));
    // TermAttribute is the pre-Lucene-4.0 ancestor of CharTermAttribute
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        tokens.add(termAtt.term());
    }
    ts.close();
    return tokens;
}
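
TermAttribute and its term() method were removed in Lucene 4.0 in favor of CharTermAttribute. Below is a hedged sketch of the same loop against the modern API, assuming Lucene 5.x or later, where StandardTokenizer has a no-argument constructor and receives its input via setReader():

import java.io.IOException;
import java.io.StringReader;
import java.util.LinkedList;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static LinkedList<String> getTokens(String text) throws IOException {
    LinkedList<String> tokens = new LinkedList<String>();
    StandardTokenizer ts = new StandardTokenizer();
    ts.setReader(new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        tokens.add(termAtt.toString());
    }
    ts.end();
    ts.close();
    return tokens;
}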

From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    String sContent = value.getText();
    if (sContent == null) {
        // no text available? skip
        context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
        return;
    }
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(sContent));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    context.write(key, document);
}
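
Analyzer.reusableTokenStream was removed in Lucene 4.0, where Analyzer.tokenStream handles reuse internally through its ReuseStrategy. A sketch of the same map body against the 4.x API, assuming the same analyzer field and Mahout's StringTuple:

    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(sContent));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    stream.close();
    context.write(key, document);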

From source file:com.digitalpebble.classification.example.TwentyNewsgroups.java

License:Apache License

private List<String> analyseField(String content) throws IOException {
    if (content == null)
        return null;
    List<String> tokens = new ArrayList<String>();
    StringReader sr = new StringReader(content);
    TokenStream ts = analyzer.tokenStream("dummyValue", sr);
    // TermAttribute is the pre-Lucene-4.0 ancestor of CharTermAttribute
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        tokens.add(term.term());
    }
    ts.close();
    return tokens;
}