Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value.
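
Typical call pattern (a minimal sketch, not taken from any of the source files below): addAttribute is called once, before consuming the stream, and returns an attribute instance that the stream updates in place on every incrementToken(). On Lucene 4.x and later, reset() must be called before the first incrementToken(), and end() before close(). The field name "field" and the helper method name are illustrative only.

// Minimal sketch; assumes Lucene 4.x+ with imports for Analyzer, TokenStream,
// CharTermAttribute, java.io.StringReader, java.util.ArrayList and java.util.List.
private static List<String> tokenize(Analyzer analyzer, String text) throws IOException {
    List<String> terms = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream("field", new StringReader(text))) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                        // required before the first incrementToken()
        while (ts.incrementToken()) {
            terms.add(termAtt.toString()); // the attribute is reused and updated per token
        }
        ts.end();                          // consume end-of-stream state; close() via try-with-resources
    }
    return terms;
}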

Usage

From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLR_Train20Newsgroups.java

License:Apache License

/**
 * Counts words
 * 
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    //System.out.println( "> ----- countWords ------" );

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        //System.out.print( " " + s );
        words.add(s);
    }

    //System.out.println( "\n<" );

    /*overallCounts.addAll(words);*/
}

From source file:com.cloudera.knittingboar.utils.DatasetConverter.java

License:Apache License

private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        // System.out.print( " " + s );
        words.add(s);
    }

}

From source file:com.cloudera.knittingboar.utils.DatasetConverter.java

License:Apache License

public static String ReadFullFile(Analyzer analyzer, String newsgroup_name, String file) throws IOException {

    String out = newsgroup_name + "\t";
    BufferedReader reader = null;
    // Collection<String> words

    Multiset<String> words = ConcurrentHashMultiset.create();

    try {
        reader = new BufferedReader(new FileReader(file));

        TokenStream ts = analyzer.tokenStream("text", reader);
        ts.addAttribute(CharTermAttribute.class);

        // for each word in the stream, minus non-word stuff, add word to
        // collection
        while (ts.incrementToken()) {
            String s = ts.getAttribute(CharTermAttribute.class).toString();
            out += s + " ";
        }

    } finally {
        if (reader != null) {
            reader.close();
        }
    }

    return out + "\n";

}

From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java

public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();

            tokenized.put(new Text(key), document);
        }
        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            writer.close();
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}

From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    String sContent = value.getText();
    if (sContent == null) {
        // no text available? skip
        context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
        return;
    }
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(sContent.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    context.write(key, document);
}
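
Note: Analyzer.reusableTokenStream, used above, was removed in Lucene 4.0; on later versions the equivalent call is analyzer.tokenStream(...), which handles reuse internally. An illustrative one-line adaptation (not from the original source):

TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(sContent));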

From source file:com.digitalpebble.classification.example.TwentyNewsgroups.java

License:Apache License

private List<String> analyseField(String content) throws IOException {
    if (content == null)
        return null;
    List<String> tokens = new ArrayList<String>();
    StringReader sr = new StringReader(content);
    TokenStream ts = analyzer.tokenStream("dummyValue", sr);
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        tokens.add(term.term());
    }
    return tokens;
}
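
Note: TermAttribute, used here and in the PhraseShingleAnalyzer example further down, was deprecated in favor of CharTermAttribute in Lucene 3.1 and removed in 4.0. An illustrative adaptation of the loop above for newer versions (not from the original source):

CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    tokens.add(term.toString());
}
ts.end();
ts.close();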

From source file:com.doculibre.constellio.lucene.BaseLuceneIndexHelper.java

License:Open Source License

public static String analyze(String str, Analyzer analyzer) throws IOException {
    if (analyzer == null) {
        return str;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(str));
    tokens.reset();

    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    return norm.toString();
}

From source file:com.doculibre.constellio.utils.AnalyzerUtils.java

License:Open Source License

public static String analyzePhrase(String phrase, boolean useStopWords) {
    if (StringUtils.isNotBlank(phrase)) {
        String analysedPhrase;
        Analyzer analyzer = getDefaultAnalyzer(useStopWords);

        StringBuilder norm = new StringBuilder();
        TokenStream tokens;
        try {
            tokens = analyzer.tokenStream("", new StringReader(phrase));
            tokens.reset();

            CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
            while (tokens.incrementToken()) {
                norm.append(termAtt.buffer(), 0, termAtt.length());
            }

            analysedPhrase = norm.toString().trim();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return analysedPhrase;
    } else {
        return phrase;
    }
}

From source file:com.faqit.similarity.NGramExtractor.java

License:Open Source License

/**
 * Extracts NGrams from a String of text. Can handle ngrams of any length
 * and also perform stop word removal before extraction
 *
 * @param text
 *            the text that the ngrams should be extracted from
 * @param length
 *            the length of the ngrams
 * @param stopWords
 *            whether or not stopwords should be removed before extraction
 * @param overlap
 *            whether or not the ngrams should overlap
 */
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {

    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /*
     * If the minLength and maxLength are both 1, then we want unigrams.
     * Use a StopAnalyzer when stop words should be removed and a
     * SimpleAnalyzer when stop words should be included.
     */
    if (length == 1) {
        if (this.stopWords) {
            analyzer = new StandardAnalyzer();
        } else {
            analyzer = new SimpleAnalyzer();
        }
    } else {
        // Bigger than unigrams, so use ShingleAnalyzerWrapper. Once again,
        // different analyzers depending on stop word removal.
        if (this.stopWords) {
            // This is a hack to use Lucene 2.4, since in 2.4 position increments
            // weren't preserved by default. Using a later version puts underscores
            // (_) in the place of removed stop words.
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(), length, length, " ", false, false, "");
        } else {
            analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(), length, length, " ", false, false, "");
        }
    }

    // Code to process and extract the ngrams
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    // OffsetAttribute offsetAttribute =
    // tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    // int tokenCount = 0;
    tokenStream.reset();
    while (tokenStream.incrementToken()) {

        // int startOffset = offsetAttribute.startOffset();
        // int endOffset = offsetAttribute.endOffset();
        String termToken = charTermAttribute.toString(); // The actual token term
        nGrams.add(termToken); // Add all ngrams to the ngram LinkedList

        // If n-grams are not allowed to overlap, then increment to point of
        // no overlap
        if (!overlap) {
            for (int i = 0; i < length - 1; i++) {
                tokenStream.incrementToken();
            }
        }

    }

    // Store unique nGrams and frequencies in hash tables
    for (String nGram : nGrams) {
        if (nGramFreqs.containsKey(nGram)) {
            nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }

}

From source file:com.finderbots.miner.PhraseShingleAnalyzer.java

License:Apache License

public List<String> getTermList(String contentText) {
    TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);

    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        while (stream.incrementToken()) {
            if (termAtt.termLength() > 0) {
                String term = termAtt.term();
                result.add(term);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}