List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
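Before the per-project examples, here is a minimal sketch of the typical addAttribute consume loop (assuming a recent Lucene 4.x+ analysis module; the StandardAnalyzer, the field name "body", and the sample text are placeholders, not taken from the examples below). addAttribute returns the stream's single shared attribute instance, which is refilled on every incrementToken() call, so it is obtained once before the loop.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", new StringReader("Hello token stream world"));
        // addAttribute registers the attribute on the stream (or returns the
        // existing instance); its contents change on each incrementToken()
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                      // required before the first incrementToken() in Lucene 4+
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();
        } finally {
            ts.close();
        }
    }
}

Note that several of the older examples below omit reset()/end()/close(); they were written against earlier Lucene versions where that worked.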
From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLR_Train20Newsgroups.java
License:Apache License
/**
 * Counts words
 *
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    //System.out.println( "> ----- countWords ------" );
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        //System.out.print( " " + s );
        words.add(s);
    }
    //System.out.println( "\n<" );
    /*overallCounts.addAll(words);*/
}
From source file:com.cloudera.knittingboar.utils.DatasetConverter.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        // System.out.print( " " + s );
        words.add(s);
    }
}
From source file:com.cloudera.knittingboar.utils.DatasetConverter.java
License:Apache License
public static String ReadFullFile(Analyzer analyzer, String newsgroup_name, String file) throws IOException {
    String out = newsgroup_name + "\t";
    BufferedReader reader = null;
    // Collection<String> words
    Multiset<String> words = ConcurrentHashMultiset.create();

    try {
        reader = new BufferedReader(new FileReader(file));
        TokenStream ts = analyzer.tokenStream("text", reader);
        ts.addAttribute(CharTermAttribute.class);

        // for each word in the stream, minus non-word stuff, add word to collection
        while (ts.incrementToken()) {
            String s = ts.getAttribute(CharTermAttribute.class).toString();
            out += s + " ";
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
    return out + "\n";
}
From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java
public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();

        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();
            tokenized.put(new Text(key), document);
        }

        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS, ClusterFileService.CONF,
                tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            writer.close();
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    String sContent = value.getText();
    if (sContent == null) {
        // no text available? skip
        context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
        return;
    }
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(sContent.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    context.write(key, document);
}
From source file:com.digitalpebble.classification.example.TwentyNewsgroups.java
License:Apache License
private List<String> analyseField(String content) throws IOException {
    if (content == null)
        return null;
    List<String> tokens = new ArrayList<String>();
    StringReader sr = new StringReader(content);
    TokenStream ts = analyzer.tokenStream("dummyValue", sr);
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        tokens.add(term.term());
    }
    return tokens;
}
From source file:com.doculibre.constellio.lucene.BaseLuceneIndexHelper.java
License:Open Source License
public static String analyze(String str, Analyzer analyzer) throws IOException {
    if (analyzer == null) {
        return str;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(str));
    tokens.reset();
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    return norm.toString();
}
From source file:com.doculibre.constellio.utils.AnalyzerUtils.java
License:Open Source License
public static String analyzePhrase(String phrase, boolean useStopWords) {
    if (StringUtils.isNotBlank(phrase)) {
        String analysedPhrase;
        Analyzer analyzer = getDefaultAnalyzer(useStopWords);
        StringBuilder norm = new StringBuilder();
        TokenStream tokens;
        try {
            tokens = analyzer.tokenStream("", new StringReader(phrase));
            tokens.reset();
            CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
            while (tokens.incrementToken()) {
                norm.append(termAtt.buffer(), 0, termAtt.length());
            }
            analysedPhrase = norm.toString().trim();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return analysedPhrase;
    } else {
        return phrase;
    }
}
From source file:com.faqit.similarity.NGramExtractor.java
License:Open Source License
/**
 * Extracts NGrams from a String of text. Can handle ngrams of any length
 * and also perform stop word removal before extraction.
 *
 * @param text
 *            the text that the ngrams should be extracted from
 * @param length
 *            the length of the ngrams
 * @param stopWords
 *            whether or not stopwords should be removed before extraction
 * @param overlap
 *            whether or not the ngrams should overlap
 */
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {
    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /*
     * If the minLength and maxLength are both 1, then we want unigrams.
     * Make use of a StopAnalyzer when stopwords should be removed.
     * Make use of a SimpleAnalyzer when stop words should be included.
     */
    if (length == 1) {
        if (this.stopWords) {
            analyzer = new StandardAnalyzer();
        } else {
            analyzer = new SimpleAnalyzer();
        }
    } else {
        // Bigger than unigrams so use ShingleAnalyzerWrapper. Once again,
        // different analyzers depending on stop word removal.
        if (this.stopWords) {
            // This is a hack to use Lucene 2.4, since in 2.4 position increments
            // weren't preserved by default. Using a later version puts underscores (_)
            // in the place of removed stop words.
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(), length, length, " ", false, false, "");
        } else {
            analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(), length, length, " ", false, false, "");
        }
    }

    // Code to process and extract the ngrams
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    // OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    // int tokenCount = 0;

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        // int startOffset = offsetAttribute.startOffset();
        // int endOffset = offsetAttribute.endOffset();
        String termToken = charTermAttribute.toString(); // The actual token term
        nGrams.add(termToken); // Add all ngrams to the ngram LinkedList

        // If n-grams are not allowed to overlap, then increment to point of no overlap
        if (!overlap) {
            for (int i = 0; i < length - 1; i++) {
                tokenStream.incrementToken();
            }
        }
    }

    // Store unique nGrams and frequencies in hash tables
    for (String nGram : nGrams) {
        if (nGramFreqs.containsKey(nGram)) {
            nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }
}
From source file:com.finderbots.miner.PhraseShingleAnalyzer.java
License:Apache License
public List<String> getTermList(String contentText) {
    TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    List<String> result = new ArrayList<String>(contentText.length() / 10);
    try {
        while (stream.incrementToken()) {
            if (termAtt.termLength() > 0) {
                String term = termAtt.term();
                result.add(term);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }
    return result;
}