Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class&lt;? extends Attribute&gt; value. The attribute is added to the stream's attribute source if not yet present, and the (new or existing) instance is returned.
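
As a reference for the examples below, this is the canonical consumption loop around addAttribute (a minimal sketch, assuming Lucene 4.0; the analyzer, field name, and input text are illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class AddAttributeExample {
    public static void main(String[] args) throws IOException {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        TokenStream stream = analyzer.tokenStream("field", new StringReader("Streams are consumed token by token"));
        // registers the attribute if absent and returns the stream's shared instance
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset(); // required before the first incrementToken()
        while (stream.incrementToken()) {
            // the same attribute instance is updated in place for each token
            System.out.println(termAtt.toString());
        }
        stream.end();   // records end-of-stream state (e.g. the final offset)
        stream.close(); // releases resources
        analyzer.close();
    }
}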

Usage

From source file:drakkar.mast.retrieval.analysis.NGramQuery.java

/**
 *
 * @param analyzer
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer analyzer, String queryTerm, String field) throws IOException {
    String words[] = null;

    //remove white spaces
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[1];
        words[0] = queryTerm;
    }

    //more than one term
    if (words.length > 1) {
        for (int i = 0; i < words.length; i++) {
            String string = words[i];
            Term t = new Term(field, string);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }

    } else {
        //one term
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = analyzer.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                TermQuery pquery = new TermQuery(new Term(field, termAtt.term()));
                add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST));
            }

            tokens.end();
            tokens.close();
        }

    }
}
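
A hypothetical call site for this constructor (it assumes NGramQuery extends BooleanQuery, as the add(...) calls imply, and an open Lucene 3.x IndexSearcher named searcher; the query text and field name are illustrative):

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
NGramQuery query = new NGramQuery(analyzer, "information retrieval", "contents");
TopDocs hits = searcher.search(query, 10);
for (ScoreDoc sd : hits.scoreDocs) {
    System.out.println(searcher.doc(sd.doc).get("contents"));
}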

From source file:drakkar.mast.retrieval.ngram.NGramQuery.java

/**
 *
 * @param a
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer a, String queryTerm, String field) throws IOException {
    String words[] = null;
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[1];
        words[0] = queryTerm;
    }
    if (words.length > 1) {
        for (int i = 0; i < words.length; i++) {
            String string = words[i];
            Term t = new Term(field, string);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }

    } else {
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = a.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                TermQuery pquery = new TermQuery(new Term(field, termAtt.term()));
                add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST));
            }

            tokens.end();
            tokens.close();
        }

    }
}

From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.TFIDF.java

License:Open Source License

protected void computeTFIDF(List<TFIDFTerm> wordList, int totalWordsDoc) {
    if (reader != null && searcher != null) {
        double tf;
        double idf;
        double tfidf;
        EnglishAnalyzer analyzer = new EnglishAnalyzer(Version.LUCENE_40);
        TokenStream stream = null;
        CharTermAttribute termAtt;
        String term;
        double totalWikiDocs = (double) reader.numDocs();
        for (TFIDFTerm word : wordList) {
            try {
                term = "";
                stream = analyzer.tokenStream("field", new StringReader(word.word));
                termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                // concatenate all analyzed tokens (typically a single stemmed term)
                while (stream.incrementToken()) {
                    term += (termAtt.toString());
                }
                //                System.out.println(term);
                stream.end();
                tf = (double) word.count / (double) totalWordsDoc;
                double wikiTermFreq = reader.docFreq(new Term("contents", term));
                if (wikiTermFreq != 0) {
                    idf = Math.log(totalWikiDocs / wikiTermFreq);
                    tfidf = tf * idf;
                } else {
                    tfidf = 0;
                }
                word.tfidf = tfidf;
            } catch (IOException ex) {
                logger.error("Error processing the TFIDF", ex);
            } finally {
                try {
                    if (stream != null) {
                        stream.close();
                    }
                } catch (IOException ex) {
                    logger.error("Error processing the TFIDF", ex);
                }

            }

        }
        try {
            reader.close();
        } catch (IOException ex) {
            logger.warn("Error closing lucene reader", ex);
        }
    }
}
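
The loop above implements the classic formulas tf = count / totalWordsDoc and idf = ln(N / df). A quick worked example with illustrative numbers:

// a term occurring 5 times in a 100-word document,
// appearing in 1000 of 1000000 indexed documents
double tf = 5.0 / 100.0;                    // 0.05
double idf = Math.log(1000000.0 / 1000.0);  // ln(1000) ~ 6.91
double tfidf = tf * idf;                    // ~ 0.35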

From source file:edu.indiana.d2i.htrc.io.DataCopyTokenizerMapper.java

License:Apache License

@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    long initCPU = System.nanoTime();
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            String term = new String(termAtt.buffer(), 0, termAtt.length());
            document.add(term);
            numTerms++;
        }
    }
    stream.end();   // finish the stream per the TokenStream contract
    stream.close();
    elapsedTime += System.nanoTime() - initCPU;

    context.write(key, document);
}
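
Note that reusableTokenStream is the Lucene 3.x API; Lucene 4.0 removed it in favor of tokenStream, which handles reuse internally. A sketch of the equivalent call for this mapper:

// Lucene 4.x equivalent; stream reuse is handled by the analyzer itself
TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));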

From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java

License:Apache License

private static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter,
        Dictionary dictionary) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());

    TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        // String term = new String(termAtt.buffer(), 0,
        // termAtt.length());
        String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase();
        if (filter.accept(term, 0)) {
            int index = dictionary.get(term);
            result.setQuick(index, result.get(index) + 1);
        }
    }
    stream.end();
    stream.close();

    return result;
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorUtil.java

License:Apache License

public static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter,
        Dictionary dictionary) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());

    TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        // String term = new String(termAtt.buffer(), 0,
        // termAtt.length());
        String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase();
        if (filter.accept(term, 0)) {
            int index = dictionary.get(term);
            result.setQuick(index, result.get(index) + 1);
        }
    }
    stream.end();
    stream.close();

    return result;
}
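
A hypothetical call, assuming an analyzer plus the project-specific HTRCFilter and Dictionary types used above (a term filter and a term-to-index lookup, respectively):

// sketch: vectorize one document against a fixed vocabulary
Vector vec = SparseVectorUtil.transform2Vector(docText, "contents", analyzer, filter, dictionary);
System.out.println("non-zero entries: " + vec.getNumNondefaultElements());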

From source file:edu.indiana.d2i.htrc.util.filter.DictionaryFilter.java

License:Apache License

protected DictionaryFilter(TokenStream input) {
    super(input);
    termAtt = input.addAttribute(CharTermAttribute.class);

    //      InputStream dictIn = getClass().getClassLoader().getResourceAsStream(
    //            "dict-ubuntu.txt"); 
    InputStream dictIn = getClass().getClassLoader().getResourceAsStream("dictionary.txt");
    BufferedReader reader = new BufferedReader(new InputStreamReader(dictIn));
    try {
        String term;
        while ((term = reader.readLine()) != null) {
            dictionary.add(term);
        }
    } catch (IOException e) {
        // fail fast: a missing or unreadable dictionary leaves the filter unusable
        throw new RuntimeException("Failed to load dictionary.txt", e);
    } finally {
        try {
            reader.close();
        } catch (IOException ignored) {
        }
    }

}
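
A sketch of how such a filter is typically wired into a Lucene 3.x analyzer chain (imports omitted, matching the excerpts above; it assumes the analyzer lives in the same package as the protected constructor, and that DictionaryFilter's incrementToken, not shown in this excerpt, drops terms absent from the dictionary):

public final class DictionaryAnalyzer extends Analyzer {
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream stream = new StandardTokenizer(Version.LUCENE_36, reader);
        stream = new LowerCaseFilter(Version.LUCENE_36, stream);
        return new DictionaryFilter(stream); // last stage: dictionary lookup
    }
}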

From source file:edu.indiana.d2i.htrc.util.filter.HTRCFilterAnalyzerTest.java

License:Apache License

public static void main(String[] args) throws IOException {
    HTRCFilterAnalyzer analyzer = new HTRCFilterAnalyzer();

    TokenStream stream = analyzer.reusableTokenStream("field",
            new StringReader("a iss Pierre 1 Vinken , 61 years old , "
                    + "will join the board as joins a nonexecutive joining director Nov. "
                    + "29 .Mr. car Vinken is cars chairman of Elsevier N.V. , the Dutch "
                    + "publishing group ."));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            System.out.println(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    stream.close();

    System.out.println("Done???");
}

From source file:edu.indiana.d2i.htrc.util.filter.POSFilter.java

License:Apache License

protected POSFilter(TokenStream input, String[] regex) {
    super(input);
    termAtt = input.addAttribute(CharTermAttribute.class);
    for (int i = 0; i < regex.length; i++)
        if (regex[i].length() > 0)
            patterns.add(Pattern.compile(regex[i]));
    initOpenNlp();
}

From source file:edu.indiana.d2i.htrc.util.filter.RegexpFilter.java

License:Apache License

protected RegexpFilter(TokenStream input, String[] regex) {
    super(input);
    termAtt = input.addAttribute(CharTermAttribute.class);
    for (int i = 0; i < regex.length; i++)
        if (regex[i].length() > 0)
            patterns.add(Pattern.compile(regex[i]));
}