Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class&lt;? extends Attribute&gt; value. The attribute is added to the stream's attribute source if not yet present, and the (new or existing) instance is returned.
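
As a reference for the examples below, this is the canonical consumption loop around addAttribute (a minimal sketch, assuming Lucene 4.0; the analyzer, field name, and input text are illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class AddAttributeExample {
    public static void main(String[] args) throws IOException {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        TokenStream stream = analyzer.tokenStream("field", new StringReader("Streams are consumed token by token"));
        // registers the attribute if absent and returns the stream's shared instance
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset(); // required before the first incrementToken()
        while (stream.incrementToken()) {
            // the same attribute instance is updated in place for each token
            System.out.println(termAtt.toString());
        }
        stream.end();   // records end-of-stream state (e.g. the final offset)
        stream.close(); // releases resources
        analyzer.close();
    }
}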

Usage

From source file:drakkar.mast.retrieval.analysis.NGramQuery.java

/**
 *
 * @param analyzer
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer analyzer, String queryTerm, String field) throws IOException {
    String words[] = null;

    //remove white spaces
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[1];
        words[0] = queryTerm;
    }

    //more than one term
    if (words.length > 1) {
        for (int i = 0; i < words.length; i++) {
            String string = words[i];
            Term t = new Term(field, string);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }

    } else {
        //one term
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = analyzer.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                TermQuery pquery = new TermQuery(new Term(field, termAtt.term()));
                add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST));
            }

            tokens.end();
            tokens.close();
        }

    }
}
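
A hypothetical call site for this constructor (it assumes NGramQuery extends BooleanQuery, as the add(...) calls imply, and an open Lucene 3.x IndexSearcher named searcher; the query text and field name are illustrative):

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
NGramQuery query = new NGramQuery(analyzer, "information retrieval", "contents");
TopDocs hits = searcher.search(query, 10);
for (ScoreDoc sd : hits.scoreDocs) {
    System.out.println(searcher.doc(sd.doc).get("contents"));
}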

From source file:drakkar.mast.retrieval.ngram.NGramQuery.java

/**
 *
 * @param a
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer a, String queryTerm, String field) throws IOException {
    String words[] = null;
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[1];
        words[0] = queryTerm;
    }
    if (words.length > 1) {
        for (int i = 0; i < words.length; i++) {
            String string = words[i];
            Term t = new Term(field, string);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }

    } else {
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = a.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                TermQuery pquery = new TermQuery(new Term(field, termAtt.term()));
                add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST));
            }

            tokens.end();
            tokens.close();
        }

    }
}

From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.TFIDF.java

License:Open Source License

protected void computeTFIDF(List<TFIDFTerm> wordList, int totalWordsDoc) {
    if (reader != null && searcher != null) {
        double tf;
        double idf;
        double tfidf;
        EnglishAnalyzer analyzer = new EnglishAnalyzer(Version.LUCENE_40);
        TokenStream stream = null;
        CharTermAttribute termAtt;
        String term;
        double totalWikiDocs = (double) reader.numDocs();
        for (TFIDFTerm word : wordList) {
            try {
                term = "";
                stream = analyzer.tokenStream("field", new StringReader(word.word));
                termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                // concatenate all analyzed tokens (typically a single stemmed term)
                while (stream.incrementToken()) {
                    term += (termAtt.toString());
                }
                //                System.out.println(term);
                stream.end();
                tf = (double) word.count / (double) totalWordsDoc;
                double wikiTermFreq = reader.docFreq(new Term("contents", term));
                if (wikiTermFreq != 0) {
                    idf = Math.log(totalWikiDocs / wikiTermFreq);
                    tfidf = tf * idf;
                } else {
                    tfidf = 0;
                }
                word.tfidf = tfidf;
            } catch (IOException ex) {
                logger.error("Error processing the TFIDF", ex);
            } finally {
                try {
                    if (stream != null) {
                        stream.close();
                    }
                } catch (IOException ex) {
                    logger.error("Error processing the TFIDF", ex);
                }

            }

        }
        try {
            reader.close();
        } catch (IOException ex) {
            logger.warn("Error closing lucene reader", ex);
        }
    }
}
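
The loop above implements the classic formulas tf = count / totalWordsDoc and idf = ln(N / df). A quick worked example with illustrative numbers:

// a term occurring 5 times in a 100-word document,
// appearing in 1000 of 1000000 indexed documents
double tf = 5.0 / 100.0;                    // 0.05
double idf = Math.log(1000000.0 / 1000.0);  // ln(1000) ~ 6.91
double tfidf = tf * idf;                    // ~ 0.35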

From source file:edu.indiana.d2i.htrc.io.DataCopyTokenizerMapper.java

License:Apache License

@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    long initCPU = System.nanoTime();
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            String term = new String(termAtt.buffer(), 0, termAtt.length());
            document.add(term);
            numTerms++;
        }
    }
    stream.end();   // finish the stream per the TokenStream contract
    stream.close();
    elapsedTime += System.nanoTime() - initCPU;

    context.write(key, document);
}
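
Note that reusableTokenStream is the Lucene 3.x API; Lucene 4.0 removed it in favor of tokenStream, which handles reuse internally. A sketch of the equivalent call for this mapper:

// Lucene 4.x equivalent; stream reuse is handled by the analyzer itself
TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));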

From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java

License:Apache License

private static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter,
        Dictionary dictionary) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());

    TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        // String term = new String(termAtt.buffer(), 0,
        // termAtt.length());
        String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase();
        if (filter.accept(term, 0)) {
            int index = dictionary.get(term);
            result.setQuick(index, result.get(index) + 1);
        }
    }
    stream.end();
    stream.close();

    return result;
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorUtil.java

License:Apache License

public static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter,
        Dictionary dictionary) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());

    TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        // String term = new String(termAtt.buffer(), 0,
        // termAtt.length());
        String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase();
        if (filter.accept(term, 0)) {
            int index = dictionary.get(term);
            result.setQuick(index, result.get(index) + 1);
        }
    }
    stream.end();
    stream.close();

    return result;
}
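
A hypothetical call, assuming an analyzer plus the project-specific HTRCFilter and Dictionary types used above (a term filter and a term-to-index lookup, respectively):

// sketch: vectorize one document against a fixed vocabulary
Vector vec = SparseVectorUtil.transform2Vector(docText, "contents", analyzer, filter, dictionary);
System.out.println("non-zero entries: " + vec.getNumNondefaultElements());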

From source file:edu.indiana.d2i.htrc.util.filter.DictionaryFilter.java

License:Apache License

protected DictionaryFilter(TokenStream input) {
    super(input);
    termAtt = input.addAttribute(CharTermAttribute.class);

    //      InputStream dictIn = getClass().getClassLoader().getResourceAsStream(
    //            "dict-ubuntu.txt"); 
    InputStream dictIn = getClass().getClassLoader().getResourceAsStream("dictionary.txt");
    BufferedReader reader = new BufferedReader(new InputStreamReader(dictIn));
    try {
        String term;
        while ((term = reader.readLine()) != null) {
            dictionary.add(term);
        }
    } catch (IOException e) {
        // fail fast: a missing or unreadable dictionary leaves the filter unusable
        throw new RuntimeException("Failed to load dictionary.txt", e);
    } finally {
        try {
            reader.close();
        } catch (IOException ignored) {
        }
    }

}
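
A sketch of how such a filter is typically wired into a Lucene 3.x analyzer chain (imports omitted, matching the excerpts above; it assumes the analyzer lives in the same package as the protected constructor, and that DictionaryFilter's incrementToken, not shown in this excerpt, drops terms absent from the dictionary):

public final class DictionaryAnalyzer extends Analyzer {
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream stream = new StandardTokenizer(Version.LUCENE_36, reader);
        stream = new LowerCaseFilter(Version.LUCENE_36, stream);
        return new DictionaryFilter(stream); // last stage: dictionary lookup
    }
}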

From source file:edu.indiana.d2i.htrc.util.filter.HTRCFilterAnalyzerTest.java

License:Apache License

public static void main(String[] args) throws IOException {
    HTRCFilterAnalyzer analyzer = new HTRCFilterAnalyzer();

    TokenStream stream = analyzer.reusableTokenStream("field",
            new StringReader("a iss Pierre 1 Vinken , 61 years old , "
                    + "will join the board as joins a nonexecutive joining director Nov. "
                    + "29 .Mr. car Vinken is cars chairman of Elsevier N.V. , the Dutch "
                    + "publishing group ."));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            System.out.println(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    stream.close();

    System.out.println("Done???");
}

From source file:edu.indiana.d2i.htrc.util.filter.POSFilter.java

License:Apache License

protected POSFilter(TokenStream input, String[] regex) {
    super(input);
    termAtt = input.addAttribute(CharTermAttribute.class);
    for (int i = 0; i < regex.length; i++)
        if (regex[i].length() > 0)
            patterns.add(Pattern.compile(regex[i]));
    initOpenNlp();
}

From source file:edu.indiana.d2i.htrc.util.filter.RegexpFilter.java

License:Apache License

protected RegexpFilter(TokenStream input, String[] regex) {
    super(input);
    termAtt = input.addAttribute(CharTermAttribute.class);
    for (int i = 0; i < regex.length; i++)
        if (regex[i].length() > 0)
            patterns.add(Pattern.compile(regex[i]));
}