Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Source Link

Document

The caller must pass in a Class&lt;? extends Attribute&gt; object identifying the attribute to add; the stream returns the (possibly already existing) attribute instance of that type.

Usage

From source file:modnlp.idx.inverted.TokeniserJPLucene.java

License:Open Source License

/**
 * Tokenizes the given Japanese text and records the start/end character
 * offsets of every token in a {@link TokenIndex}.
 *
 * @param str the raw text to tokenize
 * @return the populated index; empty if tokenization fails with an I/O error
 */
public TokenIndex getTokenIndex(String str) {
    TokenIndex ret = new TokenIndex();
    try {
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(str), null, true,
                org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
        // Analysis chain: base-form normalization -> CJK width folding
        // -> katakana stemming.
        TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
        stream = new CJKWidthFilter(stream);
        stream = new JapaneseKatakanaStemFilter(stream);

        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);

        try {
            // The TokenStream contract requires reset() before the first
            // incrementToken() and end() after the last; close() releases
            // underlying resources even if iteration throws.
            stream.reset();
            while (stream.incrementToken()) {
                ret.add(offsetAttribute.startOffset(), offsetAttribute.endOffset());
            }
            stream.end();
        } finally {
            stream.close();
        }
    } catch (java.io.IOException e) {
        // Best-effort: log and return whatever was indexed so far.
        System.err.println(e);
    }
    return ret;
}

From source file:net.sf.okapi.lib.tmdb.lucene.Seeker.java

License:Open Source License

/**
 * Performs a fuzzy search of the TM for entries similar to the given text.
 *
 * @param genericText   the coded text to search for
 * @param codesAsString the inline codes of the query, serialized
 * @param tmId          the TM to search in
 * @param locale        the locale code of the text field to search
 * @param max           the maximum number of hits to return
 * @param threshold     minimum similarity score, clamped to [0, 100]
 * @param attributes    extra attribute name/value filters
 * @return the list of fuzzy hits
 * @throws OkapiIOException if tokenizing the query fails
 */
public List<TmHit> searchFuzzy(String genericText, String codesAsString, String tmId, String locale, int max,
        int threshold, HashMap<String, String> attributes) {
    // Clamp the threshold into [0, 100] before using it as a float score.
    float searchThreshold = (float) threshold;
    if (threshold < 0)
        searchThreshold = 0.0f;
    if (threshold > 100)
        searchThreshold = 100.0f;

    String queryText = genericText;

    String gtextFName = TmEntry.GTEXT_PREFIX + locale;
    Locale javaLoc = new Locale(locale);

    // Basic analyzer to tokenize the query: the default fuzzy analyzer for
    // English, a 4-gram analyzer for all other languages.
    // FIX: compare language codes with equals(), not '==' — reference
    // equality on Strings is fragile and flagged by every static analyzer.
    TokenStream queryTokenStream;
    if (javaLoc.getLanguage().equals(Locale.ENGLISH.getLanguage())) {
        queryTokenStream = defaultFuzzyAnalyzer.tokenStream(gtextFName, new StringReader(queryText));
    } else {
        queryTokenStream = new NgramAnalyzer(javaLoc, 4).tokenStream(gtextFName, new StringReader(queryText));
    }

    // Get the term attribute from the token stream (no cast needed:
    // addAttribute is generic).
    CharTermAttribute termAtt = queryTokenStream.addAttribute(CharTermAttribute.class);
    TmFuzzyQuery fQuery = new TmFuzzyQuery(searchThreshold, gtextFName);

    try {
        queryTokenStream.reset();
        while (queryTokenStream.incrementToken()) {
            fQuery.add(new Term(gtextFName, termAtt.toString()));
        }
        queryTokenStream.end();
    } catch (IOException e) {
        throw new OkapiIOException(e.getMessage(), e);
    } finally {
        // FIX: close in finally so the stream is released even when
        // tokenization throws.
        try {
            queryTokenStream.close();
        } catch (IOException e) {
            throw new OkapiIOException(e.getMessage(), e);
        }
    }

    return getFuzzyHits(fQuery, genericText, codesAsString, tmId, locale, max, searchThreshold, attributes);
}

From source file:NewsIR_search.TRECQuery.java

/**
 * Returns the content of the 'queryField' from the query text
 * @param analyzer//w ww.java  2  s  .c om
 * @param queryField
 * @return (String) The content of the field
 * @throws Exception 
 */
/**
 * Analyzes the content of the 'queryField' with the given analyzer and
 * returns the lower-cased tokens joined by single spaces (with a trailing
 * space, as produced by the original implementation).
 *
 * @param analyzer   the analyzer used to tokenize the field text
 * @param queryField the raw field content to analyze
 * @return the space-separated analyzed terms
 * @throws Exception if tokenization fails
 */
public String queryFieldAnalyze(Analyzer analyzer, String queryField) throws Exception {
    // StringBuilder: no synchronization needed for a method-local buffer.
    StringBuilder buff = new StringBuilder();
    TokenStream stream = analyzer.tokenStream(CumulativeIndexer.FIELD_TEXT, new StringReader(queryField));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            // NOTE(review): toLowerCase() uses the default locale, matching
            // the original behavior; consider Locale.ROOT for index terms.
            buff.append(termAtt.toString().toLowerCase()).append(" ");
        }
        stream.end();
    } finally {
        // FIX: always close the stream, even if incrementToken() throws.
        stream.close();
    }
    return buff.toString();
}

From source file:ngram.NGramExtractor.java

License:Open Source License

/**
 * Extracts NGrams from a String of text.
 * Can handle ngrams of any length and also perform stop word removal before extraction
 * @param text the text that the ngrams should be extracted from
 * @param length the length of the ngrams
 * @param stopWords whether or not stopwords should be removed before extraction
 * @param overlap whether or not the ngrams should overlap
 *///from w  w  w .  j  a v  a 2  s . c om
/**
 * Extracts NGrams from a String of text.
 * Can handle ngrams of any length and also perform stop word removal before extraction.
 *
 * @param text the text that the ngrams should be extracted from
 * @param length the length of the ngrams
 * @param stopWords whether or not stopwords should be removed before extraction
 * @param overlap whether or not the ngrams should overlap
 * @throws FileNotFoundException declared for interface compatibility
 * @throws IOException if tokenization fails
 */
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {

    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /* If length is 1 we want unigrams: a StandardAnalyzer with the default
     * stop set when stop words should be removed, or with an empty stop set
     * when they should be kept (standard rather than simple to keep
     * apostrophe/s tokens).
     */
    if (length == 1) {
        if (this.stopWords) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            analyzer = new StandardAnalyzer(Version.LUCENE_36, Collections.EMPTY_SET);
        }
    } else {
        // Longer than unigrams: wrap in a ShingleAnalyzerWrapper. The
        // Lucene 2.4 StopAnalyzer is a deliberate hack — in 2.4 position
        // increments were not preserved by default, so removed stop words
        // do not leave '_' placeholders as later versions do.
        if (this.stopWords) {
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_24), length, length, " ",
                    false, false);
        } else {
            analyzer = new ShingleAnalyzerWrapper(
                    new StandardAnalyzer(Version.LUCENE_36, Collections.EMPTY_SET), length, length, " ", false,
                    false);
        }
    }

    // Process the text and collect the ngram tokens.
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    try {
        // FIX: reset() before the first incrementToken(), end() after the
        // last, close() in finally — the TokenStream contract.
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            nGrams.add(charTermAttribute.toString());

            // If n-grams may not overlap, skip ahead past the overlapping
            // positions.
            if (!overlap) {
                for (int i = 0; i < length - 1; i++) {
                    // FIX: stop skipping when the stream is exhausted
                    // instead of calling incrementToken() past the end.
                    if (!tokenStream.incrementToken()) {
                        break;
                    }
                }
            }
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
    }

    // Store unique nGrams and their frequencies.
    for (String nGram : nGrams) {
        Integer freq = nGramFreqs.get(nGram);
        if (freq != null) {
            nGramFreqs.put(nGram, freq + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }

}

From source file:nicta.com.au.failureanalysis.optimalquery.OptPatentQuery.java

/**
 * Builds a boosted query string from the token stream: every distinct
 * non-numeric term whose frequency reaches the threshold is emitted as
 * "term^freq ". Also records the term-frequency map in {@code vocabulary}
 * and the total token count in {@code fieldsSize}.
 *
 * @param ts       the token stream to consume (closed by this method)
 * @param treshold minimum term frequency for a term to be emitted
 * @param field    field name used as the key for the side-effect maps;
 *                 vocabulary is only updated when non-null
 * @return the boosted query string
 * @throws IOException if reading the stream fails
 */
private String transformation(TokenStream ts, int treshold, String field) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    int s = 0; // total number of tokens seen
    try {
        ts.reset();
        while (ts.incrementToken()) {
            // Escape ':' so the term cannot be read as a field separator
            // by the query parser.
            String term = charTermAttribute.toString().replace(":", "\\:");
            Integer freq = m.get(term);
            m.put(term, freq == null ? 1 : freq + 1);
            s++;
        }
        // FIX: end() finalizes offset/state; close() in finally releases
        // the stream even on error. (The original also wasted work building
        // a query string in this loop that was immediately discarded.)
        ts.end();
    } finally {
        ts.close();
    }
    // FIX: StringBuilder instead of String += in a loop.
    StringBuilder q = new StringBuilder();
    for (Map.Entry<String, Integer> e : m.entrySet()) {
        if (e.getValue() >= treshold && !Functions.isNumeric(e.getKey())) {
            q.append(e.getKey()).append('^').append(e.getValue()).append(' ');
        }
    }
    if (field != null) {
        vocabulary.put(field, m);
    }
    fieldsSize.put(field, s);
    return q.toString();
}

From source file:nicta.com.au.failureanalysis.query.QueryGneration.java

/**
 * Collects the distinct non-numeric terms from the token stream whose
 * frequency reaches the threshold, mapped to their frequencies. Also
 * records the full term-frequency map in {@code vocabulary} and the total
 * token count in {@code fieldsSize}.
 *
 * @param ts       the token stream to consume (closed by this method)
 * @param treshold minimum term frequency for a term to be kept
 * @param field    field name used as the key for the side-effect maps;
 *                 vocabulary is only updated when non-null
 * @return map of qualifying terms to their frequencies
 * @throws IOException if reading the stream fails
 */
private Map<String, Integer> getTerms(TokenStream ts, int treshold, String field) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    Map<String, Integer> qterm_freq = new HashMap<>();

    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    int s = 0; // total number of tokens seen
    try {
        ts.reset();
        while (ts.incrementToken()) {
            // Escape ':' so the term cannot be read as a field separator.
            String term = charTermAttribute.toString().replace(":", "\\:");
            Integer freq = m.get(term);
            m.put(term, freq == null ? 1 : freq + 1);
            s++;
        }
        // FIX: end() + close() in finally; the original never called end()
        // and leaked the stream on error. (Also dropped the dead 'q' string
        // that was built twice and never returned.)
        ts.end();
    } finally {
        ts.close();
    }
    for (Map.Entry<String, Integer> e : m.entrySet()) {
        if (e.getValue() >= treshold && !Functions.isNumeric(e.getKey())) {
            qterm_freq.put(e.getKey(), e.getValue());
        }
    }
    if (field != null) {
        vocabulary.put(field, m);
    }
    fieldsSize.put(field, s);
    return qterm_freq;
}

From source file:nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java

/**
 * Builds a TF-IDF weight vector from the token stream: each term is mapped
 * to (tf / totalTokens) * log10(docCount / (df + 1)), where df is the
 * document frequency of the term in the given field and docCount is taken
 * from the "claims" field.
 *
 * @param ts    the token stream to consume (closed by this method)
 * @param field index field used to look up document frequencies
 * @return term -> TF-IDF weight map
 * @throws IOException if reading the stream or the index fails
 * @throws Exception   declared for interface compatibility
 */
private Map<String, Double> getVector(TokenStream ts, String field) throws IOException, Exception {
    Map<String, Double> m = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    int i = 0; // total number of tokens, denominator of the TF ratio
    try {
        ts.reset();
        while (ts.incrementToken()) {
            i++;
            String term = charTermAttribute.toString();
            Double count = m.get(term);
            m.put(term, count == null ? 1.0 : count + 1);
        }
        // FIX: the original never called end()/close(), leaking the stream.
        ts.end();
    } finally {
        ts.close();
    }
    for (String key : m.keySet()) {
        Term t = new Term(field, key);
        int totalTF = ir.docFreq(t);
        int docs = ir.getDocCount("claims");
        // +1 smoothing avoids division by zero for unseen terms.
        double idf = Math.log10((double) docs / (totalTF + 1));
        m.put(key, (m.get(key) / i) * idf);
    }

    return m;
}

From source file:nicta.com.au.patent.pac.analysis.FieldsJaccardSimilarities.java

/**
 * Consumes the token stream and returns a map from each distinct term to
 * its occurrence count.
 *
 * @param ts the token stream to consume (closed by this method)
 * @return term -> frequency map
 * @throws IOException if reading the stream fails
 */
private Map<String, Integer> transformation(TokenStream ts) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            String term = charTermAttribute.toString();
            Integer freq = m.get(term);
            m.put(term, freq == null ? 1 : freq + 1);
        }
        // FIX: the original never called end()/close(), leaking the stream.
        ts.end();
    } finally {
        ts.close();
    }
    return m;
}

From source file:nicta.com.au.patent.pac.analysis.RecallAnalysis.java

/**
 * Consumes the token stream and returns the set of distinct terms it
 * produced.
 *
 * @param ts the token stream to consume (closed by this method)
 * @return the distinct terms
 * @throws IOException if reading the stream fails
 */
private Set<String> transformation(TokenStream ts) throws IOException {
    Set<String> out = new HashSet<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            out.add(charTermAttribute.toString());
        }
        // FIX: the original never called end()/close(), leaking the stream.
        ts.end();
    } finally {
        ts.close();
    }
    return out;
}

From source file:nicta.com.au.patent.pac.search.PatentQuery.java

/**
 * Builds a query string from the token stream: every distinct non-numeric
 * term whose frequency reaches the threshold is emitted as "term^1 "
 * (weight deliberately fixed at 1 rather than the frequency). Also records
 * the term-frequency map in {@code vocabulary} and the total token count
 * in {@code fieldsSize}.
 *
 * @param ts       the token stream to consume (closed by this method)
 * @param treshold minimum term frequency for a term to be emitted
 * @param field    field name used as the key for the side-effect maps;
 *                 vocabulary is only updated when non-null
 * @return the query string with all weights set to 1
 * @throws IOException if reading the stream fails
 */
private String transformation(TokenStream ts, int treshold, String field) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    int s = 0; // total number of tokens seen
    try {
        ts.reset();
        while (ts.incrementToken()) {
            // Escape ':' so the term cannot be read as a field separator.
            String term = charTermAttribute.toString().replace(":", "\\:");
            Integer freq = m.get(term);
            m.put(term, freq == null ? 1 : freq + 1);
            s++;
        }
        // FIX: end() + close() in finally; the original never called end()
        // and leaked the stream on error. (Also dropped the dead query
        // string built in this loop and immediately discarded.)
        ts.end();
    } finally {
        ts.close();
    }
    // FIX: StringBuilder instead of String += in a loop.
    StringBuilder q = new StringBuilder();
    for (Map.Entry<String, Integer> e : m.entrySet()) {
        if (e.getValue() >= treshold && !Functions.isNumeric(e.getKey())) {
            // Weight is intentionally 1 (frequency-based boost disabled).
            q.append(e.getKey()).append("^1 ");
        }
    }
    if (field != null) {
        vocabulary.put(field, m);
    }
    fieldsSize.put(field, s);
    return q.toString();
}