Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream.reset() from a range of open-source projects.

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using incrementToken().
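
All of the usage examples below follow the same consumer workflow. As a reference point, here is a minimal sketch of that workflow; it is not taken from any of the projects below, and the analyzer argument and the "field" name are placeholders.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Minimal sketch of the consumer contract around reset():
// add attributes, reset(), incrementToken() until it returns false, end(), close().
public static void printTokens(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("field", new StringReader(text)); // "field" is a placeholder
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();                   // mandatory before the first incrementToken()
        while (ts.incrementToken()) { // advance to the next token
            System.out.println(termAtt.toString());
        }
        ts.end();                     // perform end-of-stream housekeeping
    } finally {
        ts.close();                   // release underlying resources
    }
}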

Usage

From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java

License:Apache License

@Override
public String[] getQueryTokens(final String queryString) {
    TokenStream tokenStream = null;
    try {
        tokenStream = getDefaultAnalyzer().tokenStream("QUERY_TOKENS", new StringReader(queryString));
        tokenStream.reset();
        final ArrayList<String> al = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            final String term = tokenStream.getAttribute(CharTermAttribute.class).toString();
            if (term != null && term.length() > 1) {
                al.add(term);
            }
        }
        if (al.isEmpty()) {
            al.add(queryString);
        }

        tokenStream.end(); // signal end-of-stream before close()
        return al.toArray(new String[al.size()]);
    } catch (final IOException e) {
        throw ADOException.of(e);
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.close();
            } catch (final IOException e) {
                // ignored: nothing useful can be done if close() fails
            }
        }
    }
}
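
Since TokenStream implements Closeable, the null check and close() boilerplate above can also be written with try-with-resources. A minimal sketch under the same assumptions (getDefaultAnalyzer() and the QUERY_TOKENS field name come from the example above):

// Sketch only: the same token collection with close() handled automatically.
public String[] getQueryTokens(final String queryString) throws IOException {
    final List<String> terms = new ArrayList<>();
    try (TokenStream ts = getDefaultAnalyzer().tokenStream("QUERY_TOKENS",
            new StringReader(queryString))) {
        final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            final String term = termAtt.toString();
            if (term.length() > 1) {
                terms.add(term);
            }
        }
        ts.end();
    }
    return terms.isEmpty() ? new String[] { queryString } : terms.toArray(new String[0]);
}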

From source file:NewsIR_search.TRECQuery.java

/**
 * Analyzes the content of 'queryField' and returns the analyzed terms,
 * separated by single spaces.
 * @param analyzer the analyzer used to tokenize the field content
 * @param queryField the raw content of the field to analyze
 * @return (String) The analyzed content of the field
 * @throws Exception
 */
public String queryFieldAnalyze(Analyzer analyzer, String queryField) throws Exception {
    StringBuilder buff = new StringBuilder();
    TokenStream stream = analyzer.tokenStream(CumulativeIndexer.FIELD_TEXT, new StringReader(queryField));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}

From source file:nicta.com.au.failureanalysis.optimalquery.OptPatentQuery.java

private String transformation(TokenStream ts, int treshold, String field) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    String q = "";
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int s = 0;
    while (ts.incrementToken()) {
        String term = charTermAttribute.toString().replace(":", "\\:");
        q += term + " ";
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1);
        }
        s++;
    }
    ts.end(); // complete the stream before closing
    ts.close();
    q = "";
    for (String k : m.keySet()) {
        if (m.get(k) >= treshold) {
            if (!Functions.isNumeric(k)) {
                q += k + "^" + m.get(k) + " ";
            }
        }
    }
    if (field != null) {
        vocabulary.put(field, m);
    }
    fieldsSize.put(field, s);
    return q;
}
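
This and several of the following examples count term frequencies with a containsKey/put pair. Since Java 8, Map.merge collapses that update into a single call; a minimal sketch of the same counting loop, reusing the ts and charTermAttribute names from the method above:

// Equivalent frequency counting with Map.merge (Java 8+).
Map<String, Integer> m = new HashMap<>();
ts.reset();
while (ts.incrementToken()) {
    String term = charTermAttribute.toString().replace(":", "\\:");
    m.merge(term, 1, Integer::sum); // insert 1, or add 1 to the existing count
}
ts.end();
ts.close();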

From source file:nicta.com.au.failureanalysis.query.QueryGneration.java

private Map<String, Integer> getTerms(TokenStream ts, int treshold, String field) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    Map<String, Integer> qterm_freq = new HashMap<>();

    String q = "";
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int s = 0;
    while (ts.incrementToken()) {
        String term = charTermAttribute.toString().replace(":", "\\:");
        q += term + " ";
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1);
        }
        s++;
    }
    ts.end(); // complete the stream before closing
    ts.close();
    q = "";
    for (String k : m.keySet()) {
        if (m.get(k) >= treshold) {
            if (!Functions.isNumeric(k)) {
                q += k + "^" + m.get(k) + " ";
                qterm_freq.put(k, m.get(k));
            }
        }
    }
    if (field != null) {
        vocabulary.put(field, m);
    }
    fieldsSize.put(field, s);
    return qterm_freq;
}

From source file:nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java

private Map<String, Double> getVector(TokenStream ts, String field) throws IOException, Exception {
    Map<String, Double> m = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int i = 0;
    while (ts.incrementToken()) {
        i++;
        String term = charTermAttribute.toString();
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1.0);
        }
    }
    for (String key : m.keySet()) {
        Term t = new Term(field, key);
        int totalTF = ir.docFreq(t); // document frequency: the number of documents containing the term
        int docs = ir.getDocCount("claims");
        double idf = Math.log10((double) docs / (totalTF + 1));
        m.put(key, (m.get(key) / i) * idf);
    }

    return m;
}

From source file:nicta.com.au.patent.pac.analysis.FieldsJaccardSimilarities.java

private Map<String, Integer> transformation(TokenStream ts) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String term = charTermAttribute.toString();
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1);
        }
    }
    return m;
}

From source file:nicta.com.au.patent.pac.analysis.RecallAnalysis.java

private Set<String> transformation(TokenStream ts) throws IOException {
    Set<String> out = new HashSet<>();
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String term = charTermAttribute.toString();
        out.add(term);
    }
    return out;
}

From source file:nicta.com.au.patent.pac.search.PatentQuery.java

private String transformation(TokenStream ts, int treshold, String field) throws IOException {
    Map<String, Integer> m = new HashMap<>();
    String q = "";
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int s = 0;
    while (ts.incrementToken()) {
        String term = charTermAttribute.toString().replace(":", "\\:");
        q += term + " ";
        if (m.containsKey(term)) {
            m.put(term, m.get(term) + 1);
        } else {
            m.put(term, 1);
        }
        s++;
    }
    ts.end(); // complete the stream before closing
    ts.close();
    q = "";
    for (String k : m.keySet()) {
        if (m.get(k) >= treshold) {
            if (!Functions.isNumeric(k)) {
                q += k + "^" + 1 + " "; // fixed boost of 1; the term frequency m.get(k) is deliberately not used here
            }
        }
    }
    if (field != null) {
        vocabulary.put(field, m);
    }
    fieldsSize.put(field, s);
    return q;
}

From source file:nl.b3p.viewer.stripes.CatalogSearchActionBean.java

License:Open Source License

private static Or createOrFilter(String queryString, String propertyName) {
    List orList = new ArrayList();
    queryString = createQueryString(queryString, false);
    if (queryString != null && !queryString.trim().equals(defaultWildCard)) {

        propertyName = createPropertyName(propertyName);

        PropertyIsEqualTo propertyIsEqualTo = FilterCreator.createPropertyIsEqualTo(queryString, propertyName);

        StandardAnalyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_45,
                DutchAnalyzer.getDefaultStopSet());

        orList.add(propertyIsEqualTo);
        try {

            TokenStream tokenStream = standardAnalyzer.tokenStream("", queryString);
            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                int startOffset = offsetAttribute.startOffset();
                int endOffset = offsetAttribute.endOffset();
                String term = charTermAttribute.toString();
                PropertyIsLike propertyIsLike = FilterCreator.createPropertyIsLike(term, propertyName);
                orList.add(propertyIsLike);
            }
            tokenStream.end(); // signal end-of-stream before close()
            tokenStream.close();
        } catch (IOException e) {
            PropertyIsLike propertyIsLike = FilterCreator.createPropertyIsLike(queryString, propertyName);
            orList.add(propertyIsLike);
        }
    }

    Or or = new Or(new BinaryLogicOpType(orList));

    return or;
}

From source file:nl.cwi.helpers.NGramExtractor.java

License:Open Source License

/**
 * Extracts n-grams from a string of text.
 * Handles n-grams of any length and can optionally remove stop words before extraction.
 * @param text the text that the n-grams should be extracted from
 * @param minLength the minimum length of the n-grams
 * @param maxLength the maximum length of the n-grams
 * @param stopWords whether or not stop words should be removed before extraction
 */
public void extract(String text, int minLength, int maxLength, Boolean stopWords)
        throws FileNotFoundException, IOException {

    this.text = text;
    this.minLength = minLength;
    this.maxLength = maxLength;
    this.stopWords = stopWords;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /* If the minLength and maxLength are both 1, then we want unigrams
     * Make use of a StopAnalyzer when stopwords should be removed
     * Make use of a SimpleAnalyzer when stop words should be included
     */
    if ((minLength == 1) && (maxLength == 1)) {
        if (this.stopWords) {
            analyzer = new StopAnalyzer(Version.LUCENE_43);
        } else {
            analyzer = new SimpleAnalyzer(Version.LUCENE_43);
        }
    }

    else { //Bigger than unigrams so use ShingleAnalyzerWrapper. Once again, different analyzers depending on stop word removal
        if (this.stopWords) {
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_42), minLength, maxLength,
                    " ", false, false); // Hack: pass Version.LUCENE_42 so that position increments are not preserved; with a later version, removed stop words leave underscore (_) filler tokens in the shingles.
        } else {
            analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(Version.LUCENE_42), minLength, maxLength,
                    " ", false, false);
        }
    }

    //Code to process and extract the ngrams
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    int tokenCount = 0;
    tokenStream.reset();

    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String termToken = charTermAttribute.toString(); //The actual token term
        nGrams.add(termToken); //Add all ngrams to the ngram LinkedList

    }
    tokenStream.end();   // end() and close() complete the consumer contract
    tokenStream.close();

    //Store unique nGrams and frequencies in hash tables

    for (String nGram : nGrams) {
        if (nGramFreqs.containsKey(nGram)) {
            nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }

}
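
A hypothetical call, for illustration only; the method signature is taken from above, but the no-argument constructor and direct access to the result fields are assumptions about the rest of the class, which is not shown:

// Hypothetical usage sketch for the extract() method above.
NGramExtractor extractor = new NGramExtractor();
extractor.extract("the quick brown fox jumps over the lazy dog", 2, 3, false);
// After extract() returns, nGrams, uniqueNGrams and nGramFreqs hold the results.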