List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. Unlike addAttribute(Class), which registers the attribute if it is not yet present, getAttribute only looks up an existing attribute, so it should be called on a stream whose tokenizer is known to provide it.
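Before the per-project examples below, here is a minimal self-contained sketch of the standard call pattern (reset, incrementToken, end, close). It assumes a recent Lucene (5+) where StandardAnalyzer takes no Version argument; the field name "myfield" and the sample text are placeholders. StandardAnalyzer's tokenizer already declares CharTermAttribute, so getAttribute finds it here; for attributes that might be absent, check hasAttribute first or use addAttribute instead.

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("myfield", "Hello attribute world")) {
            // Look up the attribute once; the same instance is updated on every incrementToken().
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset(); // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end(); // finalize offsets and end-of-stream state
        } // try-with-resources closes the stream, then the analyzer
    }
}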
From source file:searching.QueryExpansion.java
/**
 * Calculate positional relevance weights.
 *
 * @param query
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException
 */
public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {
        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap<>();
        Integer length = 0;
        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        ArrayList<Long> qpos;
        try {
            ts.reset(); // resets this stream to the beginning (required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    // record every position at which a query term occurs
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }
                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        // All positions collected; iterate over the document again to compute weights.
        TreeMap<String, Double> map = new TreeMap<>();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega)
                        + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // resets this stream to the beginning (required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    prob = 0.0;
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            // Gaussian kernel over the distance between current and query-term positions
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);
                            w += f0;
                        }
                        prob += Math.log(w / w_norm);
                    }
                    // Sum the probabilities over the positional terms in the document.
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }
                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            sum += map.get(word) / pos_length;
        }
        // logger.info("sum is " + sum);

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;
        actual_pdocs++;
    }
}
From source file:snu.controladores.indexador.Parser.java
/**
 * Tokenizes a string (splits it into words and extracts each word's stem).
 *
 * @param analyzer
 * @param string
 * @return
 * @throws IOException
 */
private List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(null, new StringReader(string))) {
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end(); // finalize end-of-stream state before closing
    }
    return result;
}
From source file:snu.controladores.indexador.ProcessadorDeConsultas.java
/**
 * Tokenizes a string (splits it into words and extracts each word's stem).
 *
 * @param analyzer
 * @param string
 * @return
 * @throws IOException
 */
private List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(null, new StringReader(string))) {
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end(); // finalize end-of-stream state before closing
    }
    return result;
}
From source file:StopWords.StopWords.java
public String removeStopwords(String input) {
    TokenStream tokenStream = new ClassicTokenizer(Version.LUCENE_35, new StringReader(input));
    // remove stop words
    tokenStream = new StopFilter(Version.LUCENE_35, tokenStream, EnglishAnalyzer.getDefaultStopSet());
    // retrieve the remaining tokens (the set keeps the distinct tokens)
    Set<String> tokens = new HashSet<String>();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    StringBuilder str = new StringBuilder();
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokens.add(token.toString());
            str.append(token.toString()).append(' ');
        }
        tokenStream.end();
        tokenStream.close();
    } catch (IOException ex) {
        Logger.getLogger(StopWords.class.getName()).log(Level.SEVERE, null, ex);
    }
    return str.toString();
}
From source file:summarizer.KeywordsGuesser.java
License:Open Source License
public static String stemmize(String term) throws IOException {
    TokenStream tokenStream = new ClassicTokenizer(LUCENE_VERSION, new StringReader(term));
    tokenStream = new PorterStemFilter(tokenStream);
    Set<String> stems = new HashSet<String>();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        stems.add(token.toString());
    }
    tokenStream.end();
    tokenStream.close();
    // reject the term if it yields more than one stem
    if (stems.size() != 1) {
        return null;
    }
    String stem = stems.iterator().next();
    // reject stems containing anything other than word characters and '-'
    if (!stem.matches("[\\w-]+")) {
        return null;
    }
    return stem;
}
From source file:summarizer.KeywordsGuesser.java
License:Open Source License
public static List<Keyword> guessFromString(String input) throws IOException {
    // protect hyphens so they survive punctuation stripping, then drop
    // punctuation (except apostrophes and hyphens) and English contractions
    input = input.replaceAll("-+", "-0");
    input = input.replaceAll("[\\p{Punct}&&[^'-]]+", " ");
    input = input.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
    TokenStream tokenStream = new ClassicTokenizer(LUCENE_VERSION, new StringReader(input));
    tokenStream = new LowerCaseFilter(LUCENE_VERSION, tokenStream);
    tokenStream = new ClassicFilter(tokenStream);
    tokenStream = new ASCIIFoldingFilter(tokenStream);
    tokenStream = new StopFilter(LUCENE_VERSION, tokenStream, EnglishAnalyzer.getDefaultStopSet());
    List<Keyword> keywords = new LinkedList<Keyword>();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        String term = token.toString();
        String stem = stemmize(term);
        if (stem != null) {
            // group terms under their stem, restoring the protected hyphens
            Keyword keyword = find(keywords, new Keyword(stem.replaceAll("-0", "-")));
            keyword.add(term.replaceAll("-0", "-"));
        }
    }
    tokenStream.end();
    tokenStream.close();
    Collections.sort(keywords);
    return keywords;
}
From source file:TesterClasses.TestAnalyzer.java
public static List<String> tokenizeString(Analyzer analyzer, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // not thrown because we're reading from a StringReader
        throw new RuntimeException(e);
    }
    return result;
}
From source file:tfidf.TestTfIDF.java
License:CDDL License
public static ArrayList<String> cutWords(String line) throws IOException {
    ArrayList<String> words = new ArrayList<String>();
    IKAnalyzer analyzer = new IKAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(line));
    // the attribute instance is stable, so it can be looked up once outside the loop
    CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        words.add(termAttribute.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return words;
}
From source file:tw.com.kyle.luminance.LumPositionMap.java
public static LumPositionMap Get(String raw_text) throws IOException {
    StandardAnalyzer analyzer = new StandardAnalyzer();
    TokenStream tstream = analyzer.tokenStream("", raw_text);
    CharTermAttribute termAttr = tstream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offAttr = tstream.getAttribute(OffsetAttribute.class);
    List<String> tokens = new ArrayList<>();
    List<Integer> pos_list = new ArrayList<>();
    tstream.reset();
    while (tstream.incrementToken()) {
        // record each token together with its start offset in the raw text
        tokens.add(termAttr.toString());
        pos_list.add(offAttr.startOffset());
    }
    tstream.end();
    tstream.close();
    return new LumPositionMap(tokens, pos_list);
}
From source file:tw.com.kyle.luminance.LumWindow.java
public List<LumRange> BuildLumRange(long annot_uuid) throws IOException {
    Document adoc = lum_annot.GetAnnotDocument(annot_uuid);
    if (adoc == null) {
        return new ArrayList<>();
    }
    int doc_id = lum_reader.getDocId(adoc);
    TokenStream tokenStream = lum_reader.GetTokenStream(doc_id, "anno");
    if (tokenStream == null) {
        return null;
    }
    OffsetAttribute offAttr = tokenStream.getAttribute(OffsetAttribute.class);
    CharTermAttribute chAttr = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    List<LumRange> lr_list = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        // capture the term text and its character offsets as one range
        LumRange lr = new LumRange();
        lr.data = chAttr.toString();
        lr.start_off = offAttr.startOffset();
        lr.end_off = offAttr.endOffset();
        lr_list.add(lr);
    }
    tokenStream.end();
    tokenStream.close();
    return lr_list;
}