Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Source Link

Document

Returns the instance of the passed-in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value. If this AttributeSource does not contain an instance of the requested attribute class, an IllegalArgumentException is thrown; use addAttribute instead when the attribute should be created on demand.
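
A minimal consumer sketch (the field name "f", the sample text, and the choice of StandardAnalyzer are illustrative; the usual org.apache.lucene.analysis imports are assumed):

Analyzer analyzer = new StandardAnalyzer();
try (TokenStream ts = analyzer.tokenStream("f", new StringReader("a quick brown fox"))) {
    // getAttribute requires the attribute to already exist on the stream;
    // it throws IllegalArgumentException otherwise (addAttribute would create it)
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken()
    while (ts.incrementToken()) {
        System.out.println(termAtt.toString());
    }
    ts.end(); // consume end-of-stream state (final offsets)
} // try-with-resources closes the stream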

Usage

From source file:searching.QueryExpansion.java

/**
 * calculate positional relevance weights
 *
 * @param query
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException 
 */

public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {

    //System.out.println(query);
    //System.out.println(text);

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap<>();
        Integer length = 0;

        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));

        ArrayList<Long> qpos;
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    //System.out.print(pos + ":" + term + " ");
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }

                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        //
        // All positions collected
        // now iterate over the document again to get weights
        //
        //System.out.println("Doc length" + text.length());
        //System.out.println("Positions... ");
        //System.out.println(query_term_pos.toString());
        //System.out.println("END...");
        TreeMap<String, Double> map = new TreeMap<>();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega)
                        + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    prob = 0.0;
                    //f is occurrence
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);

                            //if (QueryExpansion.method == QueryExpansion.PRM2QTM){
                            //w += (((double) ((1 - spud_pi) * f0) / (((1 - spud_pi) * f0 ) + spud_pi * (df / sum_df))));
                            //    w += f0;
                            //}else{
                            w += f0;
                            //}

                        }
                        //System.out.println("weight " + w );
                        prob += Math.log(w / w_norm);
                    }

                    //System.out.print(pos + "\t" + term + "\t" +  Math.exp(prob) + "\n");

                    // sum of the probabilities over the positional terms in the document
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }
                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            //logger.info(word + "\t" + map.get(word)/pos_length);
            sum += map.get(word) / pos_length;
        }
        //logger.info("sum is " + sum);

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;

        actual_pdocs++;
    }
}
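
The second pass above weights each document position against the collected query-term positions with a Gaussian kernel: an occurrence at distance zero contributes exp(0) = 1, and the contribution decays with the squared positional distance over 2 * sigma^2, with w_norm = sqrt(2 * pi * sigma^2) as the normalizing constant. A minimal sketch of that kernel in isolation (the method name and the sigma parameter are illustrative stand-ins for the prm_sigma field used above):

// Contribution of a query-term occurrence at position p to the token
// at position pos, with bandwidth sigma; yields 1.0 at distance zero.
static double gaussianKernel(long pos, long p, double sigma) {
    double dist = ((pos - p) * (pos - p)) / (2 * sigma * sigma);
    return Math.exp(-dist);
}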

From source file:snu.controladores.indexador.Parser.java

/**
 * Tokenizes a string (splits it into words and extracts each word's stem)
 *
 * @param analyzer
 * @param string
 * @return
 * @throws IOException
 */
private List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
    try {
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end(); // record the end-of-stream state
    } finally {
        stream.close(); // required before the analyzer can hand out another stream
    }
    return result;
}

From source file:snu.controladores.indexador.ProcessadorDeConsultas.java

/**
 * Tokenizes a string (splits it into words and extracts each word's stem)
 *
 * @param analyzer
 * @param string
 * @return
 * @throws IOException
 */
private List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<>();

    TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
    try {
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end(); // record the end-of-stream state
    } finally {
        stream.close(); // required before the analyzer can hand out another stream
    }
    return result;
}

From source file:StopWords.StopWords.java

public String removeStopwords(String input) {
    TokenStream tokenStream = new ClassicTokenizer(Version.LUCENE_35, new StringReader(input));
    // remove stop words
    tokenStream = new StopFilter(Version.LUCENE_35, tokenStream, EnglishAnalyzer.getDefaultStopSet());

    // retrieve the remaining tokens
    Set<String> tokens = new HashSet<String>();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    StringBuilder str = new StringBuilder();
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokens.add(token.toString());
            str.append(token.toString()).append(' ');
        }
        tokenStream.end();
        tokenStream.close();
    } catch (IOException ex) {
        Logger.getLogger(StopWords.class.getName()).log(Level.SEVERE, null, ex);
    }
    return str.toString();
}

From source file:summarizer.KeywordsGuesser.java

License:Open Source License

public static String stemmize(String term) throws IOException {

    TokenStream tokenStream = new ClassicTokenizer(LUCENE_VERSION, new StringReader(term));

    tokenStream = new PorterStemFilter(tokenStream);

    Set<String> stems = new HashSet<String>();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        stems.add(token.toString());
    }
    tokenStream.end();
    tokenStream.close();

    if (stems.size() != 1) {
        return null;
    }

    String stem = stems.iterator().next();

    if (!stem.matches("[\\w-]+")) {
        return null;
    }

    return stem;
}

From source file:summarizer.KeywordsGuesser.java

License:Open Source License

public static List<Keyword> guessFromString(String input) throws IOException {

    input = input.replaceAll("-+", "-0");
    input = input.replaceAll("[\\p{Punct}&&[^'-]]+", " ");
    input = input.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
    TokenStream tokenStream = new ClassicTokenizer(LUCENE_VERSION, new StringReader(input));
    tokenStream = new LowerCaseFilter(LUCENE_VERSION, tokenStream);
    tokenStream = new ClassicFilter(tokenStream);
    tokenStream = new ASCIIFoldingFilter(tokenStream);
    tokenStream = new StopFilter(LUCENE_VERSION, tokenStream, EnglishAnalyzer.getDefaultStopSet());
    List<Keyword> keywords = new LinkedList<Keyword>();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        String term = token.toString();
        String stem = stemmize(term);
        if (stem != null) {
            Keyword keyword = find(keywords, new Keyword(stem.replaceAll("-0", "-")));
            keyword.add(term.replaceAll("-0", "-"));
        }
    }
    tokenStream.end();
    tokenStream.close();
    Collections.sort(keywords);
    return keywords;
}

From source file:TesterClasses.TestAnalyzer.java

public static List<String> tokenizeString(Analyzer analyzer, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
    return result;
}

From source file:tfidf.TestTfIDF.java

License:CDDL license

public static ArrayList<String> cutWords(String line) throws IOException {

    ArrayList<String> words = new ArrayList<String>();
    //        String text = ReadFiles.readFile(file);

    IKAnalyzer analyzer = new IKAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(line));
    // look the attribute up once; it stays valid across incrementToken() calls
    CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        words.add(termAttribute.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return words;
}

From source file:tw.com.kyle.luminance.LumPositionMap.java

public static LumPositionMap Get(String raw_text) throws IOException {
    StandardAnalyzer analyzer = new StandardAnalyzer();
    TokenStream tstream = analyzer.tokenStream("", raw_text);

    CharTermAttribute termAttr = tstream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offAttr = tstream.getAttribute(OffsetAttribute.class);
    // PositionIncrementAttribute posIncAttr = tstream.getAttribute(PositionIncrementAttribute.class);        
    // PositionLengthAttribute posLenAttr = tstream.getAttribute(PositionLengthAttribute.class);

    List<String> tokens = new ArrayList<>();
    List<Integer> pos_list = new ArrayList<>();

    tstream.reset();
    while (tstream.incrementToken()) {
        tokens.add(termAttr.toString());
        pos_list.add(offAttr.startOffset());
    }
    tstream.end();
    tstream.close();

    return new LumPositionMap(tokens, pos_list);
}

From source file:tw.com.kyle.luminance.LumWindow.java

public List<LumRange> BuildLumRange(long annot_uuid) throws IOException {
    Document adoc = lum_annot.GetAnnotDocument(annot_uuid);
    if (adoc == null) {
        return new ArrayList<>();
    }

    int doc_id = lum_reader.getDocId(adoc);
    TokenStream tokenStream = lum_reader.GetTokenStream(doc_id, "anno");
    if (tokenStream == null) {
        return null;
    }

    OffsetAttribute offAttr = tokenStream.getAttribute(OffsetAttribute.class);
    CharTermAttribute chAttr = tokenStream.getAttribute(CharTermAttribute.class);

    tokenStream.reset();
    List<LumRange> lr_list = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        LumRange lr = new LumRange();
        lr.data = chAttr.toString();
        lr.start_off = offAttr.startOffset();
        lr.end_off = offAttr.endOffset();
        lr_list.add(lr);
    }
    tokenStream.end();
    tokenStream.close();

    return lr_list;
}