Example usage for org.apache.lucene.analysis TokenStream end

List of usage examples for org.apache.lucene.analysis TokenStream end

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream#end().

Prototype

public void end() throws IOException 

Document

This method is called by the consumer after the last token has been consumed, i.e. after #incrementToken() returned false (using the new TokenStream API). It gives the stream a chance to perform end-of-stream operations, such as setting the final offset.
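
For reference, here is the canonical consume loop implied by that contract: reset() before the first incrementToken(), end() once incrementToken() returns false, and close() in a finally block. A minimal self-contained sketch, assuming Lucene 4.x (where StandardAnalyzer still takes a Version constant); the field name "body" and the sample text are placeholders:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class TokenStreamEndExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
        TokenStream ts = analyzer.tokenStream("body", new StringReader("The quick brown fox"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                     // sets end-of-stream state, e.g. the final offset
            System.out.println("final offset: " + offset.endOffset());
        } finally {
            ts.close();                   // release resources even if tokenization failed
        }
        analyzer.close();
    }
}

All of the examples below follow this shape, with varying degrees of exception safety.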

Usage

From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java

License:Apache License

public void addFirstDocuments(File[] docs) {
    try {
        //         File f = new File(path);
        //         File[] docs = f.listFiles();
        for (int i = 0; i < docs.length; i++) {
            String content = Utils.readFileAsString(docs[i]);
            List<WordKey> cipheredWords = new ArrayList<WordKey>();
            TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(content)));
            try {
                ts.reset();
                while (ts.incrementToken()) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    if (word.length() > 0)
                        cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                }
                ts.end();
            } finally {
                ts.close();
            }
            search.addFirstDocuments(crypto.encryptAES(docs[i].getName().getBytes()), cipheredWords);
            storage.putDoc("" + i,
                    crypto.encryptAES(Utils.serializeObject(new PDocument(docs[i].getName(), content))));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:relevantfile.XmlParser.java

License:Open Source License

/********************************************************************************************/
public static String removeStopWordsAndStem(String input) throws IOException {
    /*String[] stop_word={"abstract","assert","boolean","break","byte","case","catch","char","class","const","continue"
    ,"default","do","double","else","enum","extends","final","finally","float","for","goto","if","implements","import","instanceof","int"
    ,"interface","long","native","new","package","private","protected","public","return","short","static","strictfp","super",
    "switch","synchronized","this","throw","throws","transient","try","void","volatile","while","false","null","true"};*/
    String[] stop_word = { "auto", "break", "case", "char", "const", "continue", "default", "do", "double",
            "else", "enum", "extern", "float", "for", "goto", "if", "int", "long", "register", "return",
            "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void",
            "volatile", "while", "abstract", "as", "base", "bool", "byte", "catch", "checked", "class",
            "decimal", "delegate", "event", "explicit", "false", "finally", "fixed", "foreach", "implicit",
            "in", "interface", "internal", "is", "lock", "namespace", "new", "null", "object", "operator",
            "out", "override", "params", "private", "protected", "public", "readonly", "ref", "sbyte", "sealed",
            "stackalloc", "string", "this", "throw", "true", "try", "typeof", "uint", "ulong", "unchecked",
            "unsafe", "ushort", "using", "virtual" };
    ArrayList<String> stopWords = new ArrayList<String>();
    for (int k = 0; k < stop_word.length; k++)
        stopWords.add(stop_word[k]);
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_46, new StringReader(input));
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream, StandardAnalyzer.STOP_WORDS_SET);
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream,
            StopFilter.makeStopSet(Version.LUCENE_46, stopWords));
    tokenStream = new PorterStemFilter(tokenStream);
    StringBuilder sb = new StringBuilder();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (sb.length() > 0) {
            sb.append(" ");
        }
        sb.append(token.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return sb.toString();
}
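
Note that end() and close() are not guarded by a finally block here, so the stream leaks if incrementToken() throws. A sketch of the same pipeline using try-with-resources instead, assuming Lucene 4.x (where TokenStream implements Closeable); end() must still be called before the implicit close():

try (TokenStream tokenStream = new PorterStemFilter(
        new StopFilter(Version.LUCENE_46,
                new StopFilter(Version.LUCENE_46,
                        new StandardTokenizer(Version.LUCENE_46, new StringReader(input)),
                        StandardAnalyzer.STOP_WORDS_SET),
                StopFilter.makeStopSet(Version.LUCENE_46, stopWords)))) {
    StringBuilder sb = new StringBuilder();
    CharTermAttribute token = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (sb.length() > 0) {
            sb.append(" ");
        }
        sb.append(token.toString());
    }
    tokenStream.end(); // still required: records end-of-stream state before close()
    return sb.toString();
}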

From source file:retriever.TermFreq.java

String analyze(String query) throws Exception {
    StringBuffer buff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(query));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}

From source file:ri.trabri.Lucene.java

protected ArrayList<String> geraTokens(String text) throws IOException {
    TokenStream stream = this.analyzer.tokenStream(null, new StringReader(text));
    ArrayList<String> words = new ArrayList<>();

    CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        //System.out.println(cattr.toString());
        words.add(cattr.toString());
    }
    stream.end();
    stream.close();
    return words;
}

From source file:searching.QueryExpansion.java

/**
 * store frequencies of top docs in maps
 * 
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException 
 */
public void addExpansionDoc(String text, double doc_score, Analyzer analyzer, IndexReader reader)
        throws IOException {

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, Double> map = new TreeMap<>();

        Integer length = 0;
        Double f;

        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    f = map.get(term);

                    if (f == null) {
                        map.put(term, 1.0);
                    } else {
                        map.put(term, f + 1.0);
                    }
                    length++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_scores[actual_pdocs] = doc_score;

        //logger.info(observed_bg_mass[iter] + "\t" + (1-observed_bg_prob));
        actual_pdocs++;
    }
}

From source file:searching.QueryExpansion.java

/**
 * calculate positional relevance weights
 *
 * @param query
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException 
 */

public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {

    //System.out.println(query);
    //System.out.println(text);

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap<>();
        Integer length = 0;

        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));

        ArrayList<Long> qpos;
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    //System.out.print(pos + ":" + term + " ");
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }

                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        //
        // All positions collected
        // now iterate over the document again to get weights
        //
        //System.out.println("Doc length" + text.length());
        //System.out.println("Positions... ");
        //System.out.println(query_term_pos.toString());
        //System.out.println("END...");
        TreeMap<String, Double> map = new TreeMap<>();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega)
                        + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    prob = 0.0;
                    //f is occurrence
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);

                            //if (QueryExpansion.method == QueryExpansion.PRM2QTM){
                            //w += (((double) ((1 - spud_pi) * f0) / (((1 - spud_pi) * f0 ) + spud_pi * (df / sum_df))));
                            //    w += f0;
                            //}else{
                            w += f0;
                            //}

                        }
                        //System.out.println("weight " + w );
                        prob += Math.log(w / w_norm);
                    }

                    //System.out.print(pos + "\t" + term + "\t" +  Math.exp(prob) + "\n");

                    /** sum of the probabilities over the positional terms in the documents*/
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }
                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            //logger.info(word + "\t" + map.get(word)/pos_length);
            sum += map.get(word) / pos_length;
        }
        //logger.info("sum is " + sum);

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;

        actual_pdocs++;
    }
}
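
In the second tokenization pass above, each document position pos is scored against every recorded occurrence p of each query term qt with a Gaussian proximity kernel; in the code's own identifiers:

    f0 = exp(-(pos - p)^2 / (2 * prm_sigma^2))
    w = 1 + sum of f0 over the recorded positions of qt
    score(pos) = product over qt of (w / w_norm), where w_norm = sqrt(2 * pi * prm_sigma^2)

The product is evaluated in log space (prob += Math.log(w / w_norm)) and exponentiated back, so positions near query-term occurrences dominate the expansion weights accumulated in map. (This reading follows the code alone; spud_pi is computed but only used in the commented-out PRM2QTM branch.)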

From source file:servlets.TermStatsComparator.java

String analyze(String query) {
    StringBuffer buff = new StringBuffer();
    try {
        Analyzer analyzer = retriever.getAnalyzer();
        TokenStream stream = analyzer.tokenStream("dummy", new StringReader(query));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            buff.append(term);
            break; // keep only the first analyzed term
        }
        stream.end();
        stream.close();
    } catch (Exception ex) {
        ex.printStackTrace();
        return query;
    }
    return buff.toString();
}

From source file:stackoverflow.lucene.modified.MoreLikeThis.java

License:Apache License

/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    int tokenCount = 0;
    // for every token
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String word = termAtt.toString();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }

        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}

From source file:test.AnalzyerDemo.java

License:Apache License

public static void main(String[] args) {
    Analyzer analyzer = new BaseAnalyzer();
    // Analyzer analyzer = new org.apache.lucene.analysis.cjk.CJKAnalyzer();
    // obtain a TokenStream from the Analyzer
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "????????????????2?3noneok???BaseAnalyer can analysis english text too"));
        // offset attribute: start/end character offsets of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // reset the TokenStream (repositions the underlying StringReader)
        ts.reset();
        // iterate over the tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // finished consuming the TokenStream
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
        analyzer.close();
    } finally {
        // release the TokenStream's resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

From source file:tweetembeding.AnalyzerClass.java

public String analizeString(String FIELD, String txt) throws IOException {
    this.analyzer = setAnalyzer();
    TokenStream stream = analyzer.tokenStream(FIELD, new StringReader(txt));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    StringBuffer tokenizedContentBuff = new StringBuffer();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        if (!term.equals("nbsp"))
            tokenizedContentBuff.append(term).append(" ");
    }

    stream.end();
    stream.close();

    return tokenizedContentBuff.toString();
}