Example usage for org.apache.lucene.analysis TokenStream reset

List of usage examples for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find usage examples for the reset() method of org.apache.lucene.analysis.TokenStream.

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using #incrementToken().
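
Before the individual examples, here is a minimal, self-contained sketch of that contract. It is a sketch only: it assumes a Lucene version in which StandardAnalyzer has a no-argument constructor, and the class name TokenStreamResetExample, the helper method tokenize and the field name "field" are illustrative, not taken from any of the source files below. The consumer calls reset() once before the first incrementToken(), end() after the last token, and finally close().

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetExample {

    // Sketch of the standard consumer workflow: reset() -> incrementToken() loop -> end() -> close().
    public static List<String> tokenize(String text) throws IOException {
        List<String> tokens = new ArrayList<>();
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream ts = analyzer.tokenStream("field", new StringReader(text))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                       // must be called before the first incrementToken()
            while (ts.incrementToken()) {     // advances to the next token, returns false at end of stream
                tokens.add(termAtt.toString());
            }
            ts.end();                         // records end-of-stream state (e.g. the final offset)
        }                                     // try-with-resources closes the stream and the analyzer
        return tokens;
    }
}

The examples below follow the same reset/incrementToken/end/close sequence, usually with an explicit try/finally block instead of try-with-resources.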

Usage

From source file: pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKIHom.java

License: Apache License

@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);

        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<HomPosting> homScores = search.processQuery(cipheredWords);
        for (HomPosting posting : homScores) {
            finalScores.add(new Posting(posting.getDocId(), crypto.decryptHom(posting.getScore()).intValue()));
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file: pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java

License: Apache License

public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);

        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());

            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
            //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
            //   docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));

            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file: pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java

License: Open Source License

public void addFirstDocuments(File[] docs) {
    try {
        //         File f = new File(path);
        //         File[] docs = f.listFiles();
        for (int i = 0; i < docs.length; i++) {
            String content = Utils.readFileAsString(docs[i]);
            List<WordKey> cipheredWords = new ArrayList<WordKey>();
            TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(content)));
            try {
                ts.reset();
                while (ts.incrementToken()) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    if (word.length() > 0)
                        cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                }
                ts.end();
            } finally {
                ts.close();
            }
            search.addFirstDocuments(crypto.encryptAES(docs[i].getName().getBytes()), cipheredWords);
            storage.putDoc("" + i,
                    crypto.encryptAES(Utils.serializeObject(new PDocument(docs[i].getName(), content))));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file: relevantfile.XmlParser.java

License: Open Source License

/********************************************************************************************/
public static String removeStopWordsAndStem(String input) throws IOException {
    /*String[] stop_word={"abstract","assert","boolean","break","byte","case","catch","char","class","const","continue"
    ,"default","do","double","else","enum","extends","final","finally","float","for","goto","if","implements","import","instanceof","int"
    ,"interface","long","native","new","package","private","protected","public","return","short","static","strictfp","super",
    "switch","synchronized","this","throw","throws","transient","try","void","volatile","while","false","null","true"};*/
    String[] stop_word = { "auto", "break", "case", "char", "const", "continue", "default", "do", "double",
            "else", "enum", "extern", "float", "for", "goto", "if", "int", "long", "register", "return",
            "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void",
            "volatile", "while", "abstract", "as", "base", "bool", "byte", "catch", "checked", "class",
            "decimal", "delegate", "event", "explicit", "false", "finally", "fixed", "foreach", "implicit",
            "in", "interface", "internal", "is", "lock", "namespace", "new", "null", "object", "operator",
            "out", "override", "params", "private", "protected", "public", "readonly", "ref", "sbyte", "sealed",
            "stackalloc", "string", "this", "throw", "true", "try", "typeof", "uint", "ulong", "unchecked",
            "unsafe", "ushort", "using", "virtual" };
    ArrayList<String> stopWords = new ArrayList<String>();
    for (int k = 0; k < stop_word.length; k++)
        stopWords.add(stop_word[k]);
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_46, new StringReader(input));
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream, StandardAnalyzer.STOP_WORDS_SET);
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream,
            StopFilter.makeStopSet(Version.LUCENE_46, stopWords));
    tokenStream = new PorterStemFilter(tokenStream);
    StringBuilder sb = new StringBuilder();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (sb.length() > 0) {
            sb.append(" ");
        }
        sb.append(token.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return sb.toString();
}

From source file: retriever.TermFreq.java

String analyze(String query) throws Exception {
    StringBuffer buff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(query));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}

From source file: ri.trabri.Lucene.java

protected ArrayList<String> geraTokens(String text) throws IOException {
    TokenStream stream = this.analyzer.tokenStream(null, new StringReader(text));
    ArrayList<String> words = new ArrayList<>();

    CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        //System.out.println(cattr.toString());
        words.add(cattr.toString());
    }
    stream.end();
    stream.close();
    return words;
}

From source file: se.inera.intyg.webcert.web.service.diagnos.repo.DiagnosRepositoryImpl.java

License: Open Source License

@Override
public List<Diagnos> searchDiagnosisByDescription(String searchString, int nbrOfResults) {
    if (Strings.isNullOrEmpty(searchString)) {
        return Collections.emptyList();
    }
    BooleanQuery query = new BooleanQuery();
    try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
        TokenStream tokenStream = analyzer.tokenStream(DESC, searchString);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = WildcardQuery.WILDCARD_STRING + charTermAttribute.toString()
                    + WildcardQuery.WILDCARD_STRING;
            query.add(new WildcardQuery(new Term(DESC, term)), BooleanClause.Occur.MUST);
        }
    } catch (IOException e) {
        throw new RuntimeException("IOException occurred in lucene index search", e);
    }
    return searchDiagnosisByQuery(query, nbrOfResults);
}

From source file: searching.QueryExpansion.java

/**
 * store frequencies of top docs in maps
 * 
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException 
 */
public void addExpansionDoc(String text, double doc_score, Analyzer analyzer, IndexReader reader)
        throws IOException {

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, Double> map = new TreeMap();

        Integer length = 0;
        Double f;

        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    f = map.get(term);

                    if (f == null) {
                        map.put(term, 1.0);
                    } else {
                        map.put(term, f + 1.0);
                    }
                    length++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_scores[actual_pdocs] = doc_score;

        //logger.info(observed_bg_mass[iter] + "\t" + (1-observed_bg_prob));
        actual_pdocs++;
    }
}

From source file: searching.QueryExpansion.java

/**
 * calculate positional relevance weights
 *
 * @param query
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException 
 */

public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {

    //System.out.println(query);
    //System.out.println(text);

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap();
        Integer length = 0;

        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));

        ArrayList<Long> qpos;
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    //System.out.print(pos + ":" + term + " ");
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }

                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        //
        // All positions collected
        // now iterate over the document again to get weights
        //
        //System.out.println("Doc length" + text.length());
        //System.out.println("Positions... ");
        //System.out.println(query_term_pos.toString());
        //System.out.println("END...");
        TreeMap<String, Double> map = new TreeMap();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega)
                        + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    prob = 0.0;
                    //f is occurrence
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);

                            //if (QueryExpansion.method == QueryExpansion.PRM2QTM){
                            //w += (((double) ((1 - spud_pi) * f0) / (((1 - spud_pi) * f0 ) + spud_pi * (df / sum_df))));
                            //    w += f0;
                            //}else{
                            w += f0;
                            //}

                        }
                        //System.out.println("weight " + w );
                        prob += Math.log(w / w_norm);
                    }

                    //System.out.print(pos + "\t" + term + "\t" +  Math.exp(prob) + "\n");

                    /** sum of the probabilities over the positional terms in the documents*/
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }
                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            //logger.info(word + "\t" + map.get(word)/pos_length);
            sum += map.get(word) / pos_length;
        }
        //logger.info("sum is " + sum);

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;

        actual_pdocs++;
    }
}

From source file: servlets.TermStatsComparator.java

String analyze(String query) {
    StringBuffer buff = new StringBuffer();
    try {
        Analyzer analyzer = retriever.getAnalyzer();
        TokenStream stream = analyzer.tokenStream("dummy", new StringReader(query));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            buff.append(term);
            break;
        }
        stream.end();
        stream.close();
    } catch (Exception ex) {
        ex.printStackTrace();
        return query;
    }
    return buff.toString();
}