Example usage for org.apache.lucene.analysis TokenStream close

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream.close().

Prototype

@Override
public void close() throws IOException 

Document

Releases resources associated with this stream.
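
Before the examples, here is a minimal, self-contained sketch of the usual consumer workflow around close(): reset(), incrementToken(), end(), then close(). It is not taken from the sources listed under Usage; it assumes a recent Lucene release (5.x or later, where StandardAnalyzer has a no-argument constructor and Analyzer.tokenStream(String, String) is available) and uses a hypothetical field name "body". Because TokenStream implements Closeable, try-with-resources can replace the explicit try/finally used in most of the examples below.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseExample {

    /** Tokenizes text and always releases the stream's resources via close(). */
    static List<String> tokenize(Analyzer analyzer, String text) throws IOException {
        List<String> terms = new ArrayList<>();
        // TokenStream implements Closeable, so try-with-resources calls close()
        // even if reset(), incrementToken(), or end() throws.
        try (TokenStream ts = analyzer.tokenStream("body", text)) { // "body" is an arbitrary field name
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                      // required before the first incrementToken()
            while (ts.incrementToken()) {
                terms.add(termAtt.toString());
            }
            ts.end();                        // records end-of-stream attribute state
        }                                    // close() releases resources associated with the stream
        return terms;
    }

    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) { // no-arg constructor: Lucene 5.x+
            System.out.println(tokenize(analyzer, "Releases resources associated with this stream."));
        }
    }
}

The explicit try/finally pattern in the examples that follow is equivalent; the invariant in both styles is that close() runs even when incrementToken() throws.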

Usage

From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKI.java

License:Apache License

@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);

        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());

            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
            //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
            //   docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));

            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKI.java

License:Apache License

@Override
public void addFirstDocuments(String xmlFile) {
    WikiXMLParser wxsp = WikiXMLParserFactory.getSAXParser(xmlFile);
    try {
        wxsp.setPageCallback(new PageCallbackHandler() {
            public void process(WikiPage page) {
                if (page.isDisambiguationPage() || page.isRedirect() || page.isSpecialPage())
                    return;
                List<WordKey> cipheredWords = new ArrayList<WordKey>();
                try {
                    TokenStream ts = analyzer.tokenStream(null,
                            new BufferedReader(new StringReader(page.getText())));
                    try {
                        ts.reset();
                        while (ts.incrementToken()) {
                            String word = ts.getAttribute(CharTermAttribute.class).toString();
                            if (word.length() > 0)
                                cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                        }
                        ts.end();
                    } finally {
                        ts.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
                search.addFirstDocuments(new CDocument(new WordKey(crypto.digest(page.getTitle().getBytes())),
                        cipheredWords.toArray(new WordKey[cipheredWords.size()])));
                //store doc in the cloud
                //            cloud.putDoc(""+i, crypto.encryptDocAES(documents[i]));
            }
        });
        wxsp.parse();
        search.buildIndex();
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKIHom.java

License:Apache License

@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);

        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<HomPosting> homScores = search.processQuery(cipheredWords);
        for (HomPosting posting : homScores) {
            finalScores.add(new Posting(posting.getDocId(), crypto.decryptHom(posting.getScore()).intValue()));
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java

License:Apache License

public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);

        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());

            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
            //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
            //   docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));

            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java

License:Apache License

public void addFirstDocuments(File[] docs) {
    try {
        //         File f = new File(path);
        //         File[] docs = f.listFiles();
        for (int i = 0; i < docs.length; i++) {
            String content = Utils.readFileAsString(docs[i]);
            List<WordKey> cipheredWords = new ArrayList<WordKey>();
            TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(content)));
            try {
                ts.reset();
                while (ts.incrementToken()) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    if (word.length() > 0)
                        cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                }
                ts.end();
            } finally {
                ts.close();
            }
            search.addFirstDocuments(crypto.encryptAES(docs[i].getName().getBytes()), cipheredWords);
            storage.putDoc("" + i,
                    crypto.encryptAES(Utils.serializeObject(new PDocument(docs[i].getName(), content))));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:relevantfile.XmlParser.java

License:Open Source License

/********************************************************************************************/
public static String removeStopWordsAndStem(String input) throws IOException {
    /*String[] stop_word={"abstract","assert","boolean","break","byte","case","catch","char","class","const","continue"
    ,"default","do","double","else","enum","extends","final","finally","float","for","goto","if","implements","import","instanceof","int"
    ,"interface","long","native","new","package","private","protected","public","return","short","static","strictfp","super",
    "switch","synchronized","this","throw","throws","transient","try","void","volatile","while","false","null","true"};*/
    String[] stop_word = { "auto", "break", "case", "char", "const", "continue", "default", "do", "double",
            "else", "enum", "extern", "float", "for", "goto", "if", "int", "long", "register", "return",
            "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void",
            "volatile", "while", "abstract", "as", "base", "bool", "byte", "catch", "checked", "class",
            "decimal", "delegate", "event", "explicit", "false", "finally", "fixed", "foreach", "implicit",
            "in", "interface", "internal", "is", "lock", "namespace", "new", "null", "object", "operator",
            "out", "override", "params", "private", "protected", "public", "readonly", "ref", "sbyte", "sealed",
            "stackalloc", "string", "this", "throw", "true", "try", "typeof", "uint", "ulong", "unchecked",
            "unsafe", "ushort", "using", "virtual" };
    ArrayList<String> stopWords = new ArrayList<String>();
    for (int k = 0; k < stop_word.length; k++)
        stopWords.add(stop_word[k]);
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_46, new StringReader(input));
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream, StandardAnalyzer.STOP_WORDS_SET);
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream,
            StopFilter.makeStopSet(Version.LUCENE_46, stopWords));
    tokenStream = new PorterStemFilter(tokenStream);
    StringBuilder sb = new StringBuilder();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (sb.length() > 0) {
            sb.append(" ");
        }
        sb.append(token.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return sb.toString();
}

From source file:retriever.TermFreq.java

String analyze(String query) throws Exception {
    StringBuffer buff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(query));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}

From source file:ri.trabri.Lucene.java

protected ArrayList<String> geraTokens(String text) throws IOException {
    TokenStream stream = this.analyzer.tokenStream(null, new StringReader(text));
    ArrayList<String> words = new ArrayList<>();

    CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        //System.out.println(cattr.toString());
        words.add(cattr.toString());
    }
    stream.end();
    stream.close();
    return words;
}

From source file:searching.QueryExpansion.java

/**
 * Stores the term frequencies of a top-ranked (pseudo-relevant) document in the expansion maps.
 * 
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException 
 */
public void addExpansionDoc(String text, double doc_score, Analyzer analyzer, IndexReader reader)
        throws IOException {

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, Double> map = new TreeMap<>();

        Integer length = 0;
        Double f;

        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    f = map.get(term);

                    if (f == null) {
                        map.put(term, 1.0);
                    } else {
                        map.put(term, f + 1.0);
                    }
                    length++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_scores[actual_pdocs] = doc_score;

        //logger.info(observed_bg_mass[iter] + "\t" + (1-observed_bg_prob));
        actual_pdocs++;
    }
}

From source file:searching.QueryExpansion.java

/**
 * Calculates positional relevance weights for a pseudo-relevant document.
 *
 * @param query
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException 
 */

public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {

    //System.out.println(query);
    //System.out.println(text);

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap<>();
        Integer length = 0;

        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));

        ArrayList<Long> qpos;
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    //System.out.print(pos + ":" + term + " ");
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }

                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        //
        // All positions collected
        // now iterate over the document again to get weights
        //
        //System.out.println("Doc length" + text.length());
        //System.out.println("Positions... ");
        //System.out.println(query_term_pos.toString());
        //System.out.println("END...");
        TreeMap<String, Double> map = new TreeMap<>();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega)
                        + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    prob = 0.0;
                    //f is occurrence
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);

                            //if (QueryExpansion.method == QueryExpansion.PRM2QTM){
                            //w += (((double) ((1 - spud_pi) * f0) / (((1 - spud_pi) * f0 ) + spud_pi * (df / sum_df))));
                            //    w += f0;
                            //}else{
                            w += f0;
                            //}

                        }
                        //System.out.println("weight " + w );
                        prob += Math.log(w / w_norm);
                    }

                    //System.out.print(pos + "\t" + term + "\t" +  Math.exp(prob) + "\n");

                    /** sum of the probabilities over the positional terms in the documents*/
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }
                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            //logger.info(word + "\t" + map.get(word)/pos_length);
            sum += map.get(word) / pos_length;
        }
        //logger.info("sum is " + sum);

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;

        actual_pdocs++;
    }
}