Java tutorial

The listing below is the client-side connector of the cryptosearch searchable-encryption prototype. It parses a Wikipedia XML dump, tokenizes and stems each article with a Lucene analyzer, encrypts every term before handing the documents to a SPIMI-based cloud index, and at query time encrypts the query terms, decrypts the returned posting lists, and ranks the matching documents locally.
/**
 * Copyright 2013 Bernardo Luís da Silva Ferreira
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package pt.unlfctdi.cryptosearch.core.client;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.rmi.RemoteException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;

import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

import pt.unlfctdi.cryptosearch.cloud.data.document.CDocument;
import pt.unlfctdi.cryptosearch.cloud.data.document.PDocument;
import pt.unlfctdi.cryptosearch.cloud.data.posting.CipheredPostingList;
import pt.unlfctdi.cryptosearch.cloud.data.posting.Posting;
import pt.unlfctdi.cryptosearch.cloud.data.posting.PostingList;
import pt.unlfctdi.cryptosearch.cloud.data.posting.TermFreq;
import pt.unlfctdi.cryptosearch.cloud.data.searchCipher.WordKey;
import pt.unlfctdi.cryptosearch.cloud.search.SPIMI_WIKI;
import pt.unlfctdi.cryptosearch.cloud.storage.CloudStorageRemote;
import pt.unlfctdi.cryptosearch.core.crypto.ClientCryptoBean;

import edu.jhu.nlp.wikipedia.PageCallbackHandler;
import edu.jhu.nlp.wikipedia.WikiPage;
import edu.jhu.nlp.wikipedia.WikiXMLParser;
import edu.jhu.nlp.wikipedia.WikiXMLParserFactory;

public class ClientConnectorBeanWIKI implements ClientConnectorLocal {

    private CloudStorageRemote cloud;
    private ClientCryptoBean crypto;
    private SPIMI_WIKI search;
    private Analyzer analyzer;

    private static Logger log = Logger.getLogger(ClientConnectorBeanWIKI.class);

    public ClientConnectorBeanWIKI() {
        crypto = new ClientCryptoBean();
        search = new SPIMI_WIKI();
        analyzer = new EnglishAnalyzer(Version.LUCENE_40);
        // analyzer = new PortugueseAnalyzer(Version.LUCENE_40);
        // analyzer = new SpanishAnalyzer(Version.LUCENE_40);
    }

    @Override
    public List<Posting> query(String query) {
        try {
            List<Posting> finalScores = new ArrayList<Posting>(12);
            List<WordKey> cipheredWords = new LinkedList<WordKey>();
            // Tokenize and stem the query with the Lucene analyzer, encrypting each term.
            TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
            try {
                ts.reset();
                while (ts.incrementToken()) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    if (word.length() > 0)
                        cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                }
                ts.end();
            } finally {
                ts.close();
            }
            // Fetch the encrypted posting lists for the query terms from the cloud index.
            List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
            for (CipheredPostingList cipherPostings : cipheredPostings) {
                PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());
                PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
                for (TermFreq tf : tfs.getPostings())
                    postings.add(new Posting(tf.getDocId(), tf.getFreq()));
                    // postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
                    //         docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));
                // Accumulate per-document scores across all query terms.
                Posting posting;
                while ((posting = postings.poll()) != null) {
                    // if (!removedDocs.containsKey(posting.getDocId())) {
                    int j = finalScores.indexOf(posting);
                    if (j == -1)
                        finalScores.add(posting);
                    else
                        finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
                }
            }
            // Rank the documents and return at most the top 12 results.
            Collections.sort(finalScores);
            if (finalScores.size() > 12)
                return finalScores.subList(0, 12);
            else
                return finalScores;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    @Override
    public void addFirstDocuments(String xmlFile) {
        WikiXMLParser wxsp = WikiXMLParserFactory.getSAXParser(xmlFile);
        try {
            wxsp.setPageCallback(new PageCallbackHandler() {
                public void process(WikiPage page) {
                    // Skip pages without indexable content.
                    if (page.isDisambiguationPage() || page.isRedirect() || page.isSpecialPage())
                        return;
                    List<WordKey> cipheredWords = new ArrayList<WordKey>();
                    try {
                        // Tokenize the article body and encrypt every term.
                        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(page.getText())));
                        try {
                            ts.reset();
                            while (ts.incrementToken()) {
                                String word = ts.getAttribute(CharTermAttribute.class).toString();
                                if (word.length() > 0)
                                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                            }
                            ts.end();
                        } finally {
                            ts.close();
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                    search.addFirstDocuments(new CDocument(new WordKey(crypto.digest(page.getTitle().getBytes())),
                            cipheredWords.toArray(new WordKey[cipheredWords.size()])));
                    // store doc in the cloud
                    // cloud.putDoc(""+i, crypto.encryptDocAES(documents[i]));
                }
            });
            wxsp.parse();
            search.buildIndex();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public PDocument getDocument(String title) {
        return getDocumentById(search.getDocumentId(title));
    }

    @Override
    public PDocument getDocumentById(int id) {
        try {
            return crypto.decryptDocAES(cloud.getDoc("" + id));
        } catch (RemoteException e) {
            e.printStackTrace();
            return null;
        }
    }

    @Override
    public boolean removeDocument(String title) {
        // return search.removeDocument(crypto.digest(title.getBytes()));
        return false;
    }

    @Override
    public boolean removeDocumentById(int id) {
        return search.removeDocumentById(id);
    }

    @Override
    public void addDocument(PDocument document) {
        // ClassicAnalyzer analyzer = new ClassicAnalyzer(Version.LUCENE_35, new StopWords().getM_Words());
        // TokenStream tokenizer = analyzer.tokenStream(null, new BufferedReader(new StringReader(document.getContent())));
        // List<WordKey> cipheredWords = new LinkedList<WordKey>();
        // try {
        //     while (tokenizer.incrementToken()) {
        //         String word = tokenizer.getAttribute(CharTermAttribute.class).toString();
        //         if (word.length() > 0) {
        //             Stemmer s = new Stemmer();
        //             s.add(word.toCharArray(), word.length());
        //             s.stem();
        //             cipheredWords.add(new WordKey(crypto.encryptSearch(s.toString())));
        //         }
        //     }
        // } catch (IOException e) {
        //     e.printStackTrace();
        // }
        // CDocument cDoc = new CDocument(new WordKey(crypto.digest(document.getTitle().getBytes())),
        //         cipheredWords.toArray(new WordKey[cipheredWords.size()]));
        //
        // Integer i = search.addDocumentToIndex(cDoc);
        // if (i != null)
        //     cloud.putDoc(""+i, crypto.encryptDocAES(document));
    }

    @Override
    public void rebuildIndex() {
        search.buildIndex();
    }

    public static void main(String[] args) {
        log.addAppender(new ConsoleAppender(new PatternLayout("%d{ABSOLUTE} %-5p [%c{1}] %m%n")));
        log.setLevel(Level.INFO);

        ClientConnectorLocal client = new ClientConnectorBeanWIKI();
        log.info("Setup done. Building index...");
        // client.addFirstDocuments("/home/bernardo/Desktop/enwiki-latest-pages-articles.xml");
        // client.addFirstDocuments("/home/bernardo/Desktop/ptwiki-20121027-pages-articles.xml");
        // client.addFirstDocuments("/home/bernardo/Desktop/eswiki-20121018-pages-articles.xml");
        // log.info("Index Finished!");
        client.rebuildIndex();
        log.info("Starting query...");
        query(client, "IETF payload optimize retransmit threshold RFC protocol trusted network address");
        // query(client, "IETF payload optimize retransmit threshold");
        // query(client, "IETF payload");
        log.info("Query finished!");
    }

    private static void query(ClientConnectorLocal client, String q) {
        List<Posting> postings = client.query(q);
        if (postings == null || postings.size() == 0)
            log.info("No match found for the query!");
        else
            for (int i = 0; i < postings.size(); i++)
                log.info(postings.get(i).getDocId() + " " + postings.get(i).getScore());
    }
}