nmsu.cs.TFIDFVector.java Source code


Introduction

Here is the source code for nmsu.cs.TFIDFVector.java. The listing contains two classes: TFIDFVector, a TF-IDF vector used as the baseline method, and BaseLineMethod, a driver that builds a Lucene index over the documents and computes the corpus log likelihood.

Source

/*
Copyright (c) 2014 cs.nmsu.edu
    
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
    
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
    
The Software shall be used for Good, not Evil.
    
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
package nmsu.cs;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;

/**
 * TF-IDF vector class, used by the baseline method.
 * @author Chuan Hu chu@cs.nmsu.edu
 */
class TFIDFVector {
    /**
     * lucene default similarity class.  Used to compute TF/IDF
     */
    static DefaultSimilarity sim = new DefaultSimilarity();

    public static List<String> vocabulary = null;

    /**
     * TF/IDF vector
     */
    private Map<String, Float> tfidfMap = null;

    private String[] terms;
    private int[] tfArr;

    private int[] dfArr;

    /**
     * Constructor.
     * @param terms   array of terms
     * @param tfArr   array of term frequencies
     * @param dfArr   array of document frequencies
     * @param numDocs total number of documents in the index
     */
    TFIDFVector(String[] terms, int[] tfArr, int[] dfArr, int numDocs) {
        tfidfMap = new HashMap<String, Float>();
        this.terms = terms;
        this.tfArr = tfArr;
        this.dfArr = dfArr;

        for (int i = 0; i < terms.length; i++) {
            //weight = tf(term frequency) * idf(document frequency, numDocs)
            tfidfMap.put(terms[i], sim.tf(tfArr[i]) * sim.idf(dfArr[i], numDocs));
        }
    }

    /**
     * Normalize every term's TF/IDF value by sum() so the values can be used as term probabilities.
     */
    public void normalize() {
        //compute the (term-frequency-weighted) sum of TF/IDF values
        float sum = this.sum();
        //normalize every term's value
        for (Map.Entry<String, Float> entry : tfidfMap.entrySet())
            entry.setValue(entry.getValue() / sum);
    }

    /**
     * @return the sum of TF/IDF values over all term occurrences
     *         (each term's value is weighted by its term frequency)
     */
    public float sum() {
        float sum = 0;
        //accumulate TF/IDF values, weighting each term by its frequency

        for (int i = 0; i < tfArr.length; i++) {
            sum += tfArr[i] * tfidfMap.get(terms[i]);
        }
        //      for(Map.Entry<String, Float> entry : tfidfMap.entrySet())
        //         sum+=entry.getValue();
        return sum;
    }

    /**
     * Divide every term's TF/IDF value by the given factor.
     * @param factor normalization factor (e.g. the citing document's full-text sum())
     */
    public void normalizedBy(float factor) {
        //normalize every term's value
        for (Map.Entry<String, Float> entry : tfidfMap.entrySet())
            entry.setValue(entry.getValue() / factor);
    }

    /**
     * Get one term's probability (normalized TF/IDF value) in this vector.
     * @param term the term to look up
     * @return the term's value, or 0 if the term does not occur in this vector
     */
    public double getTermPro(String term) {
        Float pro = tfidfMap.get(term);
        return pro == null ? 0 : pro;
    }

    /**
     * Compute and return the prior log likelihood of this doc,
     * i.e. the sum over terms of count * log(term probability).
     * @return the prior log likelihood
     */
    public double priorLLH() {
        double llh = 0;

        for (int i = 0; i < tfArr.length; i++) {
            int count = tfArr[i];
            llh += (count * Math.log(tfidfMap.get(terms[i])));
        }

        return llh;
    }

    /**
     * Compute the posterior log likelihood of this doc given its cited docs:
     * each term contributes count * log(lambda * p_citing + (1 - lambda) * sum_j sim_j * p_cited_j).
     * @param opVector TF/IDF vectors of the cited documents
     * @param opSimArr aspect cosine similarity of each cited document
     * @param lambda   mixture weight of the citing document's own term distribution
     * @return the posterior log likelihood
     */
    public double posteriorLLH(TFIDFVector[] opVector, double[] opSimArr, double lambda) {
        double llh = 0;
        for (int i = 0; i < this.tfArr.length; i++) {
            int count = tfArr[i];
            String term = terms[i];

            double citingPro = this.getTermPro(term);
            double citedPro = 0;
            for (int j = 0; j < opVector.length; j++) {
                citedPro += (opVector[j].getTermPro(term) * opSimArr[j]);
                //            if(citedPro!=0)
                //               System.out.println(term+" "+opVector.length+" "+j+" "+opSimArr[j]+" "+citingPro+" "+citedPro);
            }

            llh += count * Math.log(lambda * citingPro + (1 - lambda) * citedPro);
        }
        return llh;
    }

    /**
     * Compute the cosine similarity of two TF/IDF vectors over the shared vocabulary.
     * @param v1 first vector
     * @param v2 second vector
     * @return the cosine similarity, or 0 if either vector is empty
     */
    public static double computeCosineSim(TFIDFVector v1, TFIDFVector v2) {
        double sim = 0, module1 = 0, module2 = 0, product = 0;

        for (String term : TFIDFVector.vocabulary) {
            float w1 = v1.tfidfMap.containsKey(term) ? v1.tfidfMap.get(term) : 0;
            float w2 = v2.tfidfMap.containsKey(term) ? v2.tfidfMap.get(term) : 0;
            product += (w1 * w2);
            module1 += Math.pow(w1, 2);
            module2 += Math.pow(w2, 2);
        }
        sim = product / (Math.sqrt(module1) * Math.sqrt(module2));
        return Double.isNaN(sim) ? 0 : sim;
    }

}
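
/*
 * A minimal usage sketch of the class above. The class name TFIDFVectorExample and the
 * toy terms/frequencies are hypothetical, not part of the original project. It assumes
 * the stock Lucene 3.x DefaultSimilarity used in the constructor, i.e. tf(freq) = sqrt(freq)
 * and idf(df, numDocs) = 1 + ln(numDocs / (df + 1)).
 */
class TFIDFVectorExample {
    public static void main(String[] args) {
        //shared vocabulary used by computeCosineSim
        TFIDFVector.vocabulary = java.util.Arrays.asList("graph", "model");

        //two toy documents: terms, term frequencies, document frequencies, total number of docs
        TFIDFVector v1 = new TFIDFVector(new String[] { "graph", "model" },
                new int[] { 3, 1 }, new int[] { 10, 2 }, 100);
        TFIDFVector v2 = new TFIDFVector(new String[] { "graph" },
                new int[] { 2 }, new int[] { 10 }, 100);

        //turn raw TF-IDF weights into term probabilities
        v1.normalize();
        v2.normalize();

        //cosine similarity over the shared vocabulary; only "graph" overlaps here
        System.out.println(TFIDFVector.computeCosineSim(v1, v2));
    }
}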

/**
 * Baseline method class: builds a Lucene index over the documents and computes the corpus log likelihood.
 */
public class BaseLineMethod {
    /**
     * command options (with all the parameters)
     */
    CmdOption cmdOption;

    /**
     * raw data (documents, citation graph, and aspects) shared by the static helper methods
     */
    static DataRaw rawdata = new DataRaw();

    public BaseLineMethod(CmdOption _cmdOption) {
        cmdOption = _cmdOption;
        DataParsed parsedData = new DataParsed();
        rawdata = parsedData.initBaseLine(cmdOption.paperfolder, cmdOption.graphfile, cmdOption.aspectfile);
    }

    /**
     * build Lucene index for the text
     * @param indexDir directory where the Lucene index is written
     */
    public void buildIndex(String indexDir) {

        //      System.out.println(Debugger.getCallerPosition()+" is RawData null "+rawdata==null);

        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, new PorterAnalyzer());
        IndexWriter w;
        try {
            w = new IndexWriter(FSDirectory.open(new File(indexDir)), conf);
            w.deleteAll();
            w.commit();

            System.out.println(rawdata.id2Docs.size());

            for (Map.Entry<Integer, Doc> entry : rawdata.id2Docs.entrySet()) {
                Document document = BaseLineMethod.convertDoc2Document(entry.getValue());
                w.addDocument(document);
            }

            w.commit();
            w.close();

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * calculate the corpus log likelihood from the index
     * @param indexDir directory of the Lucene index
     * @param lambda   mixture weight between the citing and cited term distributions
     */
    public void calLikelihoodFromIndex(String indexDir, double lambda) {
        try {
            IndexReader ir = IndexReader.open(FSDirectory.open(new File(indexDir)));
            IndexSearcher is = new IndexSearcher(ir);
            int numDocs = ir.maxDoc();

            double LLH = 0;

            //vocabulary list
            List<String> vocab = new ArrayList<String>();

            TermEnum te = ir.terms();
            //create vocabulary
            while (te.next()) {
                String term = te.term().text();
                //            System.out.println(term);
                vocab.add(term);
            }
            TFIDFVector.vocabulary = vocab;

            //dataset id to index id
            Map<Integer, Integer> idMap = new HashMap<Integer, Integer>();

            for (int i = 0; i < numDocs; i++) {
                Document doc = ir.document(i);
                idMap.put(Integer.parseInt(doc.get("docid")), i);
            }

            //o -> a -> o'
            Map<Integer, Map<Integer, Map<Integer, Double>>> cosineSimMap = new HashMap<Integer, Map<Integer, Map<Integer, Double>>>();
            // (o | o') dataset id -> tfidf vector
            Map<Integer, TFIDFVector> docVectorMap = new HashMap<Integer, TFIDFVector>();
            // o -> a -> vector
            Map<Integer, Map<Integer, TFIDFVector>> docAspectVectorMap = new HashMap<Integer, Map<Integer, TFIDFVector>>();

            Set<Integer> citedSet = new HashSet<Integer>();
            //for all citing document
            for (Map.Entry<Integer, List<Integer>> entry : rawdata.pubId2CiteIds.entrySet()) {//llh for citing documents
                int citingDatasetID = entry.getKey();
                int citingIndexID = idMap.get(citingDatasetID);

                //set up citing document vector
                TFIDFVector citingVector = BaseLineMethod.getFullTextTFIDFVector(docVectorMap, ir, citingDatasetID,
                        citingIndexID, numDocs);
                float sum = citingVector.sum();

                //            System.out.println(Debugger.getCallerPosition()+" "+citingDatasetID);

                List<Integer> refList = entry.getValue();
                //for all aspects
                for (Integer aspectID : rawdata.id2Aspect.keySet()) {
                    String aspect = rawdata.id2Aspect.get(aspectID);
                    //set up citing document aspect vector
                    double aspectSim = 0;
                    if (rawdata.id2Docs.get(citingDatasetID).getText().get(aspectID).length() != 0) {
                        TFIDFVector citingAspectVector = BaseLineMethod.getAspectTFIDFVector(docAspectVectorMap, ir,
                                citingDatasetID, citingIndexID, aspectID, numDocs);
                        citingAspectVector.normalizedBy(sum);

                        int refSize = refList.size();
                        TFIDFVector[] citedVectors = new TFIDFVector[refSize];
                        double[] cosineSims = new double[refSize];
                        int count = 0;

                        //for all cited documents of this citing document
                        for (Integer citedDatasetID : refList) {
                            citedSet.add(citedDatasetID);
                            //set up cited document vector
                            int citedIndexID = idMap.get(citedDatasetID);
                            TFIDFVector citedVector = BaseLineMethod.getFullTextTFIDFVector(docVectorMap, ir,
                                    citedDatasetID, citedIndexID, numDocs);
                            citedVector.normalize();

                            aspectSim = TFIDFVector.computeCosineSim(citedVector, citingAspectVector);
                            //                     System.out.println(Debugger.getCallerPosition()+"\t\t"+aspectSim);
                            System.out.println(
                                    citingDatasetID + "\t" + aspectID + "\t" + citedDatasetID + "\t" + aspectSim);
                            citedVectors[count] = citedVector;
                            cosineSims[count] = aspectSim;
                            count++;
                        }
                        double aspectLLH = citingAspectVector.posteriorLLH(citedVectors, cosineSims, lambda);
                        LLH += aspectLLH;
                    }
                    //                  Util.update3Map(cosineSimMap, citingDatasetID, aspectID, citedDatasetID, aspectSim);
                }
            }

            for (Integer citedDatasetID : citedSet) {
                int citedIndexID = idMap.get(citedDatasetID);
                TFIDFVector citedVector = BaseLineMethod.getFullTextTFIDFVector(docVectorMap, ir, citedDatasetID,
                        citedIndexID, numDocs);
                citedVector.normalize();
                LLH += citedVector.priorLLH();
            }

            System.out.println(LLH);
            is.close();
            ir.close();

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
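
    /*
     * Summary of the likelihood computed above (restating the loops, not a new formulation):
     * for every citing document o and every aspect a with non-empty text, the method adds the
     * posterior log likelihood of o's aspect vector given the full-text vectors of its cited
     * documents o', each weighted by the cosine similarity between the cited document's
     * normalized vector and the citing aspect vector and mixed with weight lambda; it then
     * adds the prior log likelihood of every cited document's normalized full-text vector
     * and prints the total LLH.
     */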

    /**
     * Build (or fetch from docVectorMap, if cached) the full-text TF/IDF vector of a document.
     * @param docVectorMap cache from dataset id to full-text TF/IDF vector
     * @param ir        index reader
     * @param datasetID dataset id of the document
     * @param indexID   Lucene index id of the document
     * @param numDocs   total number of documents in the index
     * @return the document's full-text TF/IDF vector
     */
    public static TFIDFVector getFullTextTFIDFVector(Map<Integer, TFIDFVector> docVectorMap, IndexReader ir,
            int datasetID, int indexID, int numDocs) {
        TFIDFVector vector = null;
        try {
            if ((vector = docVectorMap.get(datasetID)) == null) {
                TermFreqVector termFreqVector = ir.getTermFreqVector(indexID, "fulltext");
                int[] tf = termFreqVector == null ? new int[0] : termFreqVector.getTermFrequencies();
                String[] terms = termFreqVector == null ? new String[0] : termFreqVector.getTerms();

                int[] df = new int[tf.length];
                for (int j = 0; j < terms.length; j++)
                    //document frequency of the term in the "fulltext" field
                    df[j] = ir.docFreq(new Term("fulltext", terms[j]));

                vector = new TFIDFVector(terms, tf, df, numDocs);
                //            docVectorMap.put(datasetID, vector);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        return vector;
    }

    /**
     * Build (or fetch from docAspectVectorMap, if cached) the aspect TF/IDF vector of a document.
     * @param docAspectVectorMap cache from dataset id to aspect id to TF/IDF vector
     * @param ir        index reader
     * @param datasetID dataset id of the document
     * @param indexID   Lucene index id of the document
     * @param aspectID  aspect id
     * @param numDocs   total number of documents in the index
     * @return the document's TF/IDF vector for the given aspect
     */
    public static TFIDFVector getAspectTFIDFVector(Map<Integer, Map<Integer, TFIDFVector>> docAspectVectorMap,
            IndexReader ir, int datasetID, int indexID, int aspectID, int numDocs) {
        TFIDFVector vector = null;
        try {
            Map<Integer, TFIDFVector> aspectVectorMap = docAspectVectorMap.get(datasetID);
            if (aspectVectorMap == null) {
                aspectVectorMap = new HashMap<Integer, TFIDFVector>();
                docAspectVectorMap.put(datasetID, aspectVectorMap);
            }

            if ((vector = aspectVectorMap.get(aspectID)) == null) {
                TermFreqVector termFreqVector = ir.getTermFreqVector(indexID, rawdata.id2Aspect.get(aspectID));
                int[] tf = termFreqVector == null ? new int[0] : termFreqVector.getTermFrequencies();
                String[] terms = termFreqVector == null ? new String[0] : termFreqVector.getTerms();

                int[] df = new int[tf.length];
                for (int j = 0; j < terms.length; j++)
                    //document frequency of the term in the aspect field
                    df[j] = ir.docFreq(new Term(rawdata.id2Aspect.get(aspectID), terms[j]));

                vector = new TFIDFVector(terms, tf, df, numDocs);
                //            aspectVectorMap.put(aspectID, vector);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        return vector;
    }

    /**
     * Convert a Doc object to a Lucene Document object with one field per aspect plus a "fulltext" field.
     * @param doc the Doc to convert
     * @return the corresponding Lucene Document
     */
    public static Document convertDoc2Document(Doc doc) {
        Document document = new Document();

        document.add(new Field("docid", String.valueOf(doc.getId()), Field.Store.YES, Field.Index.ANALYZED,
                Field.TermVector.YES));

        for (int i = 0; i < rawdata.id2Aspect.size(); i++) {
            String aspect = rawdata.id2Aspect.get(i);
            String text = doc.getText().get(i);

            //         System.out.println(aspect+"\t"+text);

            //remove url
            String t1 = text.replaceAll("(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]",
                    "");
            //remove non-ascii
            String t2 = t1.replaceAll("[^\\x00-\\x7F]", "");

            Field aspectText = new Field(aspect, t2, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
            document.add(aspectText);
        }

        Field fullText = new Field("fulltext",
                doc.getFullText()
                        .replaceAll("(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", "")
                        .replaceAll("[^\\x00-\\x7F]", ""),
                Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
        document.add(fullText);

        //      System.out.println(doc.getId()+" "+doc.getFullText());

        return document;
    }

    /**
     * Analyzer that lower-cases tokens, applies Porter stemming, and removes English stop words.
     * @author chu
     */
    public static class PorterAnalyzer extends Analyzer implements Serializable {
        @Override
        public final TokenStream tokenStream(String fieldName, Reader reader) {
            //lower-case, Porter-stem, then drop English stop words
            TokenStream ts = new StopFilter(Version.LUCENE_36,
                    new PorterStemFilter(new LowerCaseTokenizer(Version.LUCENE_36, reader)),
                    StopAnalyzer.ENGLISH_STOP_WORDS_SET);

            return ts;
        }
    }

    public static void main(String[] args) {
        CmdOption option = new CmdOption();
        CmdLineParser parser = new CmdLineParser(option);

        try {
            parser.parseArgument(args);
        } catch (CmdLineException e) {
            System.out.println(Debugger.getCallerPosition() + "Command line error: " + e.getMessage());
            e.printStackTrace();
            return;
        }

        BaseLineMethod blm = new BaseLineMethod(option);
        //      blm.buildIndex("./data/index/"+option.SAMPLER_ID);
        blm.calLikelihoodFromIndex("./data/index/" + option.SAMPLER_ID, option.lambda);
    }
}
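
Usage

The main method parses the command-line arguments into a CmdOption object; the actual option names come from args4j annotations in CmdOption, which is not shown in this listing. Note that the buildIndex call is commented out in main, so the Lucene index under ./data/index/<SAMPLER_ID> must already exist before calLikelihoodFromIndex runs. A minimal sketch of running both steps programmatically, assuming CmdOption's fields (paperfolder, graphfile, aspectfile, SAMPLER_ID, lambda) have been populated:

CmdOption option = new CmdOption();
// ...populate option.paperfolder, option.graphfile, option.aspectfile,
// option.SAMPLER_ID and option.lambda as appropriate...

BaseLineMethod blm = new BaseLineMethod(option);      //parses the raw data and citation graph
blm.buildIndex("./data/index/" + option.SAMPLER_ID);  //build the Lucene index once
blm.calLikelihoodFromIndex("./data/index/" + option.SAMPLER_ID, option.lambda);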