nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java Source code

Java tutorial

Introduction

Here is the source code for nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java

Source

package nicta.com.au.patent.pac.analysis;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
import nicta.com.au.patent.document.Claim;
import nicta.com.au.patent.document.Claims;
import nicta.com.au.patent.document.ClassificationIpcr;
import nicta.com.au.patent.document.InventionTitle;
import nicta.com.au.patent.document.P;
import nicta.com.au.patent.document.PatentDocument;
import nicta.com.au.patent.document.PatentsStopWords;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
/**
 *
 * @author rbouadjenek
 */
public class FieldsCosineSimilarities {

    private final Map<String, Double>[] pt1FieldsVectors;
    private final Map<String, Double>[] pt2FieldsVectors;
    private final boolean specificStopWords;
    final private IndexReader ir;
    private final double[] fieldsCosineSimilarity = new double[5];

    public FieldsCosineSimilarities(String PatentApplication, String GrantedPatend, String indexDir,
            boolean specificStopWords) throws IOException, Exception {
        this.specificStopWords = specificStopWords;
        Directory dir = FSDirectory.open(new File(indexDir));
        ir = DirectoryReader.open(dir);
        PatentDocument pt1 = new PatentDocument(PatentApplication);
        PatentDocument pt2 = new PatentDocument(GrantedPatend);
        pt1FieldsVectors = this.parse(pt1);
        pt2FieldsVectors = this.parse(pt2);

    }

    public FieldsCosineSimilarities(PatentDocument PatentApplication, String GrantedPatend, IndexReader ir,
            boolean specificStopWords) throws IOException, Exception {
        this.specificStopWords = specificStopWords;
        this.ir = ir;
        PatentDocument pt1 = PatentApplication;
        PatentDocument pt2 = new PatentDocument(GrantedPatend);
        pt1FieldsVectors = this.parse(pt1);
        pt2FieldsVectors = this.parse(pt2);

    }

    public final Map<String, Double>[] parse(PatentDocument pt) throws IOException, Exception {
        Map<String, Double>[] out = new Map[5];
        String[] ptFields = new String[5];
        String title = "";
        String ipc = "";
        String abstrac = "";
        String description = "";
        String claims = "";
        for (InventionTitle inventionTitle : pt.getTechnicalData().getInventionTitle()) {
            if (inventionTitle.getLang().toLowerCase().equals("en")) {
                title = inventionTitle.getContent();
            }
        }
        Map<String, Double> m1 = new HashMap<>();
        for (ClassificationIpcr ipcCode : pt.getTechnicalData().getClassificationIpcr()) {
            StringTokenizer st = new StringTokenizer(ipcCode.getContent());
            m1.put(st.nextToken(), 1.0);
        }

        if (pt.getAbstrac().getLang() != null && pt.getAbstrac().getLang().toLowerCase().equals("en")) {
            abstrac = pt.getAbstrac().getContent();
        }
        if (pt.getDescription() != null && pt.getDescription().getLang().toLowerCase().equals("en")) {
            for (P p : pt.getDescription().getP()) {
                description += p.getContent() + " ";
            }
        }
        for (Claims cs : pt.getClaims()) {
            if (cs.getLang().toLowerCase().equals("en")) {
                for (Claim claim : cs.getClaim()) {
                    claims += claim.getClaimText() + " ";
                }
            }
        }
        ptFields[0] = title;
        ptFields[1] = ipc;
        ptFields[2] = abstrac;
        ptFields[3] = description;
        ptFields[4] = claims;
        Map<String, Analyzer> analyzerPerField = new HashMap<>();

        if (specificStopWords == true) {
            analyzerPerField.put(PatentDocument.Title,
                    new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET));
            analyzerPerField.put(PatentDocument.Abstract,
                    new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET));
            analyzerPerField.put(PatentDocument.Description,
                    new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET));
            analyzerPerField.put(PatentDocument.Claims,
                    new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET));
        } else {
            analyzerPerField.put(PatentDocument.Title,
                    new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
            analyzerPerField.put(PatentDocument.Abstract,
                    new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
            analyzerPerField.put(PatentDocument.Description,
                    new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
            analyzerPerField.put(PatentDocument.Claims,
                    new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET));

        }

        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_44),
                analyzerPerField);
        Map<String, Double> m0 = getVector(analyzer.tokenStream(PatentDocument.Title, ptFields[0]),
                PatentDocument.Title);
        Map<String, Double> m2 = getVector(analyzer.tokenStream(PatentDocument.Abstract, ptFields[2]),
                PatentDocument.Abstract);
        Map<String, Double> m3 = getVector(analyzer.tokenStream(PatentDocument.Description, ptFields[3]),
                PatentDocument.Description);
        Map<String, Double> m4 = getVector(analyzer.tokenStream(PatentDocument.Claims, ptFields[4]),
                PatentDocument.Claims);
        out[0] = m0;
        out[1] = m1;
        out[2] = m2;
        out[3] = m3;
        out[4] = m4;
        return out;
    }

    private Map<String, Double> getVector(TokenStream ts, String field) throws IOException, Exception {
        Map<String, Double> m = new HashMap<>();
        CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int i = 0;
        while (ts.incrementToken()) {
            i++;
            String term = charTermAttribute.toString();
            if (m.containsKey(term)) {
                m.put(term, m.get(term) + 1);
            } else {
                m.put(term, 1.0);
            }
        }
        for (String key : m.keySet()) {
            Term t = new Term(field, key);
            int totalTF = ir.docFreq(t);
            int docs = ir.getDocCount("claims");
            double idf = Math.log10((double) docs / (totalTF + 1));
            m.put(key, (m.get(key) / i) * idf);
        }

        return m;
    }

    public double getIdf(String term) {

        return 0;
    }

    public void computeCosineSimilarities() {
        fieldsCosineSimilarity[0] = this.cosine(pt1FieldsVectors[0], pt2FieldsVectors[0]);
        fieldsCosineSimilarity[1] = this.cosine(pt1FieldsVectors[1], pt2FieldsVectors[1]);
        fieldsCosineSimilarity[2] = this.cosine(pt1FieldsVectors[2], pt2FieldsVectors[2]);
        fieldsCosineSimilarity[3] = this.cosine(pt1FieldsVectors[3], pt2FieldsVectors[3]);
        fieldsCosineSimilarity[4] = this.cosine(pt1FieldsVectors[4], pt2FieldsVectors[4]);
    }

    public double cosine(Map<String, Double> v1, Map<String, Double> v2) {
        if (v1.isEmpty() || v2.isEmpty()) {
            return 0;
        }
        double sim = 0;
        for (String term : v1.keySet()) {
            if (v2.containsKey(term)) {
                sim = sim + v2.get(term) * v1.get(term);
            }
        }
        double r = 0;
        for (Double d : v2.values()) {
            r = r + Math.pow(d, 2);
        }
        r = Math.sqrt(r);
        double q = 0;
        for (Double d : v1.values()) {
            q = q + Math.pow(d, 2);
        }
        q = Math.sqrt(q);
        if (q == 0 || r == 0) {
            return 0;
        }
        sim = sim / (q * r);
        return sim;
    }

    public double[] getFieldsCosineSimilarity() {
        return fieldsCosineSimilarity;
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        // TODO code application logic here
        String q;
        String doc;
        String indexDir;
        if (args.length == 0) {
            q = "/Volumes/Macintosh HD/Users/rbouadjenek/Documents/Patent-Project/Dev/query/PAC-132_EP-1550834-A1.xml";
            doc = "/Volumes/Macintosh HD/Users/rbouadjenek/Documents/Patent-Project/Dev/docs/UN-EP-1070700.xml";
            indexDir = "/Volumes/Macintosh HD/Users/rbouadjenek/Documents/Patent-Project/Dev/index/";
        } else {
            q = args[0];
            doc = args[1];
            indexDir = args[2];
        }
        try {
            FieldsCosineSimilarities sim = new FieldsCosineSimilarities(q, doc, indexDir, true);
            sim.computeCosineSimilarities();
            System.out.println("Title: " + sim.getFieldsCosineSimilarity()[0]);
            System.out.println("Classification: " + sim.getFieldsCosineSimilarity()[1]);
            System.out.println("Abstract: " + sim.getFieldsCosineSimilarity()[2]);
            System.out.println("Description: " + sim.getFieldsCosineSimilarity()[3]);
            System.out.println("Claims: " + sim.getFieldsCosineSimilarity()[4]);

        } catch (IOException ex) {
            ex.printStackTrace();
        } catch (Exception ex) {
            Logger.getLogger(FieldsCosineSimilarities.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

}