nicta.com.au.patent.pac.analysis.FieldsJaccardSimilarities.java Source code

Java tutorial

Introduction

Here is the source code for nicta.com.au.patent.pac.analysis.FieldsJaccardSimilarities.java

Source

package nicta.com.au.patent.pac.analysis;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import nicta.com.au.patent.document.Claim;
import nicta.com.au.patent.document.Claims;
import nicta.com.au.patent.document.ClassificationIpcr;
import nicta.com.au.patent.document.InventionTitle;
import nicta.com.au.patent.document.P;
import nicta.com.au.patent.document.PatentDocument;
import nicta.com.au.patent.document.PatentsStopWords;
import nicta.com.au.patent.pac.index.TermFreqVector;
import nicta.com.au.patent.pac.search.PatentQuery;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
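/**
 * Computes per-field similarity scores between a patent application and a
 * granted patent whose term vectors are read from a Lucene index.
 *
 * @author rbouadjenek
 */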
public final class FieldsJaccardSimilarities {

    private final Map<String, Integer>[] ptAppicationFieldsVectors;
    private final Map<String, Integer> pt2FieldsVectors = new HashMap<>();
    private final boolean specificStopWords;
    private final IndexSearcher is;
    private final double[] fieldsJaccardSimilarity = new double[5];

    /**
     * Opens the Lucene index at {@code indexDir}, loads the patent application
     * and collects the term frequencies of the granted patent.
     *
     * @param indexDir directory of the Lucene index to open
     * @param PatentApplication argument handed to the {@code PatentDocument}
     *        constructor ({@code main} passes the path of an XML file)
     * @param GrantedPatent value searched in the index's file-name field
     * @param specificStopWords {@code true} to analyse every text field with
     *        its own stop-word set, {@code false} to use the shared set
     * @throws IOException if the index cannot be opened or read
     * @throws ParseException declared for callers of the original signature
     */
    public FieldsJaccardSimilarities(String indexDir, String PatentApplication, String GrantedPatent,
            boolean specificStopWords) throws IOException, ParseException {
        // parse() reads this flag, so it has to be assigned before parse() runs.
        this.specificStopWords = specificStopWords;
        this.is = new IndexSearcher(DirectoryReader.open(FSDirectory.open(new File(indexDir))));
        this.ptAppicationFieldsVectors = parse(new PatentDocument(PatentApplication));
        initialize(GrantedPatent);
    }

    /**
     * Variant that reuses an already opened searcher and an already loaded
     * patent application.
     *
     * @param is searcher used to look up the granted patent
     * @param PatentApplication patent application whose fields are parsed
     * @param GrantedPatent value searched in the index's file-name field
     * @param specificStopWords {@code true} to analyse every text field with
     *        its own stop-word set, {@code false} to use the shared set
     * @throws IOException if reading the index or analysing the text fails
     */
    public FieldsJaccardSimilarities(IndexSearcher is, PatentDocument PatentApplication, String GrantedPatent,
            boolean specificStopWords) throws IOException {
        // parse() reads this flag, so it has to be assigned before parse() runs.
        this.specificStopWords = specificStopWords;
        this.is = is;
        this.ptAppicationFieldsVectors = parse(PatentApplication);
        initialize(GrantedPatent);
    }

    /**
     * Looks up the granted patent by file name and adds the term frequencies
     * of four of its indexed fields to {@code pt2FieldsVectors}.
     *
     * <p>Only the top hit of the lookup is requested. The map is not cleared
     * first, so frequencies accumulate on top of whatever it already holds.
     *
     * @param GrantedPatent value matched against {@code PatentDocument.FileName}
     * @throws IOException if the index cannot be read
     */
    public void initialize(String GrantedPatent) throws IOException {
        // Positions inside PatentQuery.getFields() that are read, in this order
        // (the original code labelled them title, abstract, description, claims).
        final int[] fieldPositions = {1, 2, 3, 5};

        TopDocs hits = is.search(new TermQuery(new Term(PatentDocument.FileName, GrantedPatent)), 1);
        Set<TermFreqVector> collected = new HashSet<>();
        for (ScoreDoc hit : hits.scoreDocs) {
            for (int position : fieldPositions) {
                // NOTE(review): Lucene documents getTermVector as returning null when no
                // term vector was stored - confirm TermFreqVector copes with that.
                Terms vector = is.getIndexReader().getTermVector(hit.doc, PatentQuery.getFields()[position]);
                collected.add(new TermFreqVector(vector));
            }
        }

        // Fold every collected vector into the single granted-patent frequency map.
        for (TermFreqVector vector : collected) {
            for (String term : vector.getTerms()) {
                if (pt2FieldsVectors.containsKey(term)) {
                    int soFar = pt2FieldsVectors.get(term);
                    pt2FieldsVectors.put(term, vector.getFreq(term) + soFar);
                } else {
                    pt2FieldsVectors.put(term, vector.getFreq(term));
                }
            }
        }
    }

    /**
     * Extracts the English text of a patent document and turns each field into
     * a term-to-frequency map.
     *
     * <p>The returned array always has five entries:
     * <ol start="0">
     * <li>title terms (when several English titles exist, the last one is used)</li>
     * <li>IPC codes: the first whitespace-delimited token of every
     *     classification entry, each mapped to 1</li>
     * <li>abstract terms</li>
     * <li>description terms (all paragraphs joined with spaces)</li>
     * <li>claims terms (all claim texts joined with spaces)</li>
     * </ol>
     * Fields that are missing or not marked as English yield an empty map.
     *
     * @param pt the patent document to analyse
     * @return the five per-field frequency maps in the order listed above
     * @throws IOException if consuming one of the token streams fails
     */
    public final Map<String, Integer>[] parse(PatentDocument pt) throws IOException {
        String title = "";
        for (InventionTitle inventionTitle : pt.getTechnicalData().getInventionTitle()) {
            if (isEnglish(inventionTitle.getLang())) {
                title = inventionTitle.getContent();
            }
        }

        Map<String, Integer> ipcCodes = new HashMap<>();
        for (ClassificationIpcr ipcCode : pt.getTechnicalData().getClassificationIpcr()) {
            String content = ipcCode.getContent();
            if (content == null) {
                continue;
            }
            StringTokenizer st = new StringTokenizer(content);
            // A blank entry has no token; nextToken() would throw NoSuchElementException.
            if (st.hasMoreTokens()) {
                ipcCodes.put(st.nextToken(), 1);
            }
        }

        String abstrac = "";
        if (pt.getAbstrac() != null && isEnglish(pt.getAbstrac().getLang())) {
            abstrac = pt.getAbstrac().getContent();
        }

        // StringBuilder avoids re-copying the whole text for every paragraph/claim.
        StringBuilder description = new StringBuilder();
        if (pt.getDescription() != null && isEnglish(pt.getDescription().getLang())) {
            for (P p : pt.getDescription().getP()) {
                description.append(p.getContent()).append(' ');
            }
        }

        StringBuilder claims = new StringBuilder();
        for (Claims cs : pt.getClaims()) {
            if (isEnglish(cs.getLang())) {
                for (Claim claim : cs.getClaim()) {
                    claims.append(claim.getClaimText()).append(' ');
                }
            }
        }

        Analyzer analyzer = createFieldAnalyzer();
        // Java forbids creating a generic array directly; this array only ever
        // holds Map<String, Integer> instances, so the unchecked assignment is safe.
        @SuppressWarnings("unchecked")
        Map<String, Integer>[] out = new Map[5];
        out[0] = transformation(analyzer.tokenStream(PatentDocument.Title, title));
        out[1] = ipcCodes;
        out[2] = transformation(analyzer.tokenStream(PatentDocument.Abstract, abstrac));
        out[3] = transformation(analyzer.tokenStream(PatentDocument.Description, description.toString()));
        out[4] = transformation(analyzer.tokenStream(PatentDocument.Claims, claims.toString()));
        return out;
    }

    /** Returns true when the language tag is "en" in any letter case; null is not English. */
    private static boolean isEnglish(String lang) {
        return "en".equalsIgnoreCase(lang);
    }

    /**
     * Builds the per-field analyzer: an EnglishAnalyzer for title, abstract,
     * description and claims (with field-specific stop words when
     * specificStopWords is set, the shared set otherwise) and a
     * StandardAnalyzer for any other field.
     */
    private Analyzer createFieldAnalyzer() {
        Map<String, Analyzer> analyzerPerField = new HashMap<>();
        if (specificStopWords) {
            analyzerPerField.put(PatentDocument.Title,
                    new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET));
            analyzerPerField.put(PatentDocument.Abstract,
                    new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET));
            analyzerPerField.put(PatentDocument.Description,
                    new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET));
            analyzerPerField.put(PatentDocument.Claims,
                    new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET));
        } else {
            analyzerPerField.put(PatentDocument.Title,
                    new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
            analyzerPerField.put(PatentDocument.Abstract,
                    new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
            analyzerPerField.put(PatentDocument.Description,
                    new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
            analyzerPerField.put(PatentDocument.Claims,
                    new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        }
        return new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_48), analyzerPerField);
    }

    /**
     * Consumes a token stream and counts how often each term occurs.
     *
     * <p>The stream is driven through the full consumer workflow
     * (reset, incrementToken loop, end, close), so it is closed when this
     * method returns, whether normally or with an exception.
     *
     * @param ts the token stream to consume; closed by this method
     * @return a map from term text to its number of occurrences
     * @throws IOException if the stream fails while being read or closed
     */
    private Map<String, Integer> transformation(TokenStream ts) throws IOException {
        Map<String, Integer> counts = new HashMap<>();
        try (TokenStream stream = ts) {
            CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String term = termAttribute.toString();
                Integer seen = counts.get(term);
                counts.put(term, seen == null ? 1 : seen + 1);
            }
            // Signals end-of-stream to the filters before the stream is closed.
            stream.end();
        }
        return counts;
    }

    /**
     * Scores each of the five application field vectors against the combined
     * granted-patent vector and stores the result at the same index of
     * {@code fieldsJaccardSimilarity}.
     */
    public void computeJaccardSimilarities() {
        for (int field = 0; field < 5; field++) {
            fieldsJaccardSimilarity[field] = jaccard(ptAppicationFieldsVectors[field], pt2FieldsVectors);
        }
    }

    /**
     * Frequency-weighted overlap of two term-frequency maps.
     *
     * <p>The numerator adds, for every term present in both maps, its
     * frequency in {@code v1} plus its frequency in {@code v2}. The
     * denominator is the sum of all frequencies of both maps.
     *
     * @param v1 first term-frequency map
     * @param v2 second term-frequency map
     * @return the ratio described above, or 0 when either map is empty
     */
    public double jaccard(Map<String, Integer> v1, Map<String, Integer> v2) {
        boolean bothPopulated = !v1.isEmpty() && !v2.isEmpty();
        if (!bothPopulated) {
            return 0;
        }
        double sharedMass = 0;
        double totalMass = 0;
        for (Map.Entry<String, Integer> entry : v1.entrySet()) {
            String term = entry.getKey();
            if (v2.containsKey(term)) {
                sharedMass += entry.getValue() + v2.get(term);
            }
            totalMass += entry.getValue();
        }
        for (Integer frequency : v2.values()) {
            totalMass += frequency;
        }
        return sharedMass / totalMass;
    }

    /**
     * Returns the five scores filled in by {@code computeJaccardSimilarities()}.
     * {@code main} prints them as title, classification, abstract, description
     * and claims, in index order.
     *
     * @return the internal score array itself, not a copy
     */
    public double[] getFieldsJaccardSimilarity() {
        return this.fieldsJaccardSimilarity;
    }

    /**
     * Command-line entry point: prints the five field scores between a patent
     * application and a granted patent.
     *
     * @param args optional positional arguments; any that are missing fall
     *        back to the built-in defaults:
     *        {@code args[0]} patent application XML file,
     *        {@code args[1]} granted patent file name as stored in the index,
     *        {@code args[2]} Lucene index directory
     * @throws ParseException propagated from the constructor's signature
     */
    public static void main(String[] args) throws ParseException {
        String q = argOrDefault(args, 0,
                "/Volumes/Macintosh HD/Users/rbouadjenek/Documents/Patent-Project/CLEF-IP 2010/PAC_test/topics/PAC-1269_EP-1525886-A1.xml");
        String doc = argOrDefault(args, 1, "UN-EP-1215817");
        String indexDir = argOrDefault(args, 2,
                "/Volumes/Macintosh HD/Users/rbouadjenek/Documents/Patent-Project/Dev/indexWithoutSW-Vec-CLEF-IP2010-2.0/");

        try {
            FieldsJaccardSimilarities sim = new FieldsJaccardSimilarities(indexDir, q, doc, true);
            sim.computeJaccardSimilarities();
            double[] scores = sim.getFieldsJaccardSimilarity();
            System.out.println("Title: " + scores[0]);
            System.out.println("Classification: " + scores[1]);
            System.out.println("Abstract: " + scores[2]);
            System.out.println("Description: " + scores[3]);
            System.out.println("Claims: " + scores[4]);
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /** Returns args[index] when it is present, otherwise the given fallback. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return (args != null && args.length > index) ? args[index] : fallback;
    }

}