context.core.tokenizer.SemanticAnnotation.java Source code

Java tutorial

Introduction

Here is the source code for context.core.tokenizer.SemanticAnnotation.java. The class wraps a single shared Stanford CoreNLP pipeline (tokenize, ssplit, parse) and exposes three entry points: tokenize(), which converts each sentence's collapsed dependency graph into CustomEdge records keyed by word1/word2/docId/sentenceIndex; tokenizeSPO(), which extracts subject-predicate-object (SPO) triples and flattens them into "SP" and "PO" edges; and tokenizeSPOStructure(), which returns the raw SPO token lists per sentence.

Source

/*
 * Copyright (c) 2015 University of Illinois Board of Trustees, All rights reserved.
 * Developed at GSLIS/ the iSchool, by Dr. Jana Diesner, Amirhossein Aleyasen,
 * Chieh-Li Chin, Shubhanshu Mishra, Kiumars Soltani, and Liang Tao.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, see <http://www.gnu.org/licenses>.
 */
package context.core.tokenizer;

import context.core.util.JavaIO;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.util.CoreMap;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.openide.util.Exceptions;

/**
 *
 * @author Aale
 */
public class SemanticAnnotation {

    private static StanfordCoreNLP pipeline;

    static {
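        // One shared pipeline for the whole class; the "parse" annotator also
        // produces the dependency graphs that tokenize() reads below.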
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, parse");
        pipeline = new StanfordCoreNLP(props);
    }

    /**
     * Parses {@code text}, prints each sentence's SPO triples, and returns the
     * extracted subject/predicate/object token lists.
     *
     * @param text the raw document text to parse
     * @param docId an identifier recorded with every extracted structure
     * @return a map from {@code docId/sentenceIndex/spoIndex} keys to the
     *         tokens of each SPO triple, in subject, predicate, object order
     */
    public static Map<String, List<CustomToken>> tokenizeSPOStructure(String text, String docId) {
        Map<String, List<CustomToken>> sent_spo_map = new LinkedHashMap<>();
        Annotation document = new Annotation(text);
        pipeline.annotate(document);

        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        int sentIndex = 0;
        for (CoreMap sentence : sentences) {
            System.out.println("sent-" + sentIndex + ": " + sentence);
            final List<SPOStructure> spo_list = SPOExtractor.extractSPOs(sentence, docId, sentIndex);
            int spoIndex = 0;
            for (SPOStructure spo : spo_list) {
                System.out.println(spo);
                // Collect this triple's tokens (subjects, then predicate, then
                // objects) so callers actually receive the extracted structures.
                List<CustomToken> spoTokens = new ArrayList<>(spo.getSubjects());
                spoTokens.add(spo.predicate);
                spoTokens.addAll(spo.getObjects());
                sent_spo_map.put(docId + "/" + sentIndex + "/" + spoIndex, spoTokens);
                spoIndex++;
            }
            System.out.println();
            sentIndex++;
        }
        return sent_spo_map;
    }

    /**
     * Parses {@code text}, extracts SPO triples per sentence, and flattens
     * them into subject-predicate ("SP") and predicate-object ("PO") edges.
     *
     * @param text the raw document text to parse
     * @param docId an identifier recorded on every generated edge
     * @return edges keyed by {@code word1/word2/docId/sentenceIndex}
     */
    public static Map<String, CustomEdge> tokenizeSPO(String text, String docId) {
        System.out.println("starting tokenizeSPO...");
        Map<String, CustomEdge> customEdges = new LinkedHashMap<>();
        Annotation document = new Annotation(text);
        pipeline.annotate(document);

        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        System.out.println("core annotation done, start analyzing results...");
        int sentIndex = 0;
        for (CoreMap sentence : sentences) {
            final List<SPOStructure> spo_list = SPOExtractor.extractSPOs(sentence, docId, sentIndex);
            customEdges.putAll(generateEdges(spo_list, docId, sentIndex));
            sentIndex++;
        }
        System.out.println(customEdges);
        System.out.println("customEdge#" + customEdges.size());
        return customEdges;
    }

    /**
     * Parses {@code text} and converts each sentence's collapsed,
     * CC-processed dependency graph into {@link CustomEdge} records.
     *
     * @param text the raw document text to parse
     * @param docId an identifier recorded on every generated edge
     * @return edges keyed by {@code word1/word2/docId/sentenceIndex}
     */
    public static Map<String, CustomEdge> tokenize(String text, String docId) {
        Map<String, CustomEdge> customEdges = new LinkedHashMap<>();
        Annotation document = new Annotation(text);
        pipeline.annotate(document);

        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        int sentIndex = 0;
        for (CoreMap sentence : sentences) {
            // Traverse the sentence's dependency edges (not its tokens) and
            // record each grammatical relation as a CustomEdge.
            int index = 0;

            SemanticGraph dependencies = sentence
                    .get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
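            // The collapsed, CC-processed dependency graph exposes one edge
            // per grammatical relation, e.g. nsubj(eats-3, cat-2).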
            for (SemanticGraphEdge edge : dependencies.edgeListSorted()) {

                CustomEdge cedge = new CustomEdge();
                cedge.setDocId(docId);
                cedge.setSentenceIndex(sentIndex);
                cedge.setIndex(index);
                cedge.setWord1(removePOS(edge.getSource().toString()));
                cedge.setWord2(removePOS(edge.getTarget().toString()));
                cedge.setType(edge.getRelation().toString());
                customEdges.put(cedge.getWord1() + "/" + cedge.getWord2() + "/" + cedge.getDocId() + "/"
                        + cedge.getSentenceIndex(), cedge);
                index++;
            }

            sentIndex++;
        }
        return customEdges;
    }

    /**
     * Reads {@code data\deep-parsing\data.txt}, extracts SPO structures from
     * it, and prints each triple's tokens.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String text;
        try {
            text = JavaIO.readFile(new File("data\\deep-parsing\\data.txt"));
        } catch (IOException ex) {
            Exceptions.printStackTrace(ex);
            // Nothing to parse without input; return rather than dereference
            // a null text below.
            return;
        }

        System.out.println();
        text = text.replace('\n', ' ');
        final Map<String, List<CustomToken>> tokens = SemanticAnnotation.tokenizeSPOStructure(text, "1");
        for (String key : tokens.keySet()) {
            System.out.print(key + "\nS-P-O: ");
            StringBuilder str = new StringBuilder();
            for (CustomToken token : tokens.get(key)) {
                if (str.length() > 0) {
                    str.append(" - ");
                }
                // A slot of the triple can be missing; print a placeholder.
                if (token == null || token.getWord() == null) {
                    str.append("N/A");
                } else {
                    str.append(token.getWord());
                }
            }
            System.out.println(str);
            System.out.println();
        }
    }

    /**
     * Small demo of {@link #tokenize}: prints every dependency edge extracted
     * from two example sentences.
     *
     * @param args unused
     */
    public static void main2(String[] args) {
        String text = "The cat eats a mouse. She goes to the university.";
        final Map<String, CustomEdge> tokens = SemanticAnnotation.tokenize(text, "1");
        for (String key : tokens.keySet()) {
            System.out.println(key + "\t" + tokens.get(key));
        }
    }

    /**
     * Strips the positional suffix that CoreNLP appends to a token's string
     * form, e.g. {@code "cat-2"} becomes {@code "cat"}.
     *
     * @param word a token rendered as {@code word-index}
     * @return the word without the suffix after its last dash
     */
    public static String removePOS(String word) {
        int lastDashIndex = word.lastIndexOf("-");
        if (lastDashIndex == -1) {
            return word;
        } else {
            return word.substring(0, lastDashIndex);
        }
    }

    private static Map<String, CustomEdge> generateEdges(List<SPOStructure> spo_list, String docId, int sentIndex) {
        Map<String, CustomEdge> customEdges = new LinkedHashMap<>();
        int index = 0;
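        // Flatten each triple into two directed edges: subject -> predicate
        // (type "SP") and predicate -> object (type "PO").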
        for (SPOStructure spo : spo_list) {
            for (CustomToken subj : spo.getSubjects()) {
                CustomEdge cedge = new CustomEdge();
                cedge.setDocId(docId);
                cedge.setSentenceIndex(sentIndex);
                cedge.setIndex(index);
                cedge.setWord1(subj.word);
                cedge.setWord2(spo.predicate.word);
                cedge.setType("SP");
                customEdges.put(cedge.getWord1() + "/" + cedge.getWord2() + "/" + cedge.getDocId() + "/"
                        + cedge.getSentenceIndex(), cedge);
                index++;
            }

            for (CustomToken obj : spo.getObjects()) {
                CustomEdge cedge = new CustomEdge();
                cedge.setDocId(docId);
                cedge.setSentenceIndex(sentIndex);
                cedge.setIndex(index);
                cedge.setWord1(spo.predicate.word);
                cedge.setWord2(obj.word);
                cedge.setType("PO");
                customEdges.put(cedge.getWord1() + "/" + cedge.getWord2() + "/" + cedge.getDocId() + "/"
                        + cedge.getSentenceIndex(), cedge);
                index++;
            }
        }

        return customEdges;
    }
}
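
A minimal usage sketch follows (the demo class name SemanticAnnotationDemo is hypothetical; it assumes the file sits in the same package as SemanticAnnotation and CustomEdge, and that CustomEdge prints something readable, as main2 above already relies on):

package context.core.tokenizer;

import java.util.Map;

public class SemanticAnnotationDemo {

    public static void main(String[] args) {
        // Keys have the form word1/word2/docId/sentenceIndex, so a repeated
        // word pair within one sentence collapses into a single edge.
        Map<String, CustomEdge> edges =
                SemanticAnnotation.tokenize("The cat eats a mouse.", "doc-1");
        for (Map.Entry<String, CustomEdge> entry : edges.entrySet()) {
            System.out.println(entry.getKey() + "\t" + entry.getValue());
        }
    }
}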