edu.cornell.law.entitylinking.utils.Utility.java Source code

Introduction

Here is the source code for edu.cornell.law.entitylinking.utils.Utility.java
Source

package edu.cornell.law.entitylinking.utils;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;

/**
 * Static helpers for part-of-speech tagging and noun-phrase extraction built on Stanford CoreNLP.
 *
 * <p>{@link #runCoreNLP()} must be called once before any other method: it is the only place
 * where the shared tagger and pipeline are created. Calling a tagging or parsing method before
 * that fails with a {@link NullPointerException}.
 *
 * <p>Typical use:
 * <pre>
 *   Utility.runCoreNLP();
 *   List&lt;String&gt; nouns = Utility.findPOSTags("An animal subjected to an official test.");
 *   List&lt;String&gt; phrases = Utility.getAllNounPhrases("An animal subjected to an official test.");
 * </pre>
 */
public class Utility {

    /**
     * Characters (not a regular expression) at which a paragraph is cut into fragments before
     * each fragment is handed to the parser.
     */
    private static final String FRAGMENT_DELIMITERS = ".;?,:";

    /** Separator at which a noun phrase is split into alternatives. */
    private static final String ALTERNATIVE_SEPARATOR = " or ";

    /** A leading English article ("the", "a", "an") followed by whitespace, in any letter case. */
    private static final Pattern LEADING_ARTICLE =
            Pattern.compile("^(?:the|an|a)\\s+", Pattern.CASE_INSENSITIVE);

    private static MaxentTagger tagger;
    private static StanfordCoreNLP pipeline;
    private static final Properties props = new Properties();

    /**
     * Creates the shared POS tagger and the CoreNLP pipeline
     * (tokenize, ssplit, pos, lemma, parse). Must be called before any other method.
     */
    public static void runCoreNLP() {
        tagger = new MaxentTagger(
                "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse");
        pipeline = new StanfordCoreNLP(props);
    }

    /**
     * Adds POS tags to the given text.
     *
     * @param paragraph text to tag
     * @return the tagger's output for {@code paragraph}; {@link #findPOSTags(String)} relies on
     *         it being whitespace-separated {@code word_TAG} tokens
     */
    public static String getPOSTagging(String paragraph) {
        return tagger.tagString(paragraph);
    }

    /**
     * Returns the words of {@code paragraph} that are tagged as nouns or foreign words.
     *
     * <p>A word is selected when its tag starts with {@code NN} or is exactly {@code FW}.
     * Only the tag is inspected, never the word itself, and the tag is taken after the
     * <em>last</em> underscore so words that contain underscores are returned intact.
     *
     * @param paragraph text to analyse; {@code null} or blank yields an empty list
     * @return the selected words in order of appearance, never {@code null}
     */
    public static List<String> findPOSTags(String paragraph) {
        List<String> potentialEntities = new ArrayList<String>();
        if (paragraph == null || paragraph.trim().isEmpty()) {
            return potentialEntities;
        }
        String taggedParagraph = Utility.getPOSTagging(paragraph);
        for (String taggedWord : taggedParagraph.trim().split("\\s+")) {
            int separator = taggedWord.lastIndexOf('_');
            if (separator <= 0) {
                // Either no tag at all or an empty word: nothing usable in this token.
                continue;
            }
            String tag = taggedWord.substring(separator + 1);
            if (tag.startsWith("NN") || tag.equals("FW")) {
                potentialEntities.add(taggedWord.substring(0, separator));
            }
        }
        return potentialEntities;
    }

    /**
     * Extracts the innermost noun phrases of {@code paragraph}: NP/WHNP constituents that do not
     * themselves contain another NP or WHNP.
     *
     * <p>A leading article is removed from each phrase and phrases containing {@code " or "} are
     * split into their alternatives. If the JVM runs out of memory while parsing, a message is
     * printed and the phrases collected so far are returned.
     *
     * @param paragraph text to analyse; {@code null} yields an empty list
     * @return the extracted phrases in parse order, never {@code null}
     */
    public static List<String> getInnerNounPhrases(String paragraph) {
        return collectNounPhrases(paragraph, true);
    }

    /**
     * Extracts every noun phrase (NP/WHNP constituent) of {@code paragraph}, including phrases
     * nested inside other phrases.
     *
     * <p>A leading article is removed from each phrase and phrases containing {@code " or "} are
     * split into their alternatives. If the JVM runs out of memory while parsing, a message is
     * printed and the phrases collected so far are returned.
     *
     * @param paragraph text to analyse; {@code null} yields an empty list
     * @return the extracted phrases in parse order, never {@code null}
     */
    public static List<String> getAllNounPhrases(String paragraph) {
        return collectNounPhrases(paragraph, false);
    }

    /**
     * Shared traversal behind both noun-phrase extractors: cuts the paragraph into fragments,
     * parses each one and collects the text of its NP/WHNP constituents.
     */
    private static List<String> collectNounPhrases(String paragraph, boolean innermostOnly) {
        List<String> nounPhrases = new ArrayList<String>();
        if (paragraph == null) {
            return nounPhrases;
        }
        try {
            StringTokenizer fragments = new StringTokenizer(paragraph, FRAGMENT_DELIMITERS);
            while (fragments.hasMoreTokens()) {
                Annotation document = new Annotation(fragments.nextToken());
                pipeline.annotate(document);
                // A CoreMap is a map keyed by annotation class; one entry per detected sentence.
                List<CoreMap> sentences = document.get(SentencesAnnotation.class);
                if (sentences == null) {
                    continue;
                }
                for (CoreMap sentence : sentences) {
                    Tree parse = sentence.get(TreeAnnotation.class);
                    if (parse == null) {
                        continue;
                    }
                    // Iterating a Tree visits the node itself and all of its descendants.
                    for (Tree node : parse) {
                        if (!isNounPhrase(node)) {
                            continue;
                        }
                        String text;
                        if (innermostOnly) {
                            String bracketed = node.toString();
                            if (containsNestedNounPhrase(bracketed)) {
                                continue;
                            }
                            text = leafText(bracketed);
                        } else {
                            text = Sentence.listToString(node.yieldWords())
                                    .replace(" -LRB- ", "(").replace(" -RRB- ", ")");
                        }
                        addAlternatives(nounPhrases, stripLeadingArticle(text.trim()));
                    }
                }
            }
        } catch (OutOfMemoryError e) {
            // Deliberate best effort: report and fall through with whatever was collected.
            System.out.println("Result too long to read into memory");
        }
        return nounPhrases;
    }

    /** Tells whether the node's category is exactly NP or WHNP. */
    private static boolean isNounPhrase(Tree node) {
        String category = node.label().value();
        return "NP".equals(category) || "WHNP".equals(category);
    }

    /**
     * Tells whether a bracketed NP/WHNP string contains a further "(NP" or "(WHNP" after its own
     * root label. The check is textual, so labels that merely start with NP also count.
     */
    private static boolean containsNestedNounPhrase(String bracketed) {
        int rootLabelEnd = bracketed.indexOf(' ');
        if (rootLabelEnd < 0) {
            return false;
        }
        String belowRoot = bracketed.substring(rootLabelEnd);
        return belowRoot.contains("(NP") || belowRoot.contains("(WHNP");
    }

    /** Joins the words of a bracketed tree string, dropping category labels and closing brackets. */
    private static String leafText(String bracketed) {
        StringBuilder words = new StringBuilder();
        for (String token : bracketed.split(" ")) {
            if (token.contains("(")) {
                // Tokens with an opening bracket are category labels, not words.
                continue;
            }
            if (words.length() > 0) {
                words.append(' ');
            }
            words.append(token.replace(")", ""));
        }
        return words.toString();
    }

    /** Removes one leading "the", "a" or "an" (any case) when it is followed by whitespace. */
    private static String stripLeadingArticle(String phrase) {
        return LEADING_ARTICLE.matcher(phrase).replaceFirst("");
    }

    /** Adds the phrase to the target list, split at " or ", skipping parts that are empty. */
    private static void addAlternatives(List<String> target, String phrase) {
        for (String alternative : phrase.split(ALTERNATIVE_SEPARATOR)) {
            String cleaned = alternative.trim();
            if (!cleaned.isEmpty()) {
                target.add(cleaned);
            }
        }
    }

}