coreferenceresolver.util.Util.java Source code

Introduction

Here is the source code for coreferenceresolver.util.Util.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package coreferenceresolver.util;

import coreferenceresolver.element.CRFToken;
import coreferenceresolver.element.CorefChain;
import coreferenceresolver.element.NounPhrase;
import coreferenceresolver.element.Review;
import coreferenceresolver.element.Sentence;
import coreferenceresolver.process.FeatureExtractor;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.trees.CollinsHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

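/**
 * Static helpers for the coreference resolver: noun-phrase filtering, markup-file
 * reading and writing, and generation of pairwise noun-phrase feature rows.
 *
 * @author TRONGNGHIA
 */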
public class Util {

    // Sentiment polarity codes returned by retrieveOriginalSentiment / reverseSentiment.
    public static int POSITIVE = 1;
    public static int NEGATIVE = -1;
    public static int NEUTRAL = 0;

    // Forms of the verb "to be". Not referenced by any method in this class.
    private static final ArrayList<String> TO_BES = new ArrayList<String>(
            Arrays.asList("is", "'s", "are", "'re", "was", "were", "been", "be"));

    // The word lists below are ';'-delimited on both sides so that a lookup of
    // ";" + word + ";" via String.contains only ever matches whole entries.

    // Head words of person-referring noun phrases that are dropped.
    private static final String DISCARDED_PERSONAL_PRONOUNS = ";i;me;myself;we;us;ourselves;you;yourself;yourselves;he;him;himself;she;her;herself;anyone;someone;somebody;everyone;anybody;everybody;nobody;people;";

    // Head words of time-referring noun phrases that are dropped.
    private static final String DISCARDED_TIME_NOUNS = ";minute;minutes;hour;hours;day;days;week;weeks;month;months;year;years;january;february;march;april;may;june;july;august;september;october;november;december;monday;tuesday;wednesday;thursday;friday;saturday;sunday;today;yesterday;tomorrow;";

    // Head words treated as stop words.
    private static final String DISCARDED_STOP_WORDS = ";there;etc;oh;";

    //private static final String DISCARDED_NUMBER_NOUN_POS = "CD"; //one, two, three
    // Head words of quantity-referring noun phrases that are dropped.
    private static final String DISCARDED_QUANTITY_NOUNS = ";lot;lots;number;total;amount;little;much;many;ton;tons;plenty;some;bit;a;";

    // Clock-time literals: "10:30", "5PM" / "5 PM", or a bare "AM"/"PM".
    // The alternatives must not be separated by spaces: whitespace is a literal
    // in a Java regex, and this pattern is applied with String.matches, so the
    // previous " | " made the last two alternatives impossible to satisfy.
    private static final String DISCARDED_TIME_REGEX = "([0-9]+:[0-9]+)|([0-9]+[ ]*(AM|PM))|(AM|PM)";

    // Dependency relation names. Not referenced by any method in this class.
    private static final String DEP_RELATIONS = ";nn;acomp;advmod;amod;det;dobj;infmod;iobj;measure;nsubj;nsubjpass;partmod;prep;rcmod;xcomp;xsubj;";

    // Scratch state shared between the pair-enumeration methods and the row
    // writers (createTest / createInstance). It is reset for every anaphor NP,
    // so these methods are not safe to call concurrently.

    // False when the current anaphor NP has no opinion words (PMI feature then gets code 10).
    private static Boolean checkNPhasOW = false;
    //Each PMI appears 1 times.
    private static ArrayList<Float> listAllPMI = new ArrayList<Float>();
    //List PMI of each NP2 with NP1
    private static ArrayList<Float> listRawPMI = new ArrayList<Float>();

    // Dataset text held via setDataset / getDataset; null until one is stored.
    private static String sDataset = null;

    /**
     * Returns the dataset text most recently stored via {@link #setDataset(String)}.
     *
     * @return the stored dataset string; {@code null} if none has been set
     */
    public static String getDataset() {
        return sDataset;
    }

    /**
     * Stores the dataset text that {@link #getDataset()} will return.
     *
     * @param s the dataset text to keep; replaces any previously stored value
     */
    public static void setDataset(String s) {
        sDataset = s;
    }

    /**
     * Writes one feature row for every candidate noun-phrase pair of a review.
     *
     * <p>Noun phrases are visited from last to first; each one (np2) is paired with
     * every noun phrase that precedes it in the review's list (np1). A pair is
     * emitted only when at least one of the two has type 0 or 3. Before the rows of
     * an np2 are written, the shared PMI scratch lists are rebuilt for that np2 so
     * that {@code createTest} can look up the PMI rank of each pair by position.
     *
     * @param review      the review whose noun phrases are paired
     * @param bw          destination for the feature rows
     * @param forTraining not used by this method
     * @throws IOException if writing a row fails
     */
    public static void extractFeatures(Review review, BufferedWriter bw, boolean forTraining) throws IOException {
        //Set Opinion Words for Noun Phrases
        //        for (int i = 0; i < review.getSentences().size(); i++) {
        //            FeatureExtractor.setNPForOPInSentence(review.getSentences().get(i));
        //        }

        final List<NounPhrase> nps = review.getNounPhrases();

        for (int anaphorIdx = nps.size() - 1; anaphorIdx >= 1; anaphorIdx--) {
            checkNPhasOW = true;
            final NounPhrase np2 = nps.get(anaphorIdx);

            // Fresh PMI scratch state for this np2.
            listAllPMI.clear();
            listRawPMI.clear();

            if (!np2.getOpinionWords().isEmpty()) {
                // Raw PMI per eligible pair (in emission order) plus the distinct values.
                for (int antecedentIdx = anaphorIdx - 1; antecedentIdx >= 0; antecedentIdx--) {
                    final NounPhrase np1 = nps.get(antecedentIdx);
                    final int type1 = np1.getType();
                    final int type2 = np2.getType();
                    if (type1 == 0 || type2 == 0 || type1 == 3 || type2 == 3) {
                        final Float pairPmi = FeatureExtractor.PMI(np2, np1);
                        listRawPMI.add(pairPmi);
                        if (!listAllPMI.contains(pairPmi)) {
                            listAllPMI.add(pairPmi);
                        }
                    }
                }
                // Highest PMI first, so an index into this list is a rank.
                Collections.sort(listAllPMI, Collections.reverseOrder());
            } else {
                // No opinion words: the PMI feature is replaced by a fixed code downstream.
                System.out.print(np2.getNpNode().getLeaves().toString());
                checkNPhasOW = false;
            }

            // Emit the rows; pairOrdinal indexes listRawPMI in the same order it was filled.
            int pairOrdinal = 0;
            for (int antecedentIdx = anaphorIdx - 1; antecedentIdx >= 0; antecedentIdx--) {
                final NounPhrase np1 = nps.get(antecedentIdx);
                final int type1 = np1.getType();
                final int type2 = np2.getType();
                if (type1 == 0 || type2 == 0 || type1 == 3 || type2 == 3) {
                    createTest(np1, np2, review, bw, pairOrdinal);
                    pairOrdinal++;
                }
            }
        }
    }

    /**
     * Writes the review text as one markup line in which every noun phrase is
     * wrapped as {@code <index,0,0 phrase text>}.
     *
     * <p>The line is built in three passes: (1) insert a {@code '<'} in front of each
     * noun phrase, (2) for the i-th noun phrase, find its text after the i-th
     * {@code '<'} and insert a {@code '>'} behind it, (3) append {@code "index,0,0 "}
     * after every {@code '<'}.
     *
     * <p>NOTE(review): pass 1 adds the list index to each begin offset, which assumes
     * the noun phrases are ordered by begin offset; passes 2 and 3 count every
     * {@code '<'} in the text, which assumes the raw content contains none - confirm.
     *
     * @param review the review to render
     * @param fw     writer that receives the markup line followed by {@code "\n"}
     * @throws IOException if writing fails
     */
    public static void initMarkupFile(Review review, FileWriter fw) throws IOException {

        // Pass 1: opening markers. Each earlier insertion shifts later offsets by one.
        StringBuilder opened = new StringBuilder(review.getRawContent());
        for (int i = 0; i < review.getNounPhrases().size(); ++i) {
            NounPhrase curNp = review.getNounPhrases().get(i);
            opened.insert(curNp.getOffsetBegin() + i, "<");
        }
        String markupReview = opened.toString();

        // Pass 2: closing markers.
        for (int i = 0; i < review.getNounPhrases().size(); ++i) {
            // Positions of all '<' are recomputed because the previous round may have shifted them.
            List<Integer> openNpOffsets = new ArrayList<>();
            for (int at = markupReview.indexOf('<'); at >= 0; at = markupReview.indexOf('<', at + 1)) {
                openNpOffsets.add(at);
            }

            NounPhrase curNp = review.getNounPhrases().get(i);
            String rawNp = review.getRawContent().substring(curNp.getOffsetBegin(), curNp.getOffsetEnd());
            Pattern npPattern = Pattern.compile(specialRegex(rawNp));

            int openAt = openNpOffsets.get(i);
            String tail = markupReview.substring(openAt);
            Matcher npMatcher = npPattern.matcher(tail);
            if (npMatcher.find()) {
                String matched = npMatcher.group();
                // Position in the tail just behind the matched noun-phrase text.
                int closeAt = tail.indexOf(matched) + matched.length();
                markupReview = markupReview.substring(0, openAt) + tail.substring(0, closeAt) + ">"
                        + tail.substring(closeAt);
            }
            // When the phrase is not found the text is left without a closing marker.
        }

        // Pass 3: number the opening markers and add the two placeholder fields.
        StringBuilder tagged = new StringBuilder(markupReview.length());
        int npIndex = -1;
        for (int pos = 0; pos < markupReview.length(); ++pos) {
            char current = markupReview.charAt(pos);
            tagged.append(current);
            if (current == '<') {
                ++npIndex;
                tagged.append(npIndex).append(",0,0 ");
            }
        }

        fw.write(tagged.toString());
        fw.write("\n");
    }

    /**
     * Reads a markup file line by line and applies the annotations of line
     * {@code n} (0-based) to {@code reviews.get(n)}.
     *
     * <p>The reader is closed when the method returns or throws.
     *
     * @param reviews    reviews to update; must contain at least one entry per line of the file
     * @param markupFile file with one markup line per review
     * @throws FileNotFoundException if {@code markupFile} cannot be opened
     * @throws IOException           if reading fails
     */
    public static void readMarkupFile(List<Review> reviews, File markupFile)
            throws FileNotFoundException, IOException {
        // try-with-resources: the original never closed the reader.
        try (BufferedReader br = new BufferedReader(new FileReader(markupFile))) {
            String line;
            int reviewId = 0;
            while ((line = br.readLine()) != null) {
                readMarkup(reviews, line, reviewId);
                ++reviewId;
            }
        }
    }

    /**
     * Applies the annotations of one markup line to the noun phrases of a review.
     *
     * <p>Every {@code '<'} starts a tag that runs up to the next space (or the end of
     * the line) and has the form {@code <index,refId,type}. The k-th tag on the line
     * sets the reference id and type of the k-th noun phrase of the review; the
     * index field of the tag itself is not used.
     *
     * @param reviews    all reviews
     * @param markupLine the markup line belonging to {@code reviews.get(reviewId)}
     * @param reviewId   index of the review to update
     */
    private static void readMarkup(List<Review> reviews, String markupLine, int reviewId) {
        final List<NounPhrase> nounPhrases = reviews.get(reviewId).getNounPhrases();
        final int lineLength = markupLine.length();
        int npId = 0;
        int charId = 0;

        while (charId < lineLength) {
            if (markupLine.charAt(charId) != '<') {
                ++charId;
                continue;
            }

            // The tag ends at the first space at or after '<', or at the end of the line.
            int tagEnd = markupLine.indexOf(' ', charId);
            if (tagEnd < 0) {
                tagEnd = lineLength;
            }

            // Field 0 is "<index"; fields 1 and 2 carry the values that are stored.
            String[] corefInfos = markupLine.substring(charId, tagEnd).split(",");
            int refId = Integer.parseInt(corefInfos[1]);
            int type = Integer.parseInt(corefInfos[2]);

            NounPhrase target = nounPhrases.get(npId);
            System.out.println("NP " + target.getReviewId() + " " + target.getId()
                    + " type " + type);
            target.setRefId(refId);
            target.setType(type);

            ++npId;
            charId = tagEnd;
        }
    }

    /**
     * Removes noun phrases rejected by any of the {@code isDiscarded...} filters and
     * renumbers the remaining ones.
     *
     * <p>A rejected noun phrase is removed from the review's list and, via a lookup
     * by id, from the list of the sentence it belongs to. Afterwards the ids of the
     * surviving noun phrases are reset to their position in the review's list.
     *
     * @param review the review to filter in place
     */
    public static void discardUnneccessaryNPs(Review review) {
        Iterator<NounPhrase> reviewNps = review.getNounPhrases().iterator();

        while (reviewNps.hasNext()) {
            NounPhrase np = reviewNps.next();

            boolean discard = isDiscardedPersonalPronounNP(np) || isDiscardedTimeNP(np)
                    || isDiscardedCurrencyNP(np) || isDiscardedStopWordNP(np)
                    || isDiscardedQuantityNP(np) || isDiscardedPercentageNP(np)
                    || isDiscardedWrongCase(np);
            if (!discard) {
                continue;
            }

            reviewNps.remove();

            // Drop the same noun phrase (first id match only) from its sentence.
            for (Iterator<NounPhrase> sentenceNps = review.getSentences().get(np.getSentenceId())
                    .getNounPhrases().iterator(); sentenceNps.hasNext();) {
                if (sentenceNps.next().getId() == np.getId()) {
                    sentenceNps.remove();
                    break;
                }
            }
        }

        // Re-number so that id == index in the review's list.
        int nextId = 0;
        for (NounPhrase survivor : review.getNounPhrases()) {
            survivor.setId(nextId);
            nextId++;
        }
    }

    /**
     * Tells whether the head word of a noun phrase is one of the personal pronouns
     * or person words listed in {@code DISCARDED_PERSONAL_PRONOUNS}.
     *
     * @param np the noun phrase to test
     * @return {@code true} if the head word (compared case-insensitively) is listed;
     *         {@code false} otherwise or when the noun phrase has no head node
     */
    private static boolean isDiscardedPersonalPronounNP(NounPhrase np) {
        if (np.getHeadNode() == null) {
            return false;
        }
        // Locale.ROOT keeps the mapping "I" -> "i" stable; the default-locale
        // overload yields a dotless i under a Turkish locale and would miss ";i;".
        String head = np.getHeadNode().value().toLowerCase(Locale.ROOT);
        return DISCARDED_PERSONAL_PRONOUNS.contains(";" + head + ";");
    }

    /**
     * Tells whether the head word of a noun phrase contains a percent sign.
     *
     * @param np the noun phrase to test
     * @return {@code true} if a head node exists and its value contains {@code "%"}
     */
    private static boolean isDiscardedPercentageNP(NounPhrase np) {
        if (np.getHeadNode() == null) {
            return false;
        }
        return np.getHeadNode().value().contains("%");
    }

    /**
     * Tells whether a noun phrase refers to a time and should be dropped.
     *
     * <p>A noun phrase is dropped when its head word
     * <ul>
     * <li>matches {@code DISCARDED_TIME_REGEX} or is "am"/"pm",</li>
     * <li>is listed in {@code DISCARDED_TIME_NOUNS}, or</li>
     * <li>is "time"/"times" and some "time"/"times" token of the phrase is either the
     *     first token or is preceded by a token whose part-of-speech tag is neither
     *     {@code NN} nor {@code NNS} (e.g. "the first time").</li>
     * </ul>
     * Word comparisons are case-insensitive. The token scan used to be
     * case-sensitive while the head check was not, so a head such as "Time" was
     * never dropped; both now use the same lower-cased comparison.
     *
     * @param np the noun phrase to test
     * @return {@code true} if the noun phrase should be dropped; {@code false}
     *         otherwise or when it has no head node
     */
    private static boolean isDiscardedTimeNP(NounPhrase np) {
        if (np.getHeadNode() == null) {
            return false;
        }
        final String head = np.getHeadNode().value();
        final String headLower = head.toLowerCase(Locale.ROOT);

        //Discard NP with HOUR LITERAL
        if (head.matches(DISCARDED_TIME_REGEX) || headLower.equals("am") || headLower.equals("pm")) {
            return true;
        }

        //Discard NP with TIME LITERAL
        if (DISCARDED_TIME_NOUNS.contains(";" + headLower + ";")) {
            return true;
        }

        //Discard NP with HEAD "time": the first time, the second time, ...
        if (!headLower.equals("time") && !headLower.equals("times")) {
            return false;
        }

        // Fetch the leaves once instead of on every access.
        final List<Tree> leaves = np.getNpNode().getLeaves();
        for (int i = 0; i < leaves.size(); ++i) {
            String leafLower = leaves.get(i).value().toLowerCase(Locale.ROOT);
            if (!leafLower.equals("time") && !leafLower.equals("times")) {
                continue;
            }
            //NP starts with "time"
            if (i == 0) {
                return true;
            }
            String tokenBeforePOS = ((CoreLabel) leaves.get(i - 1).label())
                    .get(CoreAnnotations.PartOfSpeechAnnotation.class);
            // Constant-first equals: a token without a POS tag counts as "not a noun"
            // instead of raising a NullPointerException.
            if (!"NN".equals(tokenBeforePOS) && !"NNS".equals(tokenBeforePOS)) {
                return true;
            }
        }

        return false;
    }

    //Discard all stop words: there, etc, oh, ...
    /**
     * Tells whether the head word of a noun phrase is listed in
     * {@code DISCARDED_STOP_WORDS}.
     *
     * @param np the noun phrase to test
     * @return {@code true} if a head node exists and its lower-cased value is listed
     */
    private static boolean isDiscardedStopWordNP(NounPhrase np) {
        if (np.getHeadNode() == null) {
            return false;
        }
        final String lookupKey = ";" + np.getHeadNode().value().toLowerCase() + ";";
        return DISCARDED_STOP_WORDS.contains(lookupKey);
    }

    //Discard all NP indicating quantity: lot, lots, number, total
    /**
     * Tells whether the head word of a noun phrase is a quantity word listed in
     * {@code DISCARDED_QUANTITY_NOUNS}.
     *
     * @param np the noun phrase to test
     * @return {@code true} if the head word (compared case-insensitively) is listed;
     *         {@code false} otherwise or when the noun phrase has no head node
     */
    public static boolean isDiscardedQuantityNP(NounPhrase np) {
        if (np.getHeadNode() == null) {
            return false;
        }
        // Locale.ROOT: default-locale lower-casing turns "LITTLE"/"BIT" into
        // dotless-i forms under a Turkish locale, which would miss the list.
        String head = np.getHeadNode().value().toLowerCase(Locale.ROOT);
        return DISCARDED_QUANTITY_NOUNS.contains(";" + head + ";");
    }

    //Discard NP type " 's "
    /**
     * Tells whether a noun phrase is a mis-extracted phrase, recognised by a head
     * label of {@code "POS"} or {@code "RB"}.
     *
     * @param np the noun phrase to test
     * @return {@code true} if a head node exists and the head label is one of the
     *         two values above
     */
    public static boolean isDiscardedWrongCase(NounPhrase np) {
        if (np.getHeadNode() == null) {
            return false;
        }
        return np.getHeadLabel().equals("POS") || np.getHeadLabel().equals("RB");
    }

    /**
     * Tells whether the head word of a noun phrase denotes a currency amount.
     *
     * <p>The head word is lower-cased before the check so that "Dollar" and
     * "DOLLARS" are recognised as well, in line with the other head-word filters
     * of this class, which all compare case-insensitively.
     *
     * @param np the noun phrase to test
     * @return {@code true} if a head node exists and its value contains {@code "$"}
     *         or, ignoring case, {@code "dollar"}
     */
    private static boolean isDiscardedCurrencyNP(NounPhrase np) {
        if (np.getHeadNode() == null) {
            return false;
        }
        String head = np.getHeadNode().value().toLowerCase(Locale.ROOT);
        return head.contains("$") || head.contains("dollar");
    }

    /**
     * Looks up the polarity of a word in the sentiment word lists of
     * {@code FeatureExtractor}. The negative list is consulted first.
     *
     * @param word the word to look up; lower-cased before the lookup
     * @return {@code NEGATIVE}, {@code POSITIVE}, or {@code NEUTRAL} when the word is
     *         in neither list
     */
    public static int retrieveOriginalSentiment(String word) {
        final String lookupKey = ";" + word.toLowerCase() + ";";
        if (FeatureExtractor.NEGATIVE_WORDS.contains(lookupKey)) {
            return NEGATIVE;
        }
        return FeatureExtractor.POSITIVE_WORDS.contains(lookupKey) ? POSITIVE : NEUTRAL;
    }

    /**
     * Flips a sentiment polarity code.
     *
     * @param sentiment a polarity code
     * @return {@code NEGATIVE} for {@code POSITIVE}, {@code POSITIVE} for
     *         {@code NEGATIVE}, and 0 for any other value
     */
    public static int reverseSentiment(int sentiment) {
        if (sentiment == POSITIVE) {
            return NEGATIVE;
        }
        if (sentiment == NEGATIVE) {
            return POSITIVE;
        }
        return 0;
    }

    /**
     * Attaches externally extracted noun phrases to their review and sentence.
     *
     * <p>For every noun phrase this method
     * <ol>
     * <li>builds a tree node whose children are the token trees of the phrase's CRF
     *     tokens,</li>
     * <li>stores that node and its head terminal (Collins head rules) on the phrase,</li>
     * <li>sets the phrase's begin/end offsets from its first and last token,</li>
     * <li>adds the phrase to its review and sentence and refreshes the sentence's
     *     noun-phrase sentiments.</li>
     * </ol>
     *
     * @param nounPhrases noun phrases to attach; each must have at least one CRF token
     * @param reviews     all reviews, indexed by the review id stored on each phrase
     */
    public static void assignNounPhrases(List<NounPhrase> nounPhrases, List<Review> reviews) {
        CollinsHeadFinder headFinder = new CollinsHeadFinder();
        for (NounPhrase np : nounPhrases) {
            Review review = reviews.get(np.getReviewId());
            Sentence sentence = review.getSentences().get(np.getSentenceId());
            List<CRFToken> crfTokens = np.getCRFTokens();

            // The original also concatenated all token words into a string that was
            // never read; that dead work has been removed.

            //Initiate a NP Tree
            Tree npNode = initNPTree();
            for (CRFToken crfToken : crfTokens) {
                Tree tokenTree = sentence.getTokens().get(crfToken.getIdInSentence()).getTokenTree();
                npNode.addChild(tokenTree);
            }
            np.setNpNode(npNode);
            np.setHeadNode(npNode.headTerminal(headFinder));

            // The phrase spans from the start of its first token to the end of its last.
            int firstTokenId = crfTokens.get(0).getIdInSentence();
            int lastTokenId = crfTokens.get(crfTokens.size() - 1).getIdInSentence();
            np.setOffsetBegin(sentence.getTokens().get(firstTokenId).getOffsetBegin());
            np.setOffsetEnd(sentence.getTokens().get(lastTokenId).getOffsetEnd());

            review.addNounPhrase(np);
            sentence.addNounPhrase(np);
            sentence.setSentimentForNPs();
        }
    }

    /**
     * Initialises noun-phrase sentiments and comparative noun phrases for every
     * sentence of every review held in {@code StanfordUtil.reviews}.
     *
     * <p>If the reviews have not been loaded yet, a message is printed and nothing
     * else happens.
     */
    public static void initSentimentAndComparativesForNPs() {
        if (StanfordUtil.reviews != null) {
            for (Review loadedReview : StanfordUtil.reviews) {
                for (Sentence current : loadedReview.getSentences()) {
                    // Sentiment first, then comparatives - same order for every sentence.
                    current.setSentimentForNPs();
                    current.initComparativeNPs();
                }
            }
        } else {
            System.out.println("StanfordUtil reviews has not been initialized");
        }
    }

    /**
     * Loads {@code dataset.txt} from the current working directory and stores its
     * content via {@link #setDataset(String)}.
     *
     * <p>Lines are joined with {@code "\n"} (each line, including the last, is
     * followed by one). An empty file yields an empty string.
     *
     * <p>Fixes over the previous version: the text no longer starts with the
     * literal {@code "null"} (the accumulator used to start as {@code null}), the
     * reader is closed, and the path no longer contains a Windows-only
     * {@code ".\\"} prefix, so the same file is found on every platform.
     *
     * @throws FileNotFoundException if {@code dataset.txt} does not exist
     * @throws IOException           if reading fails
     */
    public static void readDataset() throws FileNotFoundException, IOException {
        System.out.println("Reading dataset");
        StringBuilder data = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new FileReader(new File("dataset.txt")))) {
            String line;
            while ((line = reader.readLine()) != null) {
                data.append(line).append('\n');
            }
        }
        System.out.println("End of Reading dataset");
        setDataset(data.toString());
    }

    /**
     * Removes every predicted coreference chain that contains no noun phrase of
     * type 0 or 3.
     *
     * @param reviews reviews whose predicted chains are filtered in place
     */
    public static void discardUnneccessaryChains(List<Review> reviews) {
        for (Review review : reviews) {
            for (Iterator<CorefChain> chains = review.getCorefChainsPredicted().iterator(); chains.hasNext();) {
                CorefChain chain = chains.next();

                // A single member of type 0 or 3 is enough to keep the chain.
                boolean keep = false;
                for (int memberId : chain.getChain()) {
                    int memberType = review.getNounPhrases().get(memberId).getType();
                    keep = (memberType == 0 || memberType == 3);
                    if (keep) {
                        break;
                    }
                }

                if (!keep) {
                    chains.remove();
                }
            }
        }
    }

    /**
     * Creates an empty tree node to be used as the root of a noun phrase.
     *
     * <p>The node is obtained by running the Stanford pipeline on the text "Dog",
     * taking the first child of the resulting sentence tree and removing that
     * node's first child. Presumably the node is an NP constituent - TODO confirm
     * against the parser output.
     *
     * @return the emptied node
     */
    private static Tree initNPTree() {
        Annotation seed = new Annotation("Dog");
        StanfordUtil.pipeline.annotate(seed);

        // If several sentences came back, the node of the last one wins.
        Tree shell = null;
        for (CoreMap parsed : seed.get(CoreAnnotations.SentencesAnnotation.class)) {
            Tree sentenceTree = parsed.get(TreeCoreAnnotations.TreeAnnotation.class);
            shell = sentenceTree.children()[0];
        }

        shell.removeChild(0);
        return shell;
    }

    /**
     * Turns the raw text of a noun phrase into a regular expression that finds the
     * phrase inside markup text.
     *
     * <p>Rules, applied character by character:
     * <ul>
     * <li>a whitespace character (as matched by {@code \s}) becomes a space followed
     *     by {@code <*}, so that opening markers in front of the next token are
     *     tolerated;</li>
     * <li>{@code (} becomes {@code [(]<*} for the same reason;</li>
     * <li>{@code $ ? * + )} are wrapped in a character class;</li>
     * <li>{@code . [ ] { } ^ | \} are backslash-escaped. These were previously copied
     *     verbatim, so a phrase containing them either matched the wrong text or
     *     made {@code Pattern.compile} throw;</li>
     * <li>every other character is copied unchanged.</li>
     * </ul>
     *
     * @param sequence raw noun-phrase text
     * @return a regular expression matching {@code sequence} literally, apart from
     *         the optional {@code '<'} markers described above
     */
    private static String specialRegex(String sequence) {
        StringBuilder regex = new StringBuilder(sequence.length() + 16);
        for (int i = 0; i < sequence.length(); ++i) {
            char c = sequence.charAt(i);
            if (c == ' ' || c == '\t' || c == '\n' || c == '\u000B' || c == '\f' || c == '\r') {
                regex.append(" <*");
            } else if (c == '(') {
                regex.append("[(]<*");
            } else if (c == '$' || c == '?' || c == '*' || c == '+' || c == ')') {
                regex.append('[').append(c).append(']');
            } else if (c == '.' || c == '[' || c == ']' || c == '{' || c == '}' || c == '^' || c == '|'
                    || c == '\\') {
                regex.append('\\').append(c);
            } else {
                regex.append(c);
            }
        }
        return regex.toString();
    }

    /**
     * Writes the comma-separated feature row of one noun-phrase pair, followed by a
     * line break.
     *
     * <p>Column order: review id, id of np1, id of np2, the pairwise features, the
     * PMI code, the relative-pronoun feature and finally the coreference label.
     * The PMI code is 10 when np2 has no opinion words; otherwise it is the rank
     * (0-3) of the pair's PMI among the distinct PMI values of np2, or 4 when the
     * PMI is 0 or the rank is 4 or higher.
     *
     * @param np1         the earlier noun phrase of the pair
     * @param np2         the later noun phrase of the pair
     * @param review      the review both noun phrases belong to
     * @param bwtrain     destination of the row
     * @param IdPMIinList position of this pair in the shared raw-PMI list
     * @throws IOException if writing fails
     */
    private static void createTest(NounPhrase np1, NounPhrase np2, Review review, BufferedWriter bwtrain,
            Integer IdPMIinList) throws IOException {
        final String sep = ",";

        // Pair identity.
        bwtrain.write(np1.getReviewId() + sep);
        bwtrain.write(np1.getId() + sep);
        bwtrain.write(np2.getId() + sep);

        // Pairwise features, in fixed column order.
        bwtrain.write(FeatureExtractor.isPronoun(np1).toString() + sep);
        bwtrain.write(FeatureExtractor.isPronoun(np2).toString() + sep);
        bwtrain.write(FeatureExtractor.isDefiniteNP(np2).toString() + sep);
        bwtrain.write(FeatureExtractor.isDemonstrativeNP(np2).toString() + sep);
        bwtrain.write(FeatureExtractor.countDistance(np1, np2) + sep);
        bwtrain.write(FeatureExtractor.numberAgreementExtract(np1, np2) + sep);
        bwtrain.write(FeatureExtractor.isBetween2Extract(review, np1, np2).toString() + sep);
        bwtrain.write(FeatureExtractor.hasBetween2Extract(review, np1, np2).toString() + sep);
        bwtrain.write(FeatureExtractor.comparativeIndicatorExtract(review, np1, np2).toString() + sep);
        bwtrain.write(FeatureExtractor.sentimentConsistencyExtract(np1, np2) + sep);
        bwtrain.write(FeatureExtractor.isBothPropername(np1, np2).toString() + sep);
        //        bwtrain.write(FeatureExtractor.hasProperName(np1, StanfordUtil.reviews.get(np1.getReviewId()).getSentences().get(np1.getSentenceId())).toString() + ",");

        //        bwtrain.write(FeatureExtractor.hasProperName(np2, StanfordUtil.reviews.get(np2.getReviewId()).getSentences().get(np2.getSentenceId())).toString() + ",");
        bwtrain.write(FeatureExtractor.isSubString(np1, np2) + sep);
        bwtrain.write(FeatureExtractor.isHeadMatch(np1, np2) + sep);
        bwtrain.write(FeatureExtractor.isExactMatch(np1, np2) + sep);

        // PMI code.
        int pmiCode;
        if (!checkNPhasOW) {
            pmiCode = 10;
        } else {
            Float rawPmi = listRawPMI.get(IdPMIinList);
            if (rawPmi == 0) {
                pmiCode = 4;
            } else {
                int rank = listAllPMI.indexOf(rawPmi);
                pmiCode = rank < 4 ? rank : 4;
            }
        }
        bwtrain.write(pmiCode + sep);

        bwtrain.write(FeatureExtractor.isRelativePronounNPs(np1, np2) + sep);
        // Last column: the coreference label, without a trailing separator.
        bwtrain.write(FeatureExtractor.isCorefTest(np1, np2).toString());
        bwtrain.newLine();
    }

    /************************************************/
    /***********Cross validation**********************/
    /**
     * Builds the feature instances of every candidate noun-phrase pair of a review
     * and stores them on the review.
     *
     * <p>Opinion words are first assigned to the noun phrases of each sentence. Then
     * noun phrases are visited from last to first; each one (np2) is paired with
     * every noun phrase that precedes it in the review's list (np1). A pair yields
     * an instance only when at least one of the two has type 0 or 3. The shared PMI
     * scratch lists are rebuilt for every np2 before its instances are created.
     *
     * @param review the review to process
     * @throws IOException declared for the instance builder; no I/O is visible in this method
     */
    public static void setInstancesForReviews(Review review) throws IOException {
        //Set Opinion Words for Noun Phrases
        for (Sentence sentence : review.getSentences()) {
            FeatureExtractor.setNPForOPInSentence(sentence);
        }

        final List<NounPhrase> nps = review.getNounPhrases();

        for (int anaphorIdx = nps.size() - 1; anaphorIdx >= 1; anaphorIdx--) {
            checkNPhasOW = true;
            final NounPhrase np2 = nps.get(anaphorIdx);

            // Fresh PMI scratch state for this np2.
            listAllPMI.clear();
            listRawPMI.clear();

            if (!np2.getOpinionWords().isEmpty()) {
                // Raw PMI per eligible pair (in emission order) plus the distinct values.
                for (int antecedentIdx = anaphorIdx - 1; antecedentIdx >= 0; antecedentIdx--) {
                    final NounPhrase np1 = nps.get(antecedentIdx);
                    final int type1 = np1.getType();
                    final int type2 = np2.getType();
                    if (type1 == 0 || type2 == 0 || type1 == 3 || type2 == 3) {
                        final Float pairPmi = FeatureExtractor.PMI(np2, np1);
                        listRawPMI.add(pairPmi);
                        if (!listAllPMI.contains(pairPmi)) {
                            listAllPMI.add(pairPmi);
                        }
                    }
                }
                // Highest PMI first, so an index into this list is a rank.
                Collections.sort(listAllPMI, Collections.reverseOrder());
            } else {
                // No opinion words: the PMI feature is replaced by a fixed code downstream.
                checkNPhasOW = false;
            }

            // Create the instances; pairOrdinal indexes listRawPMI in the order it was filled.
            int pairOrdinal = 0;
            for (int antecedentIdx = anaphorIdx - 1; antecedentIdx >= 0; antecedentIdx--) {
                final NounPhrase np1 = nps.get(antecedentIdx);
                final int type1 = np1.getType();
                final int type2 = np2.getType();
                if (type1 == 0 || type2 == 0 || type1 == 3 || type2 == 3) {
                    createInstance(np1, np2, review, pairOrdinal);
                    pairOrdinal++;
                }
            }
        }
    }

    /**
     * Builds the comma-separated feature instance of one noun-phrase pair and adds
     * it to the review.
     *
     * <p>Column order: review id, id of np1, id of np2, the pairwise features, the
     * PMI code, the relative-pronoun feature and finally the coreference label.
     * The PMI code is 10 when np2 has no opinion words; otherwise it is the rank
     * (0-3) of the pair's PMI among the distinct PMI values of np2, or 4 when the
     * PMI is 0 or the rank is 4 or higher.
     *
     * @param np1         the earlier noun phrase of the pair
     * @param np2         the later noun phrase of the pair
     * @param review      the review that receives the instance
     * @param IdPMIinList position of this pair in the shared raw-PMI list
     * @throws IOException declared by the signature; no I/O is visible in this method
     */
    private static void createInstance(NounPhrase np1, NounPhrase np2, Review review, Integer IdPMIinList)
            throws IOException {
        StringBuilder row = new StringBuilder();

        // Pair identity.
        row.append(np1.getReviewId()).append(',');
        row.append(np1.getId()).append(',');
        row.append(np2.getId()).append(',');

        // Pairwise features, in fixed column order.
        row.append(FeatureExtractor.isPronoun(np1).toString()).append(',');
        row.append(FeatureExtractor.isPronoun(np2).toString()).append(',');
        row.append(FeatureExtractor.isDefiniteNP(np2).toString()).append(',');
        row.append(FeatureExtractor.isDemonstrativeNP(np2).toString()).append(',');
        row.append(FeatureExtractor.countDistance(np1, np2)).append(',');
        row.append(FeatureExtractor.numberAgreementExtract(np1, np2)).append(',');
        row.append(FeatureExtractor.isBetween2Extract(review, np1, np2).toString()).append(',');
        row.append(FeatureExtractor.hasBetween2Extract(review, np1, np2).toString()).append(',');
        row.append(FeatureExtractor.comparativeIndicatorExtract(review, np1, np2).toString()).append(',');
        row.append(FeatureExtractor.sentimentConsistencyExtract(np1, np2)).append(',');
        row.append(FeatureExtractor.isBothPropername(np1, np2).toString()).append(',');
        row.append(FeatureExtractor.isSubString(np1, np2)).append(',');
        row.append(FeatureExtractor.isHeadMatch(np1, np2)).append(',');
        row.append(FeatureExtractor.isExactMatch(np1, np2)).append(',');

        // PMI code.
        int pmiCode;
        if (!checkNPhasOW) {
            pmiCode = 10;
        } else {
            Float rawPmi = listRawPMI.get(IdPMIinList);
            if (rawPmi == 0) {
                pmiCode = 4;
            } else {
                int rank = listAllPMI.indexOf(rawPmi);
                pmiCode = rank < 4 ? rank : 4;
            }
        }
        row.append(pmiCode).append(',');

        row.append(FeatureExtractor.isRelativePronounNPs(np1, np2)).append(',');
        // Last column: the coreference label, without a trailing separator.
        row.append(FeatureExtractor.isCorefTest(np1, np2).toString());

        review.addInstance(row.toString());
    }
}