Example usage for edu.stanford.nlp.trees Tree getLeaves

List of usage examples for edu.stanford.nlp.trees Tree getLeaves

Introduction

On this page you can find example usage for edu.stanford.nlp.trees Tree getLeaves.

Prototype

public <T extends Tree> List<T> getLeaves() 

Document

Gets the leaves of the tree.
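
For orientation, here is a minimal, self-contained sketch of calling getLeaves(). It is not taken from the examples below: the bracketed sentence is made up, and the tree is built with Tree.valueOf so that no parser model is needed.

import edu.stanford.nlp.trees.Tree;

import java.util.List;

public class GetLeavesSketch {
    public static void main(String[] args) {
        // Build a Tree directly from a Penn-bracketed string (no parser model needed).
        Tree tree = Tree.valueOf("(ROOT (S (NP (DT The) (NN cat)) (VP (VBD sat))))");
        List<Tree> leaves = tree.getLeaves(); // one leaf per token: The, cat, sat
        for (Tree leaf : leaves) {
            // A leaf's label is the token text; its parent in the tree is the preterminal (POS tag).
            System.out.println(leaf.label().value() + "/" + leaf.parent(tree).label().value());
        }
    }
}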

Usage

From source file:ConstituencyParse.java

License:Apache License

public int[] constTreeParents(Tree tree) {
    Tree binarized = binarizer.transformTree(tree);
    Tree collapsedUnary = transformer.transformTree(binarized);
    Trees.convertToCoreLabels(collapsedUnary);
    collapsedUnary.indexSpans();
    List<Tree> leaves = collapsedUnary.getLeaves();
    int size = collapsedUnary.size() - leaves.size();
    int[] parents = new int[size];
    HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();

    int idx = leaves.size();
    int leafIdx = 0;
    for (Tree leaf : leaves) {
        Tree cur = leaf.parent(collapsedUnary); // go to preterminal
        int curIdx = leafIdx++;
        boolean done = false;
        while (!done) {
            Tree parent = cur.parent(collapsedUnary);
            if (parent == null) {
                parents[curIdx] = 0;
                break;
            }

            int parentIdx;
            int parentNumber = parent.nodeNumber(collapsedUnary);
            if (!index.containsKey(parentNumber)) {
                parentIdx = idx++;
                index.put(parentNumber, parentIdx);
            } else {
                parentIdx = index.get(parentNumber);
                done = true;
            }

            parents[curIdx] = parentIdx + 1;
            cur = parent;
            curIdx = parentIdx;
        }
    }

    return parents;
}
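
A possible caller for constTreeParents, shown only as a sketch: the no-argument ConstituencyParse constructor and the parser model path are assumptions, and the real class must still initialize the binarizer and transformer fields used above.

// Hypothetical usage; constructor and model path are assumptions.
LexicalizedParser lp = LexicalizedParser
        .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
List<CoreLabel> tokens = Sentence.toCoreLabelList("The", "cat", "sat", "on", "the", "mat", ".");
Tree tree = lp.parseTree(tokens);
int[] parents = new ConstituencyParse().constTreeParents(tree);
// parents[i] is the 1-based index of the parent of node i + 1, where nodes
// 1..numLeaves are the tokens (the leaves from getLeaves(), in order) and 0 marks the root.
System.out.println(Arrays.toString(parents));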

From source file:KleinBilingualParser.java

public static void main(String[] args) {
    boolean trainF = false;
    boolean trainE = false;
    boolean bitrainE = false;
    boolean bitrainF = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    String treebankPathF = null;
    Treebank testTreebankF = null;
    Treebank tuneTreebankF = null;
    String testPathF = null;
    FileFilter testFilterF = null;
    String treebankPathE = null;
    Treebank testTreebankE = null;
    Treebank tuneTreebankE = null;
    String testPathE = null;
    FileFilter testFilterE = null;
    String tunePath = null;
    FileFilter tuneFilter = null;
    FileFilter trainFilterF = null;
    FileFilter trainFilterE = null;
    String secondaryTreebankPath = null;
    double secondaryTreebankWeight = 1.0;
    FileFilter secondaryTrainFilter = null;

    String trainAlignFile = null;
    String testAlignFile = null;
    String bitrainPathE = null;
    FileFilter bitrainFilterE = null;
    String bitrainPathF = null;
    FileFilter bitrainFilterF = null;
    Treebank bitrainTreebankF = null;
    Treebank bitrainTreebankE = null;

    // variables needed to process the files to be parsed
    TokenizerFactory<? extends HasWord> tokenizerFactory = null;
    String tokenizerOptions = null;
    String tokenizerFactoryClass = null;
    String tokenizerMethod = null;
    boolean tokenized = false; // whether or not the input file has already been tokenized
    Function<List<HasWord>, List<HasWord>> escaper = null;
    String tagDelimiter = null;
    String sentenceDelimiter = null;
    String elementDelimiter = null;
    int argIndex = 0;
    if (args.length < 1) {
        log.info(
                "Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
        return;
    }

    Options fOp = new Options();
    Options eOp = new Options();
    List<String> optionArgs = new ArrayList<>();
    String encodingF = null;

    // while loop through option arguments
    while (argIndex < args.length && !args[argIndex].equals("--") && args[argIndex].charAt(0) == '-') {
        if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
            trainF = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-train");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebankPathF = treebankDescription.first();
            trainFilterF = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-bitrain")
                || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) {
            bitrainF = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-bitrain");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            bitrainPathF = treebankDescription.first();
            bitrainFilterF = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
            try {
                fOp.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
            } catch (ClassNotFoundException e) {
                log.info("Class not found: " + args[argIndex + 1]);
                throw new RuntimeException(e);
            } catch (InstantiationException e) {
                log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
                throw new RuntimeException(e);
            } catch (IllegalAccessException e) {
                log.info("Illegal access" + e);
                throw new RuntimeException(e);
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
            // sets encoding for TreebankLangParserParams
            // redone later to override any serialized parser one read in
            encodingF = args[argIndex + 1];
            fOp.tlpParams.setInputEncoding(encodingF);
            fOp.tlpParams.setOutputEncoding(encodingF);
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-treebank")
                || args[argIndex].equalsIgnoreCase("-testTreebank")
                || args[argIndex].equalsIgnoreCase("-test")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testPathF = treebankDescription.first();
            testFilterF = treebankDescription.second();
        } else {
            int oldIndex = argIndex;
            argIndex = fOp.setOptionOrWarn(args, argIndex);
            optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
        }

        System.out.println(argIndex + " " + args.length);
    } // end while loop through arguments for french

    argIndex++;//go to english arguments

    while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
        if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
            trainE = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-train");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebankPathE = treebankDescription.first();
            trainFilterE = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-bitrain")
                || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) {
            bitrainE = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-bitrain");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            bitrainPathE = treebankDescription.first();
            bitrainFilterE = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-treebank")
                || args[argIndex].equalsIgnoreCase("-testTreebank")
                || args[argIndex].equalsIgnoreCase("-test")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testPathE = treebankDescription.first();
            testFilterE = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-trainAlignFile")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-trainAlignFile");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            trainAlignFile = treebankDescription.first();
        } else if (args[argIndex].equalsIgnoreCase("-testAlignFile")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-testAlignFile");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testAlignFile = treebankDescription.first();
        } else {
            int oldIndex = argIndex;
            argIndex = eOp.setOptionOrWarn(args, argIndex);
            optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
        }
    } // end while loop through arguments for english

    //    if (!train && fOp.testOptions.verbose) {
    //      StringUtils.logInvocationString(log, args);
    //    }
    LexicalizedParser lpF; // always initialized in next if-then-else block
    LexicalizedParser lpE;
    //TRAIN A PARSER
    // so we train a parser using the treebank
    GrammarCompactor compactorF = null;
    GrammarCompactor compactorE = null;
    if (fOp.trainOptions.compactGrammar() == 3) {
        compactorF = new ExactGrammarCompactor(fOp, false, false);
    }
    if (eOp.trainOptions.compactGrammar() == 3) {
        compactorE = new ExactGrammarCompactor(eOp, false, false);
    }

    Treebank trainTreebankF = makeTreebank(treebankPathF, fOp, trainFilterF);
    Treebank trainTreebankE = makeTreebank(treebankPathE, eOp, trainFilterE);

    fOp.testOptions.quietEvaluation = true;
    eOp.testOptions.quietEvaluation = true;

    lpF = getParserFromTreebank(trainTreebankF, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF,
            null);
    lpE = getParserFromTreebank(trainTreebankE, null, secondaryTreebankWeight, compactorE, eOp, tuneTreebankE,
            null);

    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING
    // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER
    if (encodingF != null) {
        fOp.tlpParams.setInputEncoding(encodingF);
        fOp.tlpParams.setOutputEncoding(encodingF);
    }

    if (bitrainFilterF != null || bitrainPathF != null) {
        if (bitrainPathF == null) {
            //?
            if (treebankPathF == null) {
                throw new RuntimeException("No bitrain treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathF + '\"');
                bitrainPathF = treebankPathF;
            }
        }
        bitrainTreebankF = fOp.tlpParams.testMemoryTreebank();
        bitrainTreebankF.loadPath(bitrainPathF, bitrainFilterF);
    }

    if (bitrainFilterE != null || bitrainPathE != null) {
        if (bitrainPathE == null) {
            if (treebankPathE == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathE + '\"');
                bitrainPathE = treebankPathE;
            }
        }
        bitrainTreebankE = eOp.tlpParams.testMemoryTreebank();
        bitrainTreebankE.loadPath(bitrainPathE, bitrainFilterE);
    }

    if (encodingF != null) {
        fOp.tlpParams.setInputEncoding(encodingF);
        fOp.tlpParams.setOutputEncoding(encodingF);
    }

    if (testFilterF != null || testPathF != null) {
        if (testPathF == null) {
            if (treebankPathF == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathF + '\"');
                testPathF = treebankPathF;
            }
        }
        testTreebankF = fOp.tlpParams.testMemoryTreebank();
        testTreebankF.loadPath(testPathF, testFilterF);
    }

    fOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(fOp.tlpParams.sisterSplitters()));

    if (testFilterE != null || testPathE != null) {
        if (testPathE == null) {
            if (treebankPathE == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathE + '\"');
                testPathE = treebankPathE;
            }
        }
        testTreebankE = eOp.tlpParams.testMemoryTreebank();
        testTreebankE.loadPath(testPathE, testFilterE);
    }

    eOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(eOp.tlpParams.sisterSplitters()));

    //PARALLEL ALIGNMENT FEATURE CALCULATION, CALCULATION OF 'A' MATRIX

    double[] weights = new double[8];
    double diff;
    weights[0] = 0.01;
    weights[1] = -0.002;
    weights[2] = 0.002;
    weights[3] = 0.002;
    weights[4] = 0.002;
    weights[5] = 0.002;
    weights[6] = -0.002;
    weights[7] = -0.002;

    ArrayList<HashMap<Integer, ArrayList<Integer>>> bitrainAlignments = null;
    ArrayList<HashMap<Integer, ArrayList<Integer>>> testAlignments = null;
    //String alignFile="../../berkeleyaligner/output/test.align";
    try {

        AlignmentProcessor trainAP = new AlignmentProcessor(trainAlignFile);
        bitrainAlignments = trainAP.createAlignments();

        AlignmentProcessor testAP = new AlignmentProcessor(testAlignFile);
        testAlignments = testAP.createAlignments();

    } catch (FileNotFoundException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    int kE = 10;
    int kF = 10;
    int numFeatures = 8;
    int numBigSentences = 0;
    do {
        diff = 0.0;
        Iterator<Tree> eTrees = bitrainTreebankE.iterator();
        Iterator<Tree> fTrees = bitrainTreebankF.iterator();
        Iterator<HashMap<Integer, ArrayList<Integer>>> alignIterator = bitrainAlignments.iterator();
        numBigSentences = 0;

        //features are used in the order they are defined
        double A[][][][] = new double[bitrainTreebankE.size()][numFeatures][kE][kF];
        int ePsGold[] = new int[bitrainTreebankE.size()];
        int fPsGold[] = new int[bitrainTreebankF.size()];

        int i = 0;
        while (eTrees.hasNext() && fTrees.hasNext() && alignIterator.hasNext()) {
            HashMap<Integer, ArrayList<Integer>> alignMap = alignIterator.next();
            Tree fTree = fTrees.next();
            Tree eTree = eTrees.next();

            if (fTree.getLeaves().size() > 70 || eTree.getLeaves().size() > 70) {
                //System.out.println("Too big : " + i);
                numBigSentences++;

                fPsGold[i] = 3;
                ePsGold[i] = 3;

                i++;
                continue;
            }

            List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords());
            List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords());

            LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery();
            LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery();

            lpqE.parse(sentenceE);
            lpqF.parse(sentenceF);

            List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF);
            List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE);

            fPsGold[i] = 3;
            ePsGold[i] = 3;

            int j = 0;
            int k = 0;

            for (ScoredObject<Tree> eScoredObj : kBestE) {
                k = 0;
                for (ScoredObject<Tree> fScoredObj : kBestF) {
                    eScoredObj.object().setSpans();
                    fScoredObj.object().setSpans();
                    HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(),
                            fScoredObj.object(), weights, alignMap);

                    // had to reduce likelihood scores by a factor of 1000 to keep the optimizer working
                    A[i][0][j][k] = eScoredObj.score() / 1000;
                    A[i][1][j][k] = fScoredObj.score() / 1000;

                    for (Map.Entry entry : alignment.entrySet()) {
                        Tree nodeF = (Tree) entry.getKey();
                        Tree nodeE = (Tree) entry.getValue();

                        A[i][2][j][k] += spanDiff(nodeF, nodeE);
                        A[i][3][j][k] += numChildren(nodeF, nodeE);
                        A[i][4][j][k] += insideBoth(nodeF, nodeE, alignMap);
                        A[i][5][j][k] += insideSrcOutsideTgt(nodeF, nodeE, alignMap);
                        A[i][6][j][k] += insideTgtOutsideSrc(nodeF, nodeE, alignMap);
                        A[i][7][j][k] += bias(nodeF, nodeE);
                    }

                    k++;
                }
                j++;
            }
            //System.out.println("Sentence " + i);
            i++;
        }

        ///////////////////////
        //
        //  MALLET optimizer
        //
        ///////////////////////
        System.out.println();
        System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*");
        System.out.println();
        System.out.println("Beginning convex optimization...");
        System.out.println();
        System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*");
        System.out.println();

        OptimizerExample optimizable = new OptimizerExample(weights, A, ePsGold, fPsGold);
        Optimizer optimizer = new LimitedMemoryBFGS(optimizable);

        boolean converged = false;

        try {
            converged = optimizer.optimize();
        } catch (IllegalArgumentException e) {
            // This exception may be thrown if L-BFGS
            //  cannot step in the current direction.
            // This condition does not necessarily mean that
            //  the optimizer has failed, but it doesn't want
            //  to claim to have succeeded...
        } catch (cc.mallet.optimize.OptimizationException e) {
            System.out.println(e.getMessage());
        }

        for (int x = 0; x < weights.length; x++) {
            diff += (optimizable.getParameter(x) - weights[x]) * (optimizable.getParameter(x) - weights[x]);
            weights[x] = optimizable.getParameter(x);
            System.out.print(weights[x] + ", ");
        }
        System.out.println();
        diff /= weights.length;

        System.out.println("Current difference: " + diff);
    } while (diff > 0.0005);

    //TESTING BILINGUAL PARSER

    Treebank bilingTestTreebankF = testTreebankF;
    Treebank bilingTestTreebankE = testTreebankE;
    Iterator<Tree> eTreesBling = testTreebankE.iterator();
    Iterator<Tree> fTreesBling = testTreebankF.iterator();

    boolean runningAveragesF = Boolean.parseBoolean(fOp.testOptions.evals.getProperty("runningAverages"));
    boolean runningAveragesE = Boolean.parseBoolean(eOp.testOptions.evals.getProperty("runningAverages"));

    AbstractEval pcfgLBf = new Evalb("pcfg LP/LR", runningAveragesF);
    AbstractEval factLBf = new Evalb("factor LP/LR", runningAveragesF);

    AbstractEval pcfgLBe = new Evalb("pcfg LP/LR", runningAveragesE);
    AbstractEval factLBe = new Evalb("factor LP/LR", runningAveragesE);

    int i = 0;
    Iterator<HashMap<Integer, ArrayList<Integer>>> alignIteratorTEST = testAlignments.iterator();
    while (eTreesBling.hasNext() && fTreesBling.hasNext() && alignIteratorTEST.hasNext()) {
        HashMap<Integer, ArrayList<Integer>> alignMap = alignIteratorTEST.next();

        Tree fTree = fTreesBling.next();
        Tree eTree = eTreesBling.next();

        List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords());
        List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords());

        LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery();
        LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery();

        lpqE.parse(sentenceE);
        lpqF.parse(sentenceF);

        List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF);
        List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE);

        int j = 0;
        int k = 0;

        double maxScore = -Double.MAX_VALUE;
        Tree bestFtree = null;
        Tree bestEtree = null;

        for (ScoredObject<Tree> eScoredObj : kBestE) {
            k = 0;
            for (ScoredObject<Tree> fScoredObj : kBestF) {
                eScoredObj.object().setSpans();
                fScoredObj.object().setSpans();
                HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(),
                        weights, alignMap);

                double currentScore = 0.0;

                for (Map.Entry entry : alignment.entrySet()) {
                    Tree nodeF = (Tree) entry.getKey();
                    Tree nodeE = (Tree) entry.getValue();

                    currentScore += weights[0] * eScoredObj.score() / 1000;
                    currentScore += weights[1] * fScoredObj.score() / 1000;
                    currentScore += weights[2] * spanDiff(nodeF, nodeE);
                    currentScore += weights[3] * numChildren(nodeF, nodeE);
                    currentScore += weights[4] * insideBoth(nodeF, nodeE, alignMap);
                    currentScore += weights[5] * insideSrcOutsideTgt(nodeF, nodeE, alignMap);
                    currentScore += weights[6] * insideTgtOutsideSrc(nodeF, nodeE, alignMap);
                    currentScore += weights[7] * bias(nodeF, nodeE);
                }

                if (currentScore > maxScore) {
                    maxScore = currentScore;

                    bestFtree = fScoredObj.object();
                    bestEtree = eScoredObj.object();
                }

                k++;
            }
            j++;
        }
        i++;

        pcfgLBe.evaluate(bestEtree, eTree);
        factLBe.evaluate(bestEtree, eTree);

        pcfgLBf.evaluate(bestFtree, fTree);
        factLBf.evaluate(bestFtree, fTree);
    }

    System.out.println("------------------------");
    System.out.println("    English Results     ");
    System.out.println("------------------------");
    System.out.println("PCFG labeled f1: " + pcfgLBe.getEvalbF1Percent());
    System.out.println("Factored labeled f1: " + factLBe.getEvalbF1Percent());

    System.out.println("------------------------");
    System.out.println("     French Results     ");
    System.out.println("------------------------");
    System.out.println("PCFG labeled f1: " + pcfgLBf.getEvalbF1Percent());
    System.out.println("Factored labeled f1: " + factLBf.getEvalbF1Percent());
    System.out.println("------------------------");
    System.out.println("Number of sentences too big: " + numBigSentences);
}

From source file:KleinBilingualParser.java

private static double spanDiff(Tree nodeF, Tree nodeE) {
    return ((double) Math.abs(nodeF.getLeaves().size() - nodeE.getLeaves().size())) / 100;
}

From source file:KleinBilingualParser.java

private static HashMap<Tree, Tree> getHungarianAlignment(Tree eParseTree, Tree fParseTree, double[] weights,
        HashMap<Integer, ArrayList<Integer>> alignMap) {
    // remember to ignore the top two weights because they are monolingual features
    int numFrenchNodes = fParseTree.size() - fParseTree.getLeaves().size();
    int numEnglishNodes = eParseTree.size() - eParseTree.getLeaves().size();

    double[][] costMatrix = new double[numFrenchNodes][numEnglishNodes];

    int i, j;
    i = 0;

    for (Tree fSubTree : fParseTree) {
        if (!fSubTree.isLeaf()) {
            j = 0;
            for (Tree eSubTree : eParseTree) {
                if (!eSubTree.isLeaf()) {
                    //IF IT GETS TOO SLOW DON'T COMPUTE WORD ALIGNMENT FEATURES FOR LARGE SENTENCES
                    costMatrix[i][j] = weights[2] * spanDiff(fSubTree, eSubTree)
                            + weights[3] * numChildren(fSubTree, eSubTree)
                            + weights[7] * bias(fSubTree, eSubTree);
                    if (numFrenchNodes < 50 && numEnglishNodes < 50) {
                        costMatrix[i][j] += weights[4] * insideBoth(fSubTree, eSubTree, alignMap)
                                + weights[5] * insideSrcOutsideTgt(fSubTree, eSubTree, alignMap)
                                + weights[6] * insideTgtOutsideSrc(fSubTree, eSubTree, alignMap);
                    }
                    costMatrix[i][j] = 0 - costMatrix[i][j];
                    j++;
                }
            }
            i++;
        }
    }

    HungarianAlgorithm hungAlgSolver = new HungarianAlgorithm(costMatrix);
    int[] assignments = hungAlgSolver.execute();

    HashMap<Tree, Tree> alignment = new HashMap<>();

    i = 0;
    for (Tree fSubTree : fParseTree) {
        if (!fSubTree.isLeaf()) {
            j = 0;
            for (Tree eSubTree : eParseTree) {
                if (!eSubTree.isLeaf()) {
                    if (j == assignments[i]) {
                        alignment.put(fSubTree, eSubTree);
                    }
                    j++;
                }
            }
            i++;
        }
    }

    return alignment;
}
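
The line costMatrix[i][j] = 0 - costMatrix[i][j] negates the scores because HungarianAlgorithm minimizes total assignment cost, so maximizing the weighted feature score is turned into minimizing its negation. A toy sketch of the same idea, reusing the HungarianAlgorithm API seen above (the 2x2 scores are made up):

// Toy illustration of the sign flip; the score values are assumptions.
double[][] scores = {
        { 0.9, 0.1 },
        { 0.2, 0.8 },
};
double[][] costMatrix = new double[scores.length][scores[0].length];
for (int r = 0; r < scores.length; r++) {
    for (int c = 0; c < scores[0].length; c++) {
        costMatrix[r][c] = -scores[r][c]; // highest score becomes lowest cost
    }
}
HungarianAlgorithm solver = new HungarianAlgorithm(costMatrix);
int[] assignments = solver.execute(); // row 0 -> column 0, row 1 -> column 1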

From source file:EddyRoseDomainAdaptation.java

public static void main(String[] args) {
    boolean trainF = false;
    boolean trainE = false;
    boolean bitrainE = false;
    boolean bitrainF = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    String treebankPathF = null;
    Treebank testTreebankF = null;
    Treebank seqTestTreebank = null;
    Treebank tuneTreebankF = null;
    String testPathF = null;
    FileFilter testFilterF = null;
    String treebankPathE = null;
    Treebank testTreebankE = null;
    Treebank tuneTreebankE = null;
    String testPathE = null;
    FileFilter testFilterE = null;
    String seqTestPath = null;
    FileFilter seqTestFilter = null;
    String tunePath = null;
    FileFilter tuneFilter = null;
    FileFilter trainFilterF = null;
    FileFilter trainFilterE = null;
    String secondaryTreebankPath = null;
    double secondaryTreebankWeight = 1.0;
    FileFilter secondaryTrainFilter = null;

    String trainAlignFile = null;
    String testAlignFile = null;
    String bitrainPathE = null;
    FileFilter bitrainFilterE = null;
    String bitrainPathF = null;
    FileFilter bitrainFilterF = null;
    Treebank bitrainTreebankF = null;
    Treebank bitrainTreebankE = null;

    // variables needed to process the files to be parsed
    TokenizerFactory<? extends HasWord> tokenizerFactory = null;
    String tokenizerOptions = null;
    String tokenizerFactoryClass = null;
    String tokenizerMethod = null;
    boolean tokenized = false; // whether or not the input file has already been tokenized
    Function<List<HasWord>, List<HasWord>> escaper = null;
    String tagDelimiter = null;
    String sentenceDelimiter = null;
    String elementDelimiter = null;
    int argIndex = 0;
    if (args.length < 1) {
        log.info(
                "Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
        return;
    }

    Options fOp = new Options();
    Options eOp = new Options();
    List<String> optionArgs = new ArrayList<>();
    String encodingF = null;

    // while loop through option arguments
    while (argIndex < args.length && !args[argIndex].equals("--") && args[argIndex].charAt(0) == '-') {
        if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
            trainF = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-train");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebankPathF = treebankDescription.first();
            trainFilterF = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-bitrain")
                || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) {
            bitrainF = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-bitrain");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            bitrainPathF = treebankDescription.first();
            bitrainFilterF = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
            try {
                fOp.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
            } catch (ClassNotFoundException e) {
                log.info("Class not found: " + args[argIndex + 1]);
                throw new RuntimeException(e);
            } catch (InstantiationException e) {
                log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
                throw new RuntimeException(e);
            } catch (IllegalAccessException e) {
                log.info("Illegal access" + e);
                throw new RuntimeException(e);
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
            // sets encoding for TreebankLangParserParams
            // redone later to override any serialized parser one read in
            encodingF = args[argIndex + 1];
            fOp.tlpParams.setInputEncoding(encodingF);
            fOp.tlpParams.setOutputEncoding(encodingF);
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-treebank")
                || args[argIndex].equalsIgnoreCase("-testTreebank")
                || args[argIndex].equalsIgnoreCase("-test")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testPathF = treebankDescription.first();
            testFilterF = treebankDescription.second();
        } else {
            int oldIndex = argIndex;
            argIndex = fOp.setOptionOrWarn(args, argIndex);
            optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
        }

        System.out.println(argIndex + " " + args.length);
    } // end while loop through arguments for french

    argIndex++;//go to english arguments

    while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
        if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
            trainE = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-train");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebankPathE = treebankDescription.first();
            trainFilterE = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-bitrain")
                || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) {
            bitrainE = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-bitrain");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            bitrainPathE = treebankDescription.first();
            bitrainFilterE = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-treebank")
                || args[argIndex].equalsIgnoreCase("-testTreebank")
                || args[argIndex].equalsIgnoreCase("-test")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testPathE = treebankDescription.first();
            testFilterE = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-seqtest")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            seqTestPath = treebankDescription.first();
            seqTestFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-trainAlignFile")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-trainAlignFile");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            trainAlignFile = treebankDescription.first();
        } else if (args[argIndex].equalsIgnoreCase("-testAlignFile")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-testAlignFile");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testAlignFile = treebankDescription.first();
        } else {
            int oldIndex = argIndex;
            argIndex = eOp.setOptionOrWarn(args, argIndex);
            optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
        }
    } // end while loop through arguments for english

    //    if (!train && fOp.testOptions.verbose) {
    //      StringUtils.logInvocationString(log, args);
    //    }

    LexicalizedParser lpF; // always initialized in next if-then-else block
    LexicalizedParser lpE;
    //TRAIN A PARSER
    // so we train a parser using the treebank

    GrammarCompactor compactorF = null;
    if (fOp.trainOptions.compactGrammar() == 3) {
        compactorF = new ExactGrammarCompactor(fOp, false, false);
    }
    Treebank trainTreebankF = makeTreebank(treebankPathF, fOp, trainFilterF);
    fOp.testOptions.quietEvaluation = true;

    GrammarCompactor compactorE = null;
    if (eOp.trainOptions.compactGrammar() == 3) {
        compactorE = new ExactGrammarCompactor(eOp, false, false);
    }

    Treebank trainTreebankE = makeTreebank(treebankPathE, eOp, trainFilterE);

    eOp.testOptions.quietEvaluation = true;

    lpF = getParserFromTreebank(trainTreebankF, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF,
            null);
    lpE = getParserFromTreebank(trainTreebankE, null, secondaryTreebankWeight, compactorE, eOp, tuneTreebankE,
            null);

    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING
    // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER
    if (encodingF != null) {
        fOp.tlpParams.setInputEncoding(encodingF);
        fOp.tlpParams.setOutputEncoding(encodingF);
    }

    if (bitrainFilterF != null || bitrainPathF != null) {
        if (bitrainPathF == null) {
            //?
            if (treebankPathF == null) {
                throw new RuntimeException("No bitrain treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathF + '\"');
                bitrainPathF = treebankPathF;
            }
        }
        bitrainTreebankF = fOp.tlpParams.testMemoryTreebank();
        bitrainTreebankF.loadPath(bitrainPathF, bitrainFilterF);
    }

    if (bitrainFilterE != null || bitrainPathE != null) {
        if (bitrainPathE == null) {
            if (treebankPathE == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathE + '\"');
                bitrainPathE = treebankPathE;
            }
        }
        bitrainTreebankE = eOp.tlpParams.testMemoryTreebank();
        bitrainTreebankE.loadPath(bitrainPathE, bitrainFilterE);
    }

    if (encodingF != null) {
        fOp.tlpParams.setInputEncoding(encodingF);
        fOp.tlpParams.setOutputEncoding(encodingF);
    }

    if (testFilterF != null || testPathF != null) {
        if (testPathF == null) {
            if (treebankPathF == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathF + '\"');
                testPathF = treebankPathF;
            }
        }
        testTreebankF = fOp.tlpParams.testMemoryTreebank();
        testTreebankF.loadPath(testPathF, testFilterF);
    }

    // generate Sequoia treebank
    seqTestTreebank = fOp.tlpParams.testMemoryTreebank();
    seqTestTreebank.loadPath(seqTestPath, seqTestFilter);

    fOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(fOp.tlpParams.sisterSplitters()));

    if (testFilterE != null || testPathE != null) {
        if (testPathE == null) {
            if (treebankPathE == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathE + '\"');
                testPathE = treebankPathE;
            }
        }
        testTreebankE = eOp.tlpParams.testMemoryTreebank();
        testTreebankE.loadPath(testPathE, testFilterE);
    }

    eOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(eOp.tlpParams.sisterSplitters()));

    //////////////////////
    // 
    // Self-Training
    //
    //////////////////////

    MemoryTreebank selfTrainInitTreebank = new MemoryTreebank();
    MemoryTreebank selfTrainFinalTreebank = new MemoryTreebank();

    selfTrainInitTreebank.addAll(trainTreebankF);
    selfTrainFinalTreebank.addAll(trainTreebankF);

    LexicalizedParser fSelfTrainInit = getParserFromTreebank(selfTrainInitTreebank, null,
            secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);

    int z = 0;
    boolean runningAveragesF = Boolean.parseBoolean(fOp.testOptions.evals.getProperty("runningAverages"));
    AbstractEval factLBf = new Evalb("factor LP/LR", runningAveragesF);

    for (Tree goldTree : testTreebankF) {
        List<? extends HasWord> sentence = Sentence.toCoreLabelList(goldTree.yieldWords());
        Tree guessTree = fSelfTrainInit.parseTree(sentence);
        selfTrainFinalTreebank.add(guessTree);
        factLBf.evaluate(guessTree, goldTree);
        System.out.println("Self-training : " + (++z));
    }

    LexicalizedParser fSelfTrainFinal = getParserFromTreebank(selfTrainFinalTreebank, null,
            secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);
    EvaluateTreebank evaluatorH = new EvaluateTreebank(fSelfTrainFinal);
    double scoreF1 = evaluatorH.testOnTreebank(seqTestTreebank);

    System.out.println("------------------------");
    System.out.println("    Self Train Results  ");
    System.out.println("------------------------");
    System.out.println("Test set F1: " + scoreF1);
    System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent());

    //////////////////////
    //////////////////////

    //PARALLEL ALIGNMENT FEATURE CALCULATION, CALCULATION OF 'A' MATRIX

    double[] weights = new double[8];
    double diff;
    weights[0] = 0.01;
    weights[1] = -0.002;
    weights[2] = 0.002;
    weights[3] = 0.002;
    weights[4] = 0.002;
    weights[5] = 0.002;
    weights[6] = -0.002;
    weights[7] = -0.002;

    ArrayList<HashMap<Integer, ArrayList<Integer>>> bitrainAlignments = null;
    ArrayList<HashMap<Integer, ArrayList<Integer>>> testAlignments = null;
    //String alignFile="../../berkeleyaligner/output/test.align";
    try {

        AlignmentProcessor trainAP = new AlignmentProcessor(trainAlignFile);
        bitrainAlignments = trainAP.createAlignments();

        AlignmentProcessor testAP = new AlignmentProcessor(testAlignFile);
        testAlignments = testAP.createAlignments();

    } catch (FileNotFoundException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    int kE = 10;
    int kF = 10;
    int numFeatures = 8;
    int numBigSentences = 0;
    do {
        diff = 0.0;
        Iterator<Tree> eTrees = bitrainTreebankE.iterator();
        Iterator<Tree> fTrees = bitrainTreebankF.iterator();
        Iterator<HashMap<Integer, ArrayList<Integer>>> alignIterator = bitrainAlignments.iterator();
        numBigSentences = 0;

        //features are used in the order they are defined
        double A[][][][] = new double[bitrainTreebankE.size()][numFeatures][kE][kF];
        int ePsGold[] = new int[bitrainTreebankE.size()];
        int fPsGold[] = new int[bitrainTreebankF.size()];

        int i = 0;
        while (eTrees.hasNext() && fTrees.hasNext() && alignIterator.hasNext()) {
            HashMap<Integer, ArrayList<Integer>> alignMap = alignIterator.next();
            Tree fTree = fTrees.next();
            Tree eTree = eTrees.next();

            if (fTree.getLeaves().size() > 70 || eTree.getLeaves().size() > 70) {
                //System.out.println("Too big : " + i);
                numBigSentences++;

                fPsGold[i] = 3;
                ePsGold[i] = 3;

                i++;
                continue;
            }

            List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords());
            List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords());

            LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery();
            LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery();

            lpqE.parse(sentenceE);
            lpqF.parse(sentenceF);

            List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF);
            List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE);

            fPsGold[i] = 3;
            ePsGold[i] = 3;

            int j = 0;
            int k = 0;

            for (ScoredObject<Tree> eScoredObj : kBestE) {
                k = 0;
                for (ScoredObject<Tree> fScoredObj : kBestF) {
                    eScoredObj.object().setSpans();
                    fScoredObj.object().setSpans();
                    HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(),
                            fScoredObj.object(), weights, alignMap);

                    // had to reduce likelihood scores by a factor of 1000 to keep the optimizer working
                    A[i][0][j][k] = eScoredObj.score() / 1000;
                    A[i][1][j][k] = fScoredObj.score() / 1000;

                    for (Map.Entry entry : alignment.entrySet()) {
                        Tree nodeF = (Tree) entry.getKey();
                        Tree nodeE = (Tree) entry.getValue();

                        A[i][2][j][k] += spanDiff(nodeF, nodeE);
                        A[i][3][j][k] += numChildren(nodeF, nodeE);
                        A[i][4][j][k] += insideBoth(nodeF, nodeE, alignMap);
                        A[i][5][j][k] += insideSrcOutsideTgt(nodeF, nodeE, alignMap);
                        A[i][6][j][k] += insideTgtOutsideSrc(nodeF, nodeE, alignMap);
                        A[i][7][j][k] += bias(nodeF, nodeE);
                    }

                    k++;
                }
                j++;
            }
            //System.out.println("Sentence " + i);
            i++;
        }

        ///////////////////////
        //
        //  MALLET optimizer
        //
        ///////////////////////
        System.out.println();
        System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*");
        System.out.println();
        System.out.println("Beginning convex optimization...");
        System.out.println();
        System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*");
        System.out.println();

        OptimizerExample optimizable = new OptimizerExample(weights, A, ePsGold, fPsGold);
        Optimizer optimizer = new LimitedMemoryBFGS(optimizable);

        boolean converged = false;

        try {
            converged = optimizer.optimize();
        } catch (IllegalArgumentException e) {
            // This exception may be thrown if L-BFGS
            //  cannot step in the current direction.
            // This condition does not necessarily mean that
            //  the optimizer has failed, but it doesn't want
            //  to claim to have succeeded...
        } catch (cc.mallet.optimize.OptimizationException e) {
            System.out.println(e.getMessage());
        }

        for (int x = 0; x < weights.length; x++) {
            diff += (optimizable.getParameter(x) - weights[x]) * (optimizable.getParameter(x) - weights[x]);
            weights[x] = optimizable.getParameter(x);
            System.out.print(weights[x] + ", ");
        }
        System.out.println();
        diff /= weights.length;

        System.out.println("Current difference: " + diff);
    } while (diff > 0.0005);

    //GENERATE TRAINING DATA USING KLEIN RERANKER
    //assumes the 'test' data from KleinBilingualParser.java is the unannotated data
    //that the reranker has to annotate.

    factLBf = new Evalb("factor LP/LR", runningAveragesF);
    MemoryTreebank eddyRoseFullTrainTreebank = new MemoryTreebank();
    eddyRoseFullTrainTreebank.addAll(trainTreebankF);
    eddyRoseFullTrainTreebank.addAll(bitrainTreebankF);

    Treebank unannotTreebankF = testTreebankF;
    Treebank annotTreebankE = testTreebankE;
    Iterator<Tree> eTreesBling = unannotTreebankF.iterator();
    Iterator<Tree> fTreesBling = annotTreebankE.iterator();

    int i = 0;
    Iterator<HashMap<Integer, ArrayList<Integer>>> alignIteratorTEST = testAlignments.iterator();
    while (eTreesBling.hasNext() && fTreesBling.hasNext() && alignIteratorTEST.hasNext()) {
        HashMap<Integer, ArrayList<Integer>> alignMap = alignIteratorTEST.next();

        Tree fTree = fTreesBling.next();
        Tree eTree = eTreesBling.next();

        List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords());
        LexicalizedParserQuery lpqF = (LexicalizedParserQuery) fSelfTrainFinal.parserQuery();
        lpqF.parse(sentenceF);
        List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF);

        int j = 0;
        int k = 0;

        double maxScore = -Double.MAX_VALUE;
        Tree bestFtree = null;

        for (ScoredObject<Tree> fScoredObj : kBestF) {
            eTree.setSpans();
            fScoredObj.object().setSpans();
            HashMap<Tree, Tree> alignment = getHungarianAlignment(eTree, fScoredObj.object(), weights,
                    alignMap);

            double currentScore = 0.0;

            for (Map.Entry entry : alignment.entrySet()) {
                Tree nodeF = (Tree) entry.getKey();
                Tree nodeE = (Tree) entry.getValue();

                currentScore += weights[0] * 0.0;//because gold standard tree is assumed to have probability 1
                currentScore += weights[1] * fScoredObj.score() / 1000;
                currentScore += weights[2] * spanDiff(nodeF, nodeE);
                currentScore += weights[3] * numChildren(nodeF, nodeE);
                currentScore += weights[4] * insideBoth(nodeF, nodeE, alignMap);
                currentScore += weights[5] * insideSrcOutsideTgt(nodeF, nodeE, alignMap);
                currentScore += weights[6] * insideTgtOutsideSrc(nodeF, nodeE, alignMap);
                currentScore += weights[7] * bias(nodeF, nodeE);
            }

            if (currentScore > maxScore) {
                maxScore = currentScore;

                bestFtree = fScoredObj.object();
            }

            k++;
        }
        i++;

        System.out.println("Reranker " + i);
        eddyRoseFullTrainTreebank.add(bestFtree);
        factLBf.evaluate(bestFtree, fTree);
    }

    LexicalizedParser lpEddyRose = getParserFromTreebank(eddyRoseFullTrainTreebank, null,
            secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);
    EvaluateTreebank evaluator = new EvaluateTreebank(lpEddyRose);
    double eddyRoseF1 = evaluator.testOnTreebank(seqTestTreebank);

    System.out.println("------------------------");
    System.out.println("    EddyRose Results    ");
    System.out.println("------------------------");
    System.out.println("Test set F1: " + eddyRoseF1);
    System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent());
}

From source file:artinex.TypDep.java

public static void main(String[] args) {
    String str = "What is index in array";
    TypDep parser = new TypDep();
    Tree tree = parser.parse(str);

    List<Tree> leaves = tree.getLeaves();
    // Print words and Pos Tags
    for (Tree leaf : leaves) {
        Tree parent = leaf.parent(tree);
        System.out.print(leaf.label().value() + "-" + parent.label().value() + " ");
    }
    System.out.println();

    //Type dependencies

    // Tree tree1 = str.get(TreeAnnotation.class);
    // Get dependency tree
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> td = gs.typedDependenciesCollapsed();
    System.out.println(td);

}
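
The TypDep constructor and its parse(String) helper are project-specific and not shown above. A minimal sketch of what such a helper might look like, assuming a LexicalizedParser model (the model path is an assumption):

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.trees.Tree;

import java.io.StringReader;
import java.util.List;

public class TypDep {
    private final LexicalizedParser lp = LexicalizedParser
            .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");

    // Tokenize the raw sentence with PTBTokenizer, then parse it into a constituency tree.
    public Tree parse(String sentence) {
        PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<>(new StringReader(sentence),
                new CoreLabelTokenFactory(), "");
        List<CoreLabel> tokens = tokenizer.tokenize();
        return lp.parseTree(tokens);
    }
}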

From source file:at.ac.tuwien.inso.subcat.utility.sentiment.SentimentAnalyser.java

License:Open Source License

public SentimentBlock get(String str) {
    if (pipeline == null) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
        pipeline = new StanfordCoreNLP(props);
    }

    LinkedList<SentenceSentiment> sentiments = new LinkedList<SentenceSentiment>();
    int[] classes = new int[5];

    double positiveSum = 0;
    double somewhatPositiveSum = 0;
    double neutralSum = 0;
    double somewhatNegativeSum = 0;
    double negativeSum = 0;

    double positiveWSum = 0;
    double somewhatPositiveWSum = 0;
    double neutralWSum = 0;
    double somewhatNegativeWSum = 0;
    double negativeWSum = 0;

    int words = 0;

    Annotation annotation = pipeline.process(str);
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        Tree tree = sentence.get(SentimentCoreAnnotations.AnnotatedTree.class);
        int sentenceWordCount = tree.getLeaves().size();

        // TODO: calculate it instead
        int predictedClass = RNNCoreAnnotations.getPredictedClass(tree);
        SimpleMatrix matrix = RNNCoreAnnotations.getPredictions(tree);

        classes[predictedClass]++;

        SentenceSentiment sentiment = new SentenceSentiment(matrix.get(0, 0), matrix.get(1, 0),
                matrix.get(2, 0), matrix.get(3, 0), matrix.get(4, 0), predictedClass, sentenceWordCount);

        positiveSum += sentiment.getPositive();
        somewhatPositiveSum += sentiment.getSomewhatPositive();
        neutralSum += sentiment.getNeutral();
        somewhatNegativeSum += sentiment.getSomewhatNegative();
        negativeSum += sentiment.getNegative();

        positiveWSum += sentiment.getPositive() * sentenceWordCount;
        somewhatPositiveWSum += sentiment.getSomewhatPositive() * sentenceWordCount;
        neutralWSum += sentiment.getNeutral() * sentenceWordCount;
        somewhatNegativeWSum += sentiment.getSomewhatNegative() * sentenceWordCount;
        negativeWSum += sentiment.getNegative() * sentenceWordCount;

        words += sentenceWordCount;

        sentiments.add(sentiment);
    }

    double positiveMean = 0;
    double somewhatPositiveMean = 0;
    double neutralMean = 0;
    double somewhatNegativeMean = 0;
    double negativeMean = 0;

    double positiveWMean = 0;
    double somewhatPositiveWMean = 0;
    double neutralWMean = 0;
    double somewhatNegativeWMean = 0;
    double negativeWMean = 0;

    if (sentiments.size() > 0) {
        positiveMean = positiveSum / sentiments.size();
        somewhatPositiveMean = somewhatPositiveSum / sentiments.size();
        neutralMean = neutralSum / sentiments.size();
        somewhatNegativeMean = somewhatNegativeSum / sentiments.size();
        negativeMean = negativeSum / sentiments.size();
    }

    if (words > 0) {
        positiveWMean = positiveWSum / words;
        somewhatPositiveWMean = somewhatPositiveWSum / words;
        neutralWMean = neutralWSum / words;
        somewhatNegativeWMean = somewhatNegativeWSum / words;
        negativeWMean = negativeWSum / words;
    }

    //System.out.println ("n:" + positiveMean  + "," +  somewhatPositiveMean  + "," + neutralMean + "," + somewhatNegativeMean + "," + negativeMean);
    //System.out.println ("w:" + positiveWMean  + "," +  somewhatPositiveWMean  + "," + neutralWMean + "," + somewhatNegativeWMean + "," + negativeWMean);

    SentimentBlock block = new SentimentBlock(sentiments, classes, positiveMean, somewhatPositiveMean,
            neutralMean, somewhatNegativeMean, negativeMean, positiveWMean, somewhatPositiveWMean, neutralWMean,
            somewhatNegativeWMean, negativeWMean, words);

    return block;
}

From source file:cc.vidr.parseviz.ParseViz.java

License:Open Source License

public static void printTreeDot(Tree tree, StringBuilder sb) {
    sb.append("graph{\nnode[shape=none];\n");
    printTreeDot(tree, sb, tree);
    sb.append("{rank=same;\n");
    for (Tree leaf : tree.getLeaves())
        sb.append("n").append(leaf.nodeNumber(tree)).append(";\n");
    sb.append("};\n");
    sb.append("}\n");
}
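
Note that the three-argument printTreeDot overload, which emits the node and edge lines, is not included above. A possible caller, as a sketch (the parser model path and the example sentence are assumptions):

// Hypothetical usage; model path and sentence are assumptions.
LexicalizedParser lp = LexicalizedParser
        .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
List<CoreLabel> tokens = Sentence.toCoreLabelList("ParseViz", "draws", "trees", ".");
Tree tree = lp.parseTree(tokens);

StringBuilder sb = new StringBuilder();
printTreeDot(tree, sb); // fills sb with a complete graph { ... } document
System.out.println(sb); // pipe to tree.dot and render with: dot -Tpng tree.dot -o tree.png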

From source file:Ceist.CeistView.java

License:Open Source License

/**
 * Displays the match results in a table with the matched parts
 * formatted.
 *
 * @param m the matcher containing the match results
 * @param matchedTree the tree which was matched
 * @param showTagged whether to show POS tags or not
 * @return the HTML to be displayed in the table row
 */
private String[] getMatcherTableRow(TregexMatcher m, Tree matchedTree, boolean showTagged) {
    //List<Tree> allMatches = new ArrayList<Tree>();

    // Find matches for templates
    String strQuestion = QuestionTemplate.getQuestionString(m, txtQuestionTemplate.getText());
    String strAnswer = AnswerTemplate.getAnswerString(m, txtAnswerTemplate.getText());

    // Display the full tree in which the match was found
    String strMatchAll = "<html>";
    String lastRef = "";

    for (Tree t : matchedTree.getLeaves()) {
        String nodeValue = t.nodeString();

        if (nodeValue.startsWith("{Q")) { // This is a match for the question string
            String ref = nodeValue.substring(2, nodeValue.indexOf("}"));
            nodeValue = nodeValue.substring(nodeValue.indexOf("}") + 1);
            t.setValue(nodeValue);

            if (!ref.equals(lastRef))
                lastRef = ref;
            else
                ref = "";

            if (!showTagged)
                strMatchAll += "<sup>" + ref + "</sup><b><font color=green>" + nodeValue + "</font></b> ";
            else
                strMatchAll += "<sup>" + ref + "</sup><b><font color=green>" + nodeValue
                        + "</font><font color=gray>/" + t.parent(matchedTree).nodeString() + "</font></b> ";

        } else if (nodeValue.startsWith("{A")) { // This is a match for the answer string
            String ref = nodeValue.substring(2, nodeValue.indexOf("}"));
            nodeValue = nodeValue.substring(nodeValue.indexOf("}") + 1);
            t.setValue(nodeValue);

            if (!ref.equals(lastRef))
                lastRef = ref;
            else
                ref = "";

            if (!showTagged)
                strMatchAll += "<sup>" + ref + "</sup><b>" + nodeValue + "</b> ";
            else
                strMatchAll += "<sup>" + ref + "</sup><b>" + nodeValue + "<font color=gray>/"
                        + t.parent(matchedTree).nodeString() + "</font></b> ";
        } else { // Normal unmatched text
            if (!showTagged)
                strMatchAll += nodeValue + " ";
            else
                strMatchAll += nodeValue + "<font color=gray>/" + t.parent(matchedTree).nodeString()
                        + "</font> ";
        }
    }

    strMatchAll += "</html>";

    return new String[] { strMatchAll, strQuestion, strAnswer };

}

From source file:com.project.NLP.Requirement.PhrasesIdentification.java

/**
 * Method to get the negative words from the sentence.
 *
 * @return an ArrayList of negative words in the sentence, which are denoted
 * by RB and CC
 */
public ArrayList NegativeSentenceDetection() {
    String phraseNotation = "RB|CC";//@" + phrase + "! << @" + phrase;
    TregexPattern VBpattern = TregexPattern.compile(phraseNotation);
    TregexMatcher matcher = VBpattern.matcher(sTree);
    ArrayList negativeLists = new ArrayList();
    while (matcher.findNextMatchingNode()) {
        Tree match = matcher.getMatch();
        Tree[] innerChild = match.children();
        for (Tree inChild : innerChild) {
            negativeLists.add(inChild.getLeaves().get(0).yieldWords().get(0).word());
        }
    }
    return negativeLists;
}