List of usage examples for edu.stanford.nlp.parser.lexparser LexicalizedParserQuery getKBestPCFGParses
public List<ScoredObject<Tree>> getKBestPCFGParses(int k)
From source file:KleinBilingualParser.java
public static void main(String[] args) { boolean trainF = false; boolean trainE = false; boolean bitrainE = false; boolean bitrainF = false; boolean saveToSerializedFile = false; boolean saveToTextFile = false; String serializedInputFileOrUrl = null; String textInputFileOrUrl = null; String serializedOutputFileOrUrl = null; String textOutputFileOrUrl = null; String treebankPathF = null;//from w w w. j ava2s. co m Treebank testTreebankF = null; Treebank tuneTreebankF = null; String testPathF = null; FileFilter testFilterF = null; String treebankPathE = null; Treebank testTreebankE = null; Treebank tuneTreebankE = null; String testPathE = null; FileFilter testFilterE = null; String tunePath = null; FileFilter tuneFilter = null; FileFilter trainFilterF = null; FileFilter trainFilterE = null; String secondaryTreebankPath = null; double secondaryTreebankWeight = 1.0; FileFilter secondaryTrainFilter = null; String trainAlignFile = null; String testAlignFile = null; String bitrainPathE = null; FileFilter bitrainFilterE = null; String bitrainPathF = null; FileFilter bitrainFilterF = null; Treebank bitrainTreebankF = null; Treebank bitrainTreebankE = null; // variables needed to process the files to be parsed TokenizerFactory<? 
extends HasWord> tokenizerFactory = null; String tokenizerOptions = null; String tokenizerFactoryClass = null; String tokenizerMethod = null; boolean tokenized = false; // whether or not the input file has already been tokenized Function<List<HasWord>, List<HasWord>> escaper = null; String tagDelimiter = null; String sentenceDelimiter = null; String elementDelimiter = null; int argIndex = 0; if (args.length < 1) { log.info( "Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*"); return; } Options fOp = new Options(); Options eOp = new Options(); List<String> optionArgs = new ArrayList<>(); String encodingF = null; // while loop through option arguments while (!args[argIndex].equals("--") && argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { trainF = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPathF = treebankDescription.first(); trainFilterF = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) { bitrainF = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; bitrainPathF = treebankDescription.first(); bitrainFilterF = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) { try { fOp.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance(); } catch (ClassNotFoundException e) { log.info("Class not found: " + args[argIndex + 1]); throw new RuntimeException(e); } catch (InstantiationException e) { log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " 
+ e.toString()); throw new RuntimeException(e); } catch (IllegalAccessException e) { log.info("Illegal access" + e); throw new RuntimeException(e); } argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-encoding")) { // sets encoding for TreebankLangParserParams // redone later to override any serialized parser one read in encodingF = args[argIndex + 1]; fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPathF = treebankDescription.first(); testFilterF = treebankDescription.second(); } else { int oldIndex = argIndex; argIndex = fOp.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } System.out.println(argIndex + " " + args.length); } // end while loop through arguments for french argIndex++;//go to english arguments while (argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { trainE = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPathE = treebankDescription.first(); trainFilterE = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) { bitrainE = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; bitrainPathE = treebankDescription.first(); bitrainFilterE = treebankDescription.second(); } 
else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPathE = treebankDescription.first(); testFilterE = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-trainAlignFile")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-trainAlignFile"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; trainAlignFile = treebankDescription.first(); } else if (args[argIndex].equalsIgnoreCase("-testAlignFile")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testAlignFile"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testAlignFile = treebankDescription.first(); } else { int oldIndex = argIndex; argIndex = eOp.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } } // end while loop through arguments for english // if (!train && fOp.testOptions.verbose) { // StringUtils.logInvocationString(log, args); // } LexicalizedParser lpF; // always initialized in next if-then-else block LexicalizedParser lpE; //TRAIN A PARSER // so we train a parser using the treebank GrammarCompactor compactorF = null; GrammarCompactor compactorE = null; if (fOp.trainOptions.compactGrammar() == 3) { compactorF = new ExactGrammarCompactor(fOp, false, false); } if (eOp.trainOptions.compactGrammar() == 3) { compactorE = new ExactGrammarCompactor(eOp, false, false); } Treebank trainTreebankF = makeTreebank(treebankPathF, fOp, trainFilterF); Treebank trainTreebankE = makeTreebank(treebankPathE, eOp, trainFilterE); fOp.testOptions.quietEvaluation = true; eOp.testOptions.quietEvaluation = true; lpF = getParserFromTreebank(trainTreebankF, null, 
secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); lpE = getParserFromTreebank(trainTreebankE, null, secondaryTreebankWeight, compactorE, eOp, tuneTreebankE, null); // the following has to go after reading parser to make sure // op and tlpParams are the same for train and test // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER if (encodingF != null) { fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); } if (bitrainFilterF != null || bitrainPathF != null) { if (bitrainPathF == null) { //? if (treebankPathF == null) { throw new RuntimeException("No bitrain treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"'); bitrainPathF = treebankPathF; } } bitrainTreebankF = fOp.tlpParams.testMemoryTreebank(); bitrainTreebankF.loadPath(bitrainPathF, bitrainFilterF); } if (bitrainFilterE != null || bitrainPathE != null) { if (bitrainPathE == null) { if (treebankPathE == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathE + '\"'); bitrainPathE = treebankPathE; } } bitrainTreebankE = eOp.tlpParams.testMemoryTreebank(); bitrainTreebankE.loadPath(bitrainPathE, bitrainFilterE); } if (encodingF != null) { fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); } if (testFilterF != null || testPathF != null) { if (testPathF == null) { if (treebankPathF == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. 
Using train path: \"" + treebankPathF + '\"'); testPathF = treebankPathF; } } testTreebankF = fOp.tlpParams.testMemoryTreebank(); testTreebankF.loadPath(testPathF, testFilterF); } fOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(fOp.tlpParams.sisterSplitters())); if (testFilterE != null || testPathE != null) { if (testPathE == null) { if (treebankPathE == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathE + '\"'); testPathE = treebankPathE; } } testTreebankE = eOp.tlpParams.testMemoryTreebank(); testTreebankE.loadPath(testPathE, testFilterE); } eOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(eOp.tlpParams.sisterSplitters())); //PARALLEL ALIGNMENT FEATURE CALCULATION, CALCULATION OF 'A' MATRIX double[] weights = new double[8]; double diff; weights[0] = 0.01; weights[1] = -0.002; weights[2] = 0.002; weights[3] = 0.002; weights[4] = 0.002; weights[5] = 0.002; weights[6] = -0.002; weights[7] = -0.002; ArrayList<HashMap<Integer, ArrayList<Integer>>> bitrainAlignments = null; ArrayList<HashMap<Integer, ArrayList<Integer>>> testAlignments = null; //String alignFile="../../berkeleyaligner/output/test.align"; try { AlignmentProcessor trainAP = new AlignmentProcessor(trainAlignFile); bitrainAlignments = trainAP.createAlignments(); AlignmentProcessor testAP = new AlignmentProcessor(testAlignFile); testAlignments = testAP.createAlignments(); } catch (FileNotFoundException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } int kE = 10; int kF = 10; int numFeatures = 8; int numBigSentences = 0; do { diff = 0.0; Iterator<Tree> eTrees = bitrainTreebankE.iterator(); Iterator<Tree> fTrees = bitrainTreebankF.iterator(); Iterator<HashMap<Integer, ArrayList<Integer>>> alignIterator = bitrainAlignments.iterator(); numBigSentences = 0; //features are used in the order they are defined double 
A[][][][] = new double[bitrainTreebankE.size()][numFeatures][kE][kF]; int ePsGold[] = new int[bitrainTreebankE.size()]; int fPsGold[] = new int[bitrainTreebankF.size()]; int i = 0; while (eTrees.hasNext() && fTrees.hasNext() && alignIterator.hasNext()) { HashMap<Integer, ArrayList<Integer>> alignMap = alignIterator.next(); Tree fTree = fTrees.next(); Tree eTree = eTrees.next(); if (fTree.getLeaves().size() > 70 || fTree.getLeaves().size() > 70) { //System.out.println("Too big : " + i); numBigSentences++; fPsGold[i] = 3; ePsGold[i] = 3; i++; continue; } List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords()); List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords()); LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery(); LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery(); lpqE.parse(sentenceE); lpqF.parse(sentenceF); List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF); List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE); fPsGold[i] = 3; ePsGold[i] = 3; int j = 0; int k = 0; for (ScoredObject<Tree> eScoredObj : kBestE) { k = 0; for (ScoredObject<Tree> fScoredObj : kBestF) { eScoredObj.object().setSpans(); fScoredObj.object().setSpans(); HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(), weights, alignMap); //had to reduce likelihood scores by factor of 10 to keep the optimizer working A[i][0][j][k] = eScoredObj.score() / 1000; A[i][1][j][k] = fScoredObj.score() / 1000; for (Map.Entry entry : alignment.entrySet()) { Tree nodeF = (Tree) entry.getKey(); Tree nodeE = (Tree) entry.getValue(); A[i][2][j][k] += spanDiff(nodeF, nodeE); A[i][3][j][k] += numChildren(nodeF, nodeE); A[i][4][j][k] += insideBoth(nodeF, nodeE, alignMap); A[i][5][j][k] += insideSrcOutsideTgt(nodeF, nodeE, alignMap); A[i][6][j][k] += insideTgtOutsideSrc(nodeF, nodeE, alignMap); A[i][7][j][k] += bias(nodeF, nodeE); } k++; } j++; } 
//System.out.println("Sentence " + i); i++; } /////////////////////// // // MALLET optimizer // /////////////////////// System.out.println(); System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*"); System.out.println(); System.out.println("Beginning convex optimization..."); System.out.println(); System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*"); System.out.println(); OptimizerExample optimizable = new OptimizerExample(weights, A, ePsGold, fPsGold); Optimizer optimizer = new LimitedMemoryBFGS(optimizable); boolean converged = false; try { converged = optimizer.optimize(); } catch (IllegalArgumentException e) { // This exception may be thrown if L-BFGS // cannot step in the current direction. // This condition does not necessarily mean that // the optimizer has failed, but it doesn't want // to claim to have succeeded... } catch (cc.mallet.optimize.OptimizationException e) { System.out.println(e.getMessage()); } for (int x = 0; x < weights.length; x++) { diff += (optimizable.getParameter(x) - weights[x]) * (optimizable.getParameter(x) - weights[x]); weights[x] = optimizable.getParameter(x); System.out.print(weights[x] + ", "); } System.out.println(); diff /= weights.length; System.out.println("Current difference: " + diff); } while (diff > 0.0005); //TESTING BILINGUAL PARSER Treebank bilingTestTreebankF = testTreebankF; Treebank bilingTestTreebankE = testTreebankE; Iterator<Tree> eTreesBling = testTreebankE.iterator(); Iterator<Tree> fTreesBling = testTreebankF.iterator(); boolean runningAveragesF = Boolean.parseBoolean(fOp.testOptions.evals.getProperty("runningAverages")); boolean runningAveragesE = Boolean.parseBoolean(eOp.testOptions.evals.getProperty("runningAverages")); AbstractEval pcfgLBf = new Evalb("pcfg LP/LR", runningAveragesF); AbstractEval factLBf = new Evalb("factor LP/LR", runningAveragesF); AbstractEval pcfgLBe = new Evalb("pcfg LP/LR", runningAveragesE); AbstractEval factLBe = new Evalb("factor LP/LR", runningAveragesE); int i = 0; 
Iterator<HashMap<Integer, ArrayList<Integer>>> alignIteratorTEST = testAlignments.iterator(); while (eTreesBling.hasNext() && fTreesBling.hasNext() && alignIteratorTEST.hasNext()) { HashMap<Integer, ArrayList<Integer>> alignMap = alignIteratorTEST.next(); Tree fTree = fTreesBling.next(); Tree eTree = eTreesBling.next(); List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords()); List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords()); LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery(); LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery(); lpqE.parse(sentenceE); lpqF.parse(sentenceF); List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF); List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE); int j = 0; int k = 0; double maxScore = -Double.MAX_VALUE; Tree bestFtree = null; Tree bestEtree = null; for (ScoredObject<Tree> eScoredObj : kBestE) { k = 0; for (ScoredObject<Tree> fScoredObj : kBestF) { eScoredObj.object().setSpans(); fScoredObj.object().setSpans(); HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(), weights, alignMap); double currentScore = 0.0; for (Map.Entry entry : alignment.entrySet()) { Tree nodeF = (Tree) entry.getKey(); Tree nodeE = (Tree) entry.getValue(); currentScore += weights[0] * eScoredObj.score() / 1000; currentScore += weights[1] * fScoredObj.score() / 1000; currentScore += weights[2] * spanDiff(nodeF, nodeE); currentScore += weights[3] * numChildren(nodeF, nodeE); currentScore += weights[4] * insideBoth(nodeF, nodeE, alignMap); currentScore += weights[5] * insideSrcOutsideTgt(nodeF, nodeE, alignMap); currentScore += weights[6] * insideTgtOutsideSrc(nodeF, nodeE, alignMap); currentScore += weights[7] * bias(nodeF, nodeE); } if (currentScore > maxScore) { maxScore = currentScore; bestFtree = fScoredObj.object(); bestEtree = eScoredObj.object(); } k++; } j++; } i++; pcfgLBe.evaluate(bestEtree, 
eTree); factLBe.evaluate(bestEtree, eTree); pcfgLBf.evaluate(bestFtree, fTree); factLBf.evaluate(bestFtree, fTree); } System.out.println("------------------------"); System.out.println(" English Results "); System.out.println("------------------------"); System.out.println("PCFG labeled f1: " + pcfgLBe.getEvalbF1Percent()); System.out.println("Factored labeled f1: " + factLBe.getEvalbF1Percent()); System.out.println("------------------------"); System.out.println(" French Results "); System.out.println("------------------------"); System.out.println("PCFG labeled f1: " + pcfgLBf.getEvalbF1Percent()); System.out.println("Factored labeled f1: " + factLBf.getEvalbF1Percent()); System.out.println("------------------------"); System.out.println("Number of sentences too big: " + numBigSentences); }
From source file:EddyRoseDomainAdaptation.java
public static void main(String[] args) { boolean trainF = false; boolean trainE = false; boolean bitrainE = false; boolean bitrainF = false; boolean saveToSerializedFile = false; boolean saveToTextFile = false; String serializedInputFileOrUrl = null; String textInputFileOrUrl = null; String serializedOutputFileOrUrl = null; String textOutputFileOrUrl = null; String treebankPathF = null;// ww w .jav a2s. co m Treebank testTreebankF = null; Treebank seqTestTreebank = null; Treebank tuneTreebankF = null; String testPathF = null; FileFilter testFilterF = null; String treebankPathE = null; Treebank testTreebankE = null; Treebank tuneTreebankE = null; String testPathE = null; FileFilter testFilterE = null; String seqTestPath = null; FileFilter seqTestFilter = null; String tunePath = null; FileFilter tuneFilter = null; FileFilter trainFilterF = null; FileFilter trainFilterE = null; String secondaryTreebankPath = null; double secondaryTreebankWeight = 1.0; FileFilter secondaryTrainFilter = null; String trainAlignFile = null; String testAlignFile = null; String bitrainPathE = null; FileFilter bitrainFilterE = null; String bitrainPathF = null; FileFilter bitrainFilterF = null; Treebank bitrainTreebankF = null; Treebank bitrainTreebankE = null; // variables needed to process the files to be parsed TokenizerFactory<? 
extends HasWord> tokenizerFactory = null; String tokenizerOptions = null; String tokenizerFactoryClass = null; String tokenizerMethod = null; boolean tokenized = false; // whether or not the input file has already been tokenized Function<List<HasWord>, List<HasWord>> escaper = null; String tagDelimiter = null; String sentenceDelimiter = null; String elementDelimiter = null; int argIndex = 0; if (args.length < 1) { log.info( "Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*"); return; } Options fOp = new Options(); Options eOp = new Options(); List<String> optionArgs = new ArrayList<>(); String encodingF = null; // while loop through option arguments while (!args[argIndex].equals("--") && argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { trainF = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPathF = treebankDescription.first(); trainFilterF = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) { bitrainF = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; bitrainPathF = treebankDescription.first(); bitrainFilterF = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) { try { fOp.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance(); } catch (ClassNotFoundException e) { log.info("Class not found: " + args[argIndex + 1]); throw new RuntimeException(e); } catch (InstantiationException e) { log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " 
+ e.toString()); throw new RuntimeException(e); } catch (IllegalAccessException e) { log.info("Illegal access" + e); throw new RuntimeException(e); } argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-encoding")) { // sets encoding for TreebankLangParserParams // redone later to override any serialized parser one read in encodingF = args[argIndex + 1]; fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPathF = treebankDescription.first(); testFilterF = treebankDescription.second(); } else { int oldIndex = argIndex; argIndex = fOp.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } System.out.println(argIndex + " " + args.length); } // end while loop through arguments for french argIndex++;//go to english arguments while (argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { trainE = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPathE = treebankDescription.first(); trainFilterE = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) { bitrainE = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; bitrainPathE = treebankDescription.first(); bitrainFilterE = treebankDescription.second(); } 
else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPathE = treebankDescription.first(); testFilterE = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-seqtest")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; seqTestPath = treebankDescription.first(); seqTestFilter = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-trainAlignFile")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-trainAlignFile"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; trainAlignFile = treebankDescription.first(); } else if (args[argIndex].equalsIgnoreCase("-testAlignFile")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testAlignFile"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testAlignFile = treebankDescription.first(); } else { int oldIndex = argIndex; argIndex = eOp.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } } // end while loop through arguments for english // if (!train && fOp.testOptions.verbose) { // StringUtils.logInvocationString(log, args); // } LexicalizedParser lpF; // always initialized in next if-then-else block LexicalizedParser lpE; //TRAIN A PARSER // so we train a parser using the treebank GrammarCompactor compactorF = null; if (fOp.trainOptions.compactGrammar() == 3) { compactorF = new ExactGrammarCompactor(fOp, false, false); } Treebank trainTreebankF = makeTreebank(treebankPathF, fOp, trainFilterF); fOp.testOptions.quietEvaluation = 
true; GrammarCompactor compactorE = null; if (eOp.trainOptions.compactGrammar() == 3) { compactorE = new ExactGrammarCompactor(eOp, false, false); } Treebank trainTreebankE = makeTreebank(treebankPathE, eOp, trainFilterE); eOp.testOptions.quietEvaluation = true; lpF = getParserFromTreebank(trainTreebankF, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); lpE = getParserFromTreebank(trainTreebankE, null, secondaryTreebankWeight, compactorE, eOp, tuneTreebankE, null); // the following has to go after reading parser to make sure // op and tlpParams are the same for train and test // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER if (encodingF != null) { fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); } if (bitrainFilterF != null || bitrainPathF != null) { if (bitrainPathF == null) { //? if (treebankPathF == null) { throw new RuntimeException("No bitrain treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"'); bitrainPathF = treebankPathF; } } bitrainTreebankF = fOp.tlpParams.testMemoryTreebank(); bitrainTreebankF.loadPath(bitrainPathF, bitrainFilterF); } if (bitrainFilterE != null || bitrainPathE != null) { if (bitrainPathE == null) { if (treebankPathE == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. 
Using train path: \"" + treebankPathE + '\"'); bitrainPathE = treebankPathE; } } bitrainTreebankE = eOp.tlpParams.testMemoryTreebank(); bitrainTreebankE.loadPath(bitrainPathE, bitrainFilterE); } if (encodingF != null) { fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); } if (testFilterF != null || testPathF != null) { if (testPathF == null) { if (treebankPathF == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"'); testPathF = treebankPathF; } } testTreebankF = fOp.tlpParams.testMemoryTreebank(); testTreebankF.loadPath(testPathF, testFilterF); } //generate sequioa treebank seqTestTreebank = fOp.tlpParams.testMemoryTreebank(); seqTestTreebank.loadPath(seqTestPath, seqTestFilter); fOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(fOp.tlpParams.sisterSplitters())); if (testFilterE != null || testPathE != null) { if (testPathE == null) { if (treebankPathE == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. 
Using train path: \"" + treebankPathE + '\"'); testPathE = treebankPathE; } } testTreebankE = eOp.tlpParams.testMemoryTreebank(); testTreebankE.loadPath(testPathE, testFilterE); } eOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(eOp.tlpParams.sisterSplitters())); ////////////////////// // // Self-Training // ////////////////////// MemoryTreebank selfTrainInitTreebank = new MemoryTreebank(); MemoryTreebank selfTrainFinalTreebank = new MemoryTreebank(); selfTrainInitTreebank.addAll(trainTreebankF); selfTrainFinalTreebank.addAll(trainTreebankF); LexicalizedParser fSelfTrainInit = getParserFromTreebank(selfTrainInitTreebank, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); int z = 0; boolean runningAveragesF = Boolean.parseBoolean(fOp.testOptions.evals.getProperty("runningAverages")); AbstractEval factLBf = new Evalb("factor LP/LR", runningAveragesF); for (Tree goldTree : testTreebankF) { List<? extends HasWord> sentence = Sentence.toCoreLabelList(goldTree.yieldWords()); Tree guessTree = fSelfTrainInit.parseTree(sentence); selfTrainFinalTreebank.add(guessTree); factLBf.evaluate(guessTree, goldTree); System.out.println("Self-training : " + (++z)); } LexicalizedParser fSelfTrainFinal = getParserFromTreebank(selfTrainFinalTreebank, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); EvaluateTreebank evaluatorH = new EvaluateTreebank(fSelfTrainFinal); double scoreF1 = evaluatorH.testOnTreebank(seqTestTreebank); System.out.println("------------------------"); System.out.println(" Self Train Results "); System.out.println("------------------------"); System.out.println("Test set F1: " + scoreF1); System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent()); ////////////////////// ////////////////////// //PARALLEL ALIGNMENT FEATURE CALCULATION, CALCULATION OF 'A' MATRIX double[] weights = new double[8]; double diff; weights[0] = 0.01; weights[1] = -0.002; weights[2] = 0.002; weights[3] = 0.002; 
weights[4] = 0.002; weights[5] = 0.002; weights[6] = -0.002; weights[7] = -0.002; ArrayList<HashMap<Integer, ArrayList<Integer>>> bitrainAlignments = null; ArrayList<HashMap<Integer, ArrayList<Integer>>> testAlignments = null; //String alignFile="../../berkeleyaligner/output/test.align"; try { AlignmentProcessor trainAP = new AlignmentProcessor(trainAlignFile); bitrainAlignments = trainAP.createAlignments(); AlignmentProcessor testAP = new AlignmentProcessor(testAlignFile); testAlignments = testAP.createAlignments(); } catch (FileNotFoundException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } int kE = 10; int kF = 10; int numFeatures = 8; int numBigSentences = 0; do { diff = 0.0; Iterator<Tree> eTrees = bitrainTreebankE.iterator(); Iterator<Tree> fTrees = bitrainTreebankF.iterator(); Iterator<HashMap<Integer, ArrayList<Integer>>> alignIterator = bitrainAlignments.iterator(); numBigSentences = 0; //features are used in the order they are defined double A[][][][] = new double[bitrainTreebankE.size()][numFeatures][kE][kF]; int ePsGold[] = new int[bitrainTreebankE.size()]; int fPsGold[] = new int[bitrainTreebankF.size()]; int i = 0; while (eTrees.hasNext() && fTrees.hasNext() && alignIterator.hasNext()) { HashMap<Integer, ArrayList<Integer>> alignMap = alignIterator.next(); Tree fTree = fTrees.next(); Tree eTree = eTrees.next(); if (fTree.getLeaves().size() > 70 || fTree.getLeaves().size() > 70) { //System.out.println("Too big : " + i); numBigSentences++; fPsGold[i] = 3; ePsGold[i] = 3; i++; continue; } List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords()); List<? 
extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords()); LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery(); LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery(); lpqE.parse(sentenceE); lpqF.parse(sentenceF); List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF); List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE); fPsGold[i] = 3; ePsGold[i] = 3; int j = 0; int k = 0; for (ScoredObject<Tree> eScoredObj : kBestE) { k = 0; for (ScoredObject<Tree> fScoredObj : kBestF) { eScoredObj.object().setSpans(); fScoredObj.object().setSpans(); HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(), weights, alignMap); //had to reduce likelihood scores by factor of 10 to keep the optimizer working A[i][0][j][k] = eScoredObj.score() / 1000; A[i][1][j][k] = fScoredObj.score() / 1000; for (Map.Entry entry : alignment.entrySet()) { Tree nodeF = (Tree) entry.getKey(); Tree nodeE = (Tree) entry.getValue(); A[i][2][j][k] += spanDiff(nodeF, nodeE); A[i][3][j][k] += numChildren(nodeF, nodeE); A[i][4][j][k] += insideBoth(nodeF, nodeE, alignMap); A[i][5][j][k] += insideSrcOutsideTgt(nodeF, nodeE, alignMap); A[i][6][j][k] += insideTgtOutsideSrc(nodeF, nodeE, alignMap); A[i][7][j][k] += bias(nodeF, nodeE); } k++; } j++; } //System.out.println("Sentence " + i); i++; } /////////////////////// // // MALLET optimizer // /////////////////////// System.out.println(); System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*"); System.out.println(); System.out.println("Beginning convex optimization..."); System.out.println(); System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*"); System.out.println(); OptimizerExample optimizable = new OptimizerExample(weights, A, ePsGold, fPsGold); Optimizer optimizer = new LimitedMemoryBFGS(optimizable); boolean converged = false; try { converged = optimizer.optimize(); } catch (IllegalArgumentException e) { // This exception may be thrown if 
L-BFGS // cannot step in the current direction. // This condition does not necessarily mean that // the optimizer has failed, but it doesn't want // to claim to have succeeded... } catch (cc.mallet.optimize.OptimizationException e) { System.out.println(e.getMessage()); } for (int x = 0; x < weights.length; x++) { diff += (optimizable.getParameter(x) - weights[x]) * (optimizable.getParameter(x) - weights[x]); weights[x] = optimizable.getParameter(x); System.out.print(weights[x] + ", "); } System.out.println(); diff /= weights.length; System.out.println("Current difference: " + diff); } while (diff > 0.0005); //GENERATE TRAINING DATA USING KLEIN RERANKER //assumes the 'test' data from KleinBilingualParser.java is the unannotated data //that the reranker has to annotate. factLBf = new Evalb("factor LP/LR", runningAveragesF); MemoryTreebank eddyRoseFullTrainTreebank = new MemoryTreebank(); eddyRoseFullTrainTreebank.addAll(trainTreebankF); eddyRoseFullTrainTreebank.addAll(bitrainTreebankF); Treebank unannotTreebankF = testTreebankF; Treebank annotTreebankE = testTreebankE; Iterator<Tree> eTreesBling = unannotTreebankF.iterator(); Iterator<Tree> fTreesBling = annotTreebankE.iterator(); int i = 0; Iterator<HashMap<Integer, ArrayList<Integer>>> alignIteratorTEST = testAlignments.iterator(); while (eTreesBling.hasNext() && fTreesBling.hasNext() && alignIteratorTEST.hasNext()) { HashMap<Integer, ArrayList<Integer>> alignMap = alignIteratorTEST.next(); Tree fTree = fTreesBling.next(); Tree eTree = eTreesBling.next(); List<? 
extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords()); LexicalizedParserQuery lpqF = (LexicalizedParserQuery) fSelfTrainFinal.parserQuery(); lpqF.parse(sentenceF); List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF); int j = 0; int k = 0; double maxScore = -Double.MAX_VALUE; Tree bestFtree = null; for (ScoredObject<Tree> fScoredObj : kBestF) { eTree.setSpans(); fScoredObj.object().setSpans(); HashMap<Tree, Tree> alignment = getHungarianAlignment(eTree, fScoredObj.object(), weights, alignMap); double currentScore = 0.0; for (Map.Entry entry : alignment.entrySet()) { Tree nodeF = (Tree) entry.getKey(); Tree nodeE = (Tree) entry.getValue(); currentScore += weights[0] * 0.0;//because gold standard tree is assumed to have probability 1 currentScore += weights[1] * fScoredObj.score() / 1000; currentScore += weights[2] * spanDiff(nodeF, nodeE); currentScore += weights[3] * numChildren(nodeF, nodeE); currentScore += weights[4] * insideBoth(nodeF, nodeE, alignMap); currentScore += weights[5] * insideSrcOutsideTgt(nodeF, nodeE, alignMap); currentScore += weights[6] * insideTgtOutsideSrc(nodeF, nodeE, alignMap); currentScore += weights[7] * bias(nodeF, nodeE); } if (currentScore > maxScore) { maxScore = currentScore; bestFtree = fScoredObj.object(); } k++; } i++; System.out.println("Reranker " + i); eddyRoseFullTrainTreebank.add(bestFtree); factLBf.evaluate(bestFtree, fTree); } LexicalizedParser lpEddyRose = getParserFromTreebank(eddyRoseFullTrainTreebank, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); EvaluateTreebank evaluator = new EvaluateTreebank(lpEddyRose); double eddyRoseF1 = evaluator.testOnTreebank(seqTestTreebank); System.out.println("------------------------"); System.out.println(" EddyRose Results "); System.out.println("------------------------"); System.out.println("Test set F1: " + eddyRoseF1); System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent()); }
From source file:com.github.kutschkem.Qgen.QuestionRankerByParseProbability.java
License:Open Source License
/**
 * Ranks the given questions by parser score, highest score first.
 *
 * <p>For each question, the first sentence of {@code q.Question} is tokenized,
 * parsed, and scored as the average over the 3 best PCFG parses (via the
 * {@code average} helper). The input list is not modified; a newly sorted copy
 * is returned.
 *
 * @param questions candidate questions to rank
 * @return a new list containing the same questions, sorted by descending score
 */
public List<Question> rank(List<Question> questions) {
    System.out.println("afterPipeline:" + questions.size());
    final Map<Question, Double> scores = new HashMap<Question, Double>();
    for (Question q : questions) {
        // Take only the first sentence of the question text.
        // NOTE(review): iterator().next() throws NoSuchElementException if the
        // preprocessor yields no sentences (e.g. blank text) — confirm inputs
        // are guaranteed non-empty upstream.
        List<HasWord> tokens = new DocumentPreprocessor(new StringReader(q.Question)).iterator().next();
        LexicalizedParserQuery query = parser.parserQuery();
        // NOTE(review): parse() returns a success flag that is ignored here;
        // a failed parse may make getKBestPCFGParses misbehave — verify.
        query.parse(tokens);
        scores.put(q, average(query.getKBestPCFGParses(3)));
    }
    List<Question> result = new ArrayList<Question>(questions);
    // Sort descending by score. Compare o2 against o1 instead of negating
    // compareTo's result: negation is fragile because compare() may return any
    // negative/positive int, and -Integer.MIN_VALUE overflows.
    Collections.sort(result, new Comparator<Question>() {
        @Override
        public int compare(Question o1, Question o2) {
            return scores.get(o2).compareTo(scores.get(o1));
        }
    });
    for (Question q : result) {
        System.out.println(q.Question + " " + scores.get(q));
    }
    return result;
}