Example usage for edu.stanford.nlp.trees Tree setSpans

List of usage examples for edu.stanford.nlp.trees Tree setSpans

Introduction

On this page you can find example usage for edu.stanford.nlp.trees Tree setSpans.

Prototype

public void setSpans() 

Document

Assign a SpanAnnotation on each node of this tree.
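
Before the usage examples below, here is a minimal, self-contained sketch of the call in isolation: build a tree, call setSpans(), and read the spans back. It is only a sketch under a couple of assumptions: that setSpans() stores its result on CoreLabel nodes (hence the CoreLabel label factory when reading the tree), and that the spans can be read back with Tree.getSpan(), which returns an edu.stanford.nlp.util.IntPair. The sample sentence and the class name SetSpansDemo are illustrative only.

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.LabeledScoredTreeReaderFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.IntPair;

public class SetSpansDemo {
    public static void main(String[] args) {
        // Read a small Penn-Treebank-style bracketed tree, using CoreLabel labels
        // because setSpans() stores the span annotation on each node's label.
        Tree tree = Tree.valueOf("(ROOT (S (NP (DT The) (NN cat)) (VP (VBZ sleeps))))",
                new LabeledScoredTreeReaderFactory(CoreLabel.factory()));

        // Attach a SpanAnnotation (start/end leaf indices) to every node of the tree.
        tree.setSpans();

        // Tree is Iterable over its nodes; print each label with its assigned span.
        // getSpan() returns null on trees where setSpans() has not been called.
        for (Tree node : tree) {
            IntPair span = node.getSpan();
            System.out.println(node.value() + " -> " + span);
        }
    }
}

Each printed span pairs a node with the zero-based indices of the first and last leaf it covers.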

Usage

From source file: EddyRoseDomainAdaptation.java

public static void main(String[] args) {
    boolean trainF = false;
    boolean trainE = false;
    boolean bitrainE = false;
    boolean bitrainF = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    String treebankPathF = null;
    Treebank testTreebankF = null;
    Treebank seqTestTreebank = null;
    Treebank tuneTreebankF = null;
    String testPathF = null;
    FileFilter testFilterF = null;
    String treebankPathE = null;
    Treebank testTreebankE = null;
    Treebank tuneTreebankE = null;
    String testPathE = null;
    FileFilter testFilterE = null;
    String seqTestPath = null;
    FileFilter seqTestFilter = null;
    String tunePath = null;
    FileFilter tuneFilter = null;
    FileFilter trainFilterF = null;
    FileFilter trainFilterE = null;
    String secondaryTreebankPath = null;
    double secondaryTreebankWeight = 1.0;
    FileFilter secondaryTrainFilter = null;

    String trainAlignFile = null;
    String testAlignFile = null;
    String bitrainPathE = null;
    FileFilter bitrainFilterE = null;
    String bitrainPathF = null;
    FileFilter bitrainFilterF = null;
    Treebank bitrainTreebankF = null;
    Treebank bitrainTreebankE = null;

    // variables needed to process the files to be parsed
    TokenizerFactory<? extends HasWord> tokenizerFactory = null;
    String tokenizerOptions = null;
    String tokenizerFactoryClass = null;
    String tokenizerMethod = null;
    boolean tokenized = false; // whether or not the input file has already been tokenized
    Function<List<HasWord>, List<HasWord>> escaper = null;
    String tagDelimiter = null;
    String sentenceDelimiter = null;
    String elementDelimiter = null;
    int argIndex = 0;
    if (args.length < 1) {
        log.info(
                "Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
        return;
    }

    Options fOp = new Options();
    Options eOp = new Options();
    List<String> optionArgs = new ArrayList<>();
    String encodingF = null;

    // while loop through option arguments
    while (argIndex < args.length && !args[argIndex].equals("--") && args[argIndex].charAt(0) == '-') {
        if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
            trainF = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-train");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebankPathF = treebankDescription.first();
            trainFilterF = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-bitrain")
                || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) {
            bitrainF = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-bitrain");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            bitrainPathF = treebankDescription.first();
            bitrainFilterF = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
            try {
                fOp.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
            } catch (ClassNotFoundException e) {
                log.info("Class not found: " + args[argIndex + 1]);
                throw new RuntimeException(e);
            } catch (InstantiationException e) {
                log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
                throw new RuntimeException(e);
            } catch (IllegalAccessException e) {
                log.info("Illegal access" + e);
                throw new RuntimeException(e);
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
            // sets encoding for TreebankLangParserParams
            // redone later to override any serialized parser one read in
            encodingF = args[argIndex + 1];
            fOp.tlpParams.setInputEncoding(encodingF);
            fOp.tlpParams.setOutputEncoding(encodingF);
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-treebank")
                || args[argIndex].equalsIgnoreCase("-testTreebank")
                || args[argIndex].equalsIgnoreCase("-test")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testPathF = treebankDescription.first();
            testFilterF = treebankDescription.second();
        } else {
            int oldIndex = argIndex;
            argIndex = fOp.setOptionOrWarn(args, argIndex);
            optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
        }

        System.out.println(argIndex + " " + args.length);
    } // end while loop through arguments for french

    argIndex++; // go to the English arguments

    while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
        if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
            trainE = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-train");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebankPathE = treebankDescription.first();
            trainFilterE = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-bitrain")
                || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) {
            bitrainE = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-bitrain");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            bitrainPathE = treebankDescription.first();
            bitrainFilterE = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-treebank")
                || args[argIndex].equalsIgnoreCase("-testTreebank")
                || args[argIndex].equalsIgnoreCase("-test")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testPathE = treebankDescription.first();
            testFilterE = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-seqtest")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            seqTestPath = treebankDescription.first();
            seqTestFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-trainAlignFile")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-trainAlignFile");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            trainAlignFile = treebankDescription.first();
        } else if (args[argIndex].equalsIgnoreCase("-testAlignFile")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-testAlignFile");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testAlignFile = treebankDescription.first();
        } else {
            int oldIndex = argIndex;
            argIndex = eOp.setOptionOrWarn(args, argIndex);
            optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
        }
    } // end while loop through arguments for english

    //    if (!train && fOp.testOptions.verbose) {
    //      StringUtils.logInvocationString(log, args);
    //    }

    LexicalizedParser lpF; // initialized below from the French training treebank
    LexicalizedParser lpE; // initialized below from the English training treebank
    //TRAIN A PARSER
    // so we train a parser using the treebank

    GrammarCompactor compactorF = null;
    if (fOp.trainOptions.compactGrammar() == 3) {
        compactorF = new ExactGrammarCompactor(fOp, false, false);
    }
    Treebank trainTreebankF = makeTreebank(treebankPathF, fOp, trainFilterF);
    fOp.testOptions.quietEvaluation = true;

    GrammarCompactor compactorE = null;
    if (eOp.trainOptions.compactGrammar() == 3) {
        compactorE = new ExactGrammarCompactor(eOp, false, false);
    }

    Treebank trainTreebankE = makeTreebank(treebankPathE, eOp, trainFilterE);

    eOp.testOptions.quietEvaluation = true;

    lpF = getParserFromTreebank(trainTreebankF, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF,
            null);
    lpE = getParserFromTreebank(trainTreebankE, null, secondaryTreebankWeight, compactorE, eOp, tuneTreebankE,
            null);

    // The encoding is re-applied after the parsers are built so that op and tlpParams
    // stay consistent between training and testing, and so that a user-specified
    // encoding is not overwritten by one stored in a serialized parser.
    if (encodingF != null) {
        fOp.tlpParams.setInputEncoding(encodingF);
        fOp.tlpParams.setOutputEncoding(encodingF);
    }

    if (bitrainFilterF != null || bitrainPathF != null) {
        if (bitrainPathF == null) {
            if (treebankPathF == null) {
                throw new RuntimeException("No bitrain treebank path specified...");
            } else {
                log.info("No bitrain treebank path specified.  Using train path: \"" + treebankPathF + '\"');
                bitrainPathF = treebankPathF;
            }
        }
        bitrainTreebankF = fOp.tlpParams.testMemoryTreebank();
        bitrainTreebankF.loadPath(bitrainPathF, bitrainFilterF);
    }

    if (bitrainFilterE != null || bitrainPathE != null) {
        if (bitrainPathE == null) {
            if (treebankPathE == null) {
                throw new RuntimeException("No bitrain treebank path specified...");
            } else {
                log.info("No bitrain treebank path specified.  Using train path: \"" + treebankPathE + '\"');
                bitrainPathE = treebankPathE;
            }
        }
        bitrainTreebankE = eOp.tlpParams.testMemoryTreebank();
        bitrainTreebankE.loadPath(bitrainPathE, bitrainFilterE);
    }

    if (testFilterF != null || testPathF != null) {
        if (testPathF == null) {
            if (treebankPathF == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathF + '\"');
                testPathF = treebankPathF;
            }
        }
        testTreebankF = fOp.tlpParams.testMemoryTreebank();
        testTreebankF.loadPath(testPathF, testFilterF);
    }

    // generate the Sequoia test treebank
    seqTestTreebank = fOp.tlpParams.testMemoryTreebank();
    seqTestTreebank.loadPath(seqTestPath, seqTestFilter);

    fOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(fOp.tlpParams.sisterSplitters()));

    if (testFilterE != null || testPathE != null) {
        if (testPathE == null) {
            if (treebankPathE == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathE + '\"');
                testPathE = treebankPathE;
            }
        }
        testTreebankE = eOp.tlpParams.testMemoryTreebank();
        testTreebankE.loadPath(testPathE, testFilterE);
    }

    eOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(eOp.tlpParams.sisterSplitters()));

    //////////////////////
    // 
    // Self-Training
    //
    //////////////////////

    MemoryTreebank selfTrainInitTreebank = new MemoryTreebank();
    MemoryTreebank selfTrainFinalTreebank = new MemoryTreebank();

    selfTrainInitTreebank.addAll(trainTreebankF);
    selfTrainFinalTreebank.addAll(trainTreebankF);

    LexicalizedParser fSelfTrainInit = getParserFromTreebank(selfTrainInitTreebank, null,
            secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);

    int z = 0;
    boolean runningAveragesF = Boolean.parseBoolean(fOp.testOptions.evals.getProperty("runningAverages"));
    AbstractEval factLBf = new Evalb("factor LP/LR", runningAveragesF);

    for (Tree goldTree : testTreebankF) {
        List<? extends HasWord> sentence = Sentence.toCoreLabelList(goldTree.yieldWords());
        Tree guessTree = fSelfTrainInit.parseTree(sentence);
        selfTrainFinalTreebank.add(guessTree);
        factLBf.evaluate(guessTree, goldTree);
        System.out.println("Self-training : " + (++z));
    }

    LexicalizedParser fSelfTrainFinal = getParserFromTreebank(selfTrainFinalTreebank, null,
            secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);
    EvaluateTreebank evaluatorH = new EvaluateTreebank(fSelfTrainFinal);
    double scoreF1 = evaluatorH.testOnTreebank(seqTestTreebank);

    System.out.println("------------------------");
    System.out.println("    Self Train Results  ");
    System.out.println("------------------------");
    System.out.println("Test set F1: " + scoreF1);
    System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent());

    //////////////////////
    //////////////////////

    //PARALLEL ALIGNMENT FEATURE CALCULATION, CALCULATION OF 'A' MATRIX

    double[] weights = new double[8];
    double diff;
    weights[0] = 0.01;
    weights[1] = -0.002;
    weights[2] = 0.002;
    weights[3] = 0.002;
    weights[4] = 0.002;
    weights[5] = 0.002;
    weights[6] = -0.002;
    weights[7] = -0.002;

    ArrayList<HashMap<Integer, ArrayList<Integer>>> bitrainAlignments = null;
    ArrayList<HashMap<Integer, ArrayList<Integer>>> testAlignments = null;
    //String alignFile="../../berkeleyaligner/output/test.align";
    try {

        AlignmentProcessor trainAP = new AlignmentProcessor(trainAlignFile);
        bitrainAlignments = trainAP.createAlignments();

        AlignmentProcessor testAP = new AlignmentProcessor(testAlignFile);
        testAlignments = testAP.createAlignments();

    } catch (FileNotFoundException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    int kE = 10;
    int kF = 10;
    int numFeatures = 8;
    int numBigSentences = 0;
    do {
        diff = 0.0;
        Iterator<Tree> eTrees = bitrainTreebankE.iterator();
        Iterator<Tree> fTrees = bitrainTreebankF.iterator();
        Iterator<HashMap<Integer, ArrayList<Integer>>> alignIterator = bitrainAlignments.iterator();
        numBigSentences = 0;

        //features are used in the order they are defined
        double A[][][][] = new double[bitrainTreebankE.size()][numFeatures][kE][kF];
        int ePsGold[] = new int[bitrainTreebankE.size()];
        int fPsGold[] = new int[bitrainTreebankF.size()];

        int i = 0;
        while (eTrees.hasNext() && fTrees.hasNext() && alignIterator.hasNext()) {
            HashMap<Integer, ArrayList<Integer>> alignMap = alignIterator.next();
            Tree fTree = fTrees.next();
            Tree eTree = eTrees.next();

            if (fTree.getLeaves().size() > 70 || eTree.getLeaves().size() > 70) {
                //System.out.println("Too big : " + i);
                numBigSentences++;

                fPsGold[i] = 3;
                ePsGold[i] = 3;

                i++;
                continue;
            }

            List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords());
            List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords());

            LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery();
            LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery();

            lpqE.parse(sentenceE);
            lpqF.parse(sentenceF);

            List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF);
            List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE);

            fPsGold[i] = 3;
            ePsGold[i] = 3;

            int j = 0;
            int k = 0;

            for (ScoredObject<Tree> eScoredObj : kBestE) {
                k = 0;
                for (ScoredObject<Tree> fScoredObj : kBestF) {
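                    // setSpans() attaches start/end leaf indices to every node of both
                    // candidate parses; the Hungarian alignment and the span-based
                    // features accumulated below (spanDiff, insideBoth, ...) are
                    // computed from those spans.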
                    eScoredObj.object().setSpans();
                    fScoredObj.object().setSpans();
                    HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(),
                            fScoredObj.object(), weights, alignMap);

                    //had to reduce likelihood scores by factor of 10 to keep the optimizer working
                    A[i][0][j][k] = eScoredObj.score() / 1000;
                    A[i][1][j][k] = fScoredObj.score() / 1000;

                    for (Map.Entry entry : alignment.entrySet()) {
                        Tree nodeF = (Tree) entry.getKey();
                        Tree nodeE = (Tree) entry.getValue();

                        A[i][2][j][k] += spanDiff(nodeF, nodeE);
                        A[i][3][j][k] += numChildren(nodeF, nodeE);
                        A[i][4][j][k] += insideBoth(nodeF, nodeE, alignMap);
                        A[i][5][j][k] += insideSrcOutsideTgt(nodeF, nodeE, alignMap);
                        A[i][6][j][k] += insideTgtOutsideSrc(nodeF, nodeE, alignMap);
                        A[i][7][j][k] += bias(nodeF, nodeE);
                    }

                    k++;
                }
                j++;
            }
            //System.out.println("Sentence " + i);
            i++;
        }

        ///////////////////////
        //
        //  MALLET optimizer
        //
        ///////////////////////
        System.out.println();
        System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*");
        System.out.println();
        System.out.println("Beginning convex optimization...");
        System.out.println();
        System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*");
        System.out.println();

        OptimizerExample optimizable = new OptimizerExample(weights, A, ePsGold, fPsGold);
        Optimizer optimizer = new LimitedMemoryBFGS(optimizable);

        boolean converged = false;

        try {
            converged = optimizer.optimize();
        } catch (IllegalArgumentException e) {
            // This exception may be thrown if L-BFGS
            //  cannot step in the current direction.
            // This condition does not necessarily mean that
            //  the optimizer has failed, but it doesn't want
            //  to claim to have succeeded...
        } catch (cc.mallet.optimize.OptimizationException e) {
            System.out.println(e.getMessage());
        }

        for (int x = 0; x < weights.length; x++) {
            diff += (optimizable.getParameter(x) - weights[x]) * (optimizable.getParameter(x) - weights[x]);
            weights[x] = optimizable.getParameter(x);
            System.out.print(weights[x] + ", ");
        }
        System.out.println();
        diff /= weights.length;

        System.out.println("Current difference: " + diff);
    } while (diff > 0.0005);

    //GENERATE TRAINING DATA USING KLEIN RERANKER
    //assumes the 'test' data from KleinBilingualParser.java is the unannotated data
    //that the reranker has to annotate.

    factLBf = new Evalb("factor LP/LR", runningAveragesF);
    MemoryTreebank eddyRoseFullTrainTreebank = new MemoryTreebank();
    eddyRoseFullTrainTreebank.addAll(trainTreebankF);
    eddyRoseFullTrainTreebank.addAll(bitrainTreebankF);

    Treebank unannotTreebankF = testTreebankF;
    Treebank annotTreebankE = testTreebankE;
    Iterator<Tree> eTreesBling = annotTreebankE.iterator();
    Iterator<Tree> fTreesBling = unannotTreebankF.iterator();

    int i = 0;
    Iterator<HashMap<Integer, ArrayList<Integer>>> alignIteratorTEST = testAlignments.iterator();
    while (eTreesBling.hasNext() && fTreesBling.hasNext() && alignIteratorTEST.hasNext()) {
        HashMap<Integer, ArrayList<Integer>> alignMap = alignIteratorTEST.next();

        Tree fTree = fTreesBling.next();
        Tree eTree = eTreesBling.next();

        List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords());
        LexicalizedParserQuery lpqF = (LexicalizedParserQuery) fSelfTrainFinal.parserQuery();
        lpqF.parse(sentenceF);
        List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF);

        int j = 0;
        int k = 0;

        double maxScore = -Double.MAX_VALUE;
        Tree bestFtree = null;

        for (ScoredObject<Tree> fScoredObj : kBestF) {
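            // Spans are assigned to the gold English tree and to each candidate
            // French parse so the alignment and the span-based reranking features
            // below can be computed over matching node spans.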
            eTree.setSpans();
            fScoredObj.object().setSpans();
            HashMap<Tree, Tree> alignment = getHungarianAlignment(eTree, fScoredObj.object(), weights,
                    alignMap);

            double currentScore = 0.0;

            for (Map.Entry entry : alignment.entrySet()) {
                Tree nodeF = (Tree) entry.getKey();
                Tree nodeE = (Tree) entry.getValue();

                currentScore += weights[0] * 0.0;//because gold standard tree is assumed to have probability 1
                currentScore += weights[1] * fScoredObj.score() / 1000;
                currentScore += weights[2] * spanDiff(nodeF, nodeE);
                currentScore += weights[3] * numChildren(nodeF, nodeE);
                currentScore += weights[4] * insideBoth(nodeF, nodeE, alignMap);
                currentScore += weights[5] * insideSrcOutsideTgt(nodeF, nodeE, alignMap);
                currentScore += weights[6] * insideTgtOutsideSrc(nodeF, nodeE, alignMap);
                currentScore += weights[7] * bias(nodeF, nodeE);
            }

            if (currentScore > maxScore) {
                maxScore = currentScore;

                bestFtree = fScoredObj.object();
            }

            k++;
        }
        i++;

        System.out.println("Reranker " + i);
        eddyRoseFullTrainTreebank.add(bestFtree);
        factLBf.evaluate(bestFtree, fTree);
    }

    LexicalizedParser lpEddyRose = getParserFromTreebank(eddyRoseFullTrainTreebank, null,
            secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);
    EvaluateTreebank evaluator = new EvaluateTreebank(lpEddyRose);
    double eddyRoseF1 = evaluator.testOnTreebank(seqTestTreebank);

    System.out.println("------------------------");
    System.out.println("    EddyRose Results    ");
    System.out.println("------------------------");
    System.out.println("Test set F1: " + eddyRoseF1);
    System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent());
}

From source file: de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro.java

License: Open Source License

public static void convertConstituents(JCas aJCas, Annotation aDocument, MappingProvider aMappingProvider,
        boolean aInternStrings, TreebankLanguagePack aTreebankLanguagePack) {
    for (CoreMap s : aDocument.get(SentencesAnnotation.class)) {
        Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
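        // setSpans() records each constituent's start/end leaf indices; the conversion
        // below can then relate every tree node back to the sentence's token list.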
        tree.setSpans();
        List<CoreLabel> tokens = s.get(TokensAnnotation.class);
        convertConstituentTreeNode(aJCas, aTreebankLanguagePack, tree, null, aInternStrings, aMappingProvider,
                tokens);
    }

}