Example usage for edu.stanford.nlp.trees MemoryTreebank MemoryTreebank

List of usage examples for edu.stanford.nlp.trees MemoryTreebank MemoryTreebank

Introduction

On this page you can find example usage for edu.stanford.nlp.trees MemoryTreebank MemoryTreebank.

Prototype

public MemoryTreebank() 

Source Link

Document

Create a new tree bank.
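
A minimal, self-contained sketch of using this constructor on its own, before the longer examples below. The path "/path/to/trees" is a placeholder, not a real location:

import edu.stanford.nlp.trees.MemoryTreebank;
import edu.stanford.nlp.trees.Tree;

public class MemoryTreebankDemo {
    public static void main(String[] args) {
        // Create an empty in-memory treebank with the no-argument constructor.
        MemoryTreebank treebank = new MemoryTreebank();

        // Load Penn Treebank-style trees from a file or directory
        // ("/path/to/trees" is a placeholder path).
        treebank.loadPath("/path/to/trees");

        // MemoryTreebank holds every tree in memory, so it can be iterated repeatedly.
        for (Tree tree : treebank) {
            System.out.println(tree);
        }
        System.out.println("Loaded " + treebank.size() + " trees");
    }
}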

Usage

From source file:LexicalizedParserUnsupervisedDA.java

public static void main(String[] args) {
    boolean seed = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    String treebankPath = null;
    Treebank selfTrainTreebank = null;
    MemoryTreebank finalTrainTreebank = null;
    Treebank tuneTreebank = null;
    String testPath = null;
    String inTestPath = null;
    String selfTrainPath = null;
    FileFilter testFilter = null;
    String tunePath = null;
    FileFilter tuneFilter = null;
    FileFilter trainFilter = null;
    String secondaryTreebankPath = null;
    double secondaryTreebankWeight = 1.0;
    FileFilter secondaryTrainFilter = null;

    // variables needed to process the files to be parsed
    TokenizerFactory<? extends HasWord> tokenizerFactory = null;
    String tokenizerOptions = null;
    String tokenizerFactoryClass = null;
    String tokenizerMethod = null;
    boolean tokenized = false; // whether or not the input file has already been tokenized
    Function<List<HasWord>, List<HasWord>> escaper = null;
    String tagDelimiter = null;
    String sentenceDelimiter = null;
    String elementDelimiter = null;
    int argIndex = 0;
    if (args.length < 1) {
        log.info(
                "Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
        return;
    }

    Options op = new Options();
    List<String> optionArgs = new ArrayList<>();
    String encoding = null;
    // while loop through option arguments
    while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
        if (args[argIndex].equalsIgnoreCase("-inTest")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-inTest");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            inTestPath = treebankDescription.first();
        } else if (args[argIndex].equalsIgnoreCase("-test")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testPath = treebankDescription.first();
        } else if (args[argIndex].equalsIgnoreCase("-seed")) {
            seed = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-seed");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebankPath = treebankDescription.first();
            trainFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-train2")) {
            // train = true;     // cdm july 2005: should require -train for this
            Triple<String, FileFilter, Double> treebankDescription = ArgUtils
                    .getWeightedTreebankDescription(args, argIndex, "-train2");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            secondaryTreebankPath = treebankDescription.first();
            secondaryTrainFilter = treebankDescription.second();
            secondaryTreebankWeight = treebankDescription.third();
        } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
            try {
                op.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
            } catch (ClassNotFoundException e) {
                log.info("Class not found: " + args[argIndex + 1]);
                throw new RuntimeException(e);
            } catch (InstantiationException e) {
                log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
                throw new RuntimeException(e);
            } catch (IllegalAccessException e) {
                log.info("Illegal access" + e);
                throw new RuntimeException(e);
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
            // sets encoding for TreebankLangParserParams
            // redone later to override any serialized parser one read in
            encoding = args[argIndex + 1];
            op.tlpParams.setInputEncoding(encoding);
            op.tlpParams.setOutputEncoding(encoding);
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenized")) {
            tokenized = true;
            argIndex += 1;
        } else if (args[argIndex].equalsIgnoreCase("-escaper")) {
            try {
                escaper = ReflectionLoading.loadByReflection(args[argIndex + 1]);
            } catch (Exception e) {
                log.info("Couldn't instantiate escaper " + args[argIndex + 1] + ": " + e);
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenizerOptions")) {
            tokenizerOptions = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenizerFactory")) {
            tokenizerFactoryClass = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tokenizerMethod")) {
            tokenizerMethod = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-sentences")) {
            sentenceDelimiter = args[argIndex + 1];
            if (sentenceDelimiter.equalsIgnoreCase("newline")) {
                sentenceDelimiter = "\n";
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-parseInside")) {
            elementDelimiter = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-tagSeparator")) {
            tagDelimiter = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile")
                || args[argIndex].equalsIgnoreCase("-model")) {
            // load the parser from a binary serialized file
            // the next argument must be the path to the parser file
            serializedInputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
            // load the parser from declarative text file
            // the next argument must be the path to the parser file
            textInputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
            saveToSerializedFile = true;
            if (ArgUtils.numSubArgs(args, argIndex) < 1) {
                log.info("Missing path: -saveToSerialized filename");
            } else {
                serializedOutputFileOrUrl = args[argIndex + 1];
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
            // save the parser to declarative text file
            saveToTextFile = true;
            textOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveTrainTrees")) {
            // save the training trees to a binary file
            op.trainOptions.trainTreeFile = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-selfTrain")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-selfTrain");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            selfTrainPath = treebankDescription.first();
            testFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-tune")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-tune");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            tunePath = treebankDescription.first();
            tuneFilter = treebankDescription.second();
        } else {
            int oldIndex = argIndex;
            argIndex = op.setOptionOrWarn(args, argIndex);
            optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
        }
    } // end while loop through arguments

    // all other arguments are order dependent and
    // are processed in order below

    if (tuneFilter != null || tunePath != null) {
        if (tunePath == null) {
            if (treebankPath == null) {
                throw new RuntimeException("No tune treebank path specified...");
            } else {
                log.info("No tune treebank path specified.  Using train path: \"" + treebankPath + '\"');
                tunePath = treebankPath;
            }
        }
        tuneTreebank = op.tlpParams.testMemoryTreebank();
        tuneTreebank.loadPath(tunePath, tuneFilter);
    }

    //    if (!train && op.testOptions.verbose) {
    //      StringUtils.logInvocationString(log, args);
    //    }
    LexicalizedParser lp; // always initialized in next if-then-else block
    if (seed) {
        //StringUtils.logInvocationString(log, args);

        // so we train a parser using the treebank
        GrammarCompactor compactor = null;
        if (op.trainOptions.compactGrammar() == 3) {
            compactor = new ExactGrammarCompactor(op, false, false);
        }

        Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
        finalTrainTreebank = new MemoryTreebank();
        finalTrainTreebank.addAll(trainTreebank);

        Treebank secondaryTrainTreebank = null;
        if (secondaryTreebankPath != null) {
            secondaryTrainTreebank = makeSecondaryTreebank(secondaryTreebankPath, op, secondaryTrainFilter);
        }

        List<List<TaggedWord>> extraTaggedWords = null;
        if (op.trainOptions.taggedFiles != null) {
            extraTaggedWords = new ArrayList<>();
            List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(new Properties(),
                    op.trainOptions.taggedFiles);
            for (TaggedFileRecord record : fileRecords) {
                for (List<TaggedWord> sentence : record.reader()) {
                    extraTaggedWords.add(sentence);
                }
            }
        }

        op.testOptions.quietEvaluation = true;
        lp = getParserFromTreebank(trainTreebank, secondaryTrainTreebank, secondaryTreebankWeight, compactor,
                op, tuneTreebank, extraTaggedWords);
    } else if (textInputFileOrUrl != null) {
        // so we load the parser from a text grammar file
        lp = getParserFromTextFile(textInputFileOrUrl, op);
    } else {
        // so we load a serialized parser - 
        if (serializedInputFileOrUrl == null && argIndex < args.length) {
            // the next argument must be the path to the serialized parser
            serializedInputFileOrUrl = args[argIndex];
            argIndex++;
        }
        if (serializedInputFileOrUrl == null) {
            log.info("No grammar specified, exiting...");
            return;
        }
        String[] extraArgs = new String[optionArgs.size()];
        extraArgs = optionArgs.toArray(extraArgs);
        try {
            lp = loadModel(serializedInputFileOrUrl, op, extraArgs);
            op.setOptions(extraArgs);//CHANGED
        } catch (IllegalArgumentException e) {
            log.info("Error loading parser, exiting...");
            throw e;
        }
    }

    // set up tokenizerFactory with options if provided
    if (tokenizerFactoryClass != null || tokenizerOptions != null) {
        try {
            if (tokenizerFactoryClass != null) {
                Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils
                        .uncheckedCast(Class.forName(tokenizerFactoryClass));
                Method factoryMethod;
                if (tokenizerOptions != null) {
                    factoryMethod = clazz.getMethod(
                            tokenizerMethod != null ? tokenizerMethod : "newWordTokenizerFactory",
                            String.class);
                    tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions));
                } else {
                    factoryMethod = clazz
                            .getMethod(tokenizerMethod != null ? tokenizerMethod : "newTokenizerFactory");
                    tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null));
                }
            } else {
                // have options but no tokenizer factory.  use the parser
                // langpack's factory and set its options
                tokenizerFactory = op.langpack().getTokenizerFactory();
                tokenizerFactory.setOptions(tokenizerOptions);
            }
        } catch (IllegalAccessException | InvocationTargetException | ClassNotFoundException
                | NoSuchMethodException e) {
            log.info("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options "
                    + tokenizerOptions);
            throw new RuntimeException(e);
        }
    }

    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING
    // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER
    if (encoding != null) {
        op.tlpParams.setInputEncoding(encoding);
        op.tlpParams.setOutputEncoding(encoding);
    }

    if (testFilter != null || selfTrainPath != null) {
        if (selfTrainPath == null) {
            if (treebankPath == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPath + '\"');
                selfTrainPath = treebankPath;
            }
        }
        selfTrainTreebank = op.tlpParams.testMemoryTreebank();
        selfTrainTreebank.loadPath(selfTrainPath, testFilter);
    }

    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));

    // at this point we should be sure that op.tlpParams is
    // set appropriately (from command line, or from grammar file),
    // and will never change again.  -- Roger

    // Now what do we do with the parser we've made
    if (saveToTextFile) {
        // save the parser to textGrammar format
        if (textOutputFileOrUrl != null) {
            lp.saveParserToTextFile(textOutputFileOrUrl);
        } else {
            log.info("Usage: must specify a text grammar output path");
        }
    }
    if (saveToSerializedFile) {
        if (serializedOutputFileOrUrl != null) {
            lp.saveParserToSerialized(serializedOutputFileOrUrl);
        } else if (textOutputFileOrUrl == null && selfTrainTreebank == null) {
            // no saving/parsing request has been specified
            log.info("usage: " + "java edu.stanford.nlp.parser.lexparser.LexicalizedParser "
                    + "-seed trainFilesPath [fileRange] -saveToSerializedFile serializedParserFilename");
        }
    }

    if (op.testOptions.verbose || seed) {
        // Tell the user a little or a lot about what we have made
        // get lexicon size separately as it may have its own prints in it....
        String lexNumRules = lp.lex != null ? Integer.toString(lp.lex.numRules()) : "";
        log.info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
        log.info("Grammar\t" + lp.stateIndex.size() + '\t' + lp.tagIndex.size() + '\t' + lp.wordIndex.size()
                + '\t' + (lp.ug != null ? lp.ug.numRules() : "") + '\t'
                + (lp.bg != null ? lp.bg.numRules() : "") + '\t' + lexNumRules);
        log.info("ParserPack is " + op.tlpParams.getClass().getName());
        log.info("Lexicon is " + lp.lex.getClass().getName());
        if (op.testOptions.verbose) {
            log.info("Tags are: " + lp.tagIndex);
            // log.info("States are: " + lp.pd.stateIndex); // This is too verbose. It was already printed out by the below printOptions command if the flag -printStates is given (at training time)!
        }
        printOptions(false, op);
    }

    if (selfTrainTreebank != null) {
        Treebank selfTrainTest = makeTreebank(testPath, op, null);
        Treebank inTest = makeTreebank(inTestPath, op, null);
        EvaluateTreebank evaluator = new EvaluateTreebank(lp);
        double baseLineOutDomain = evaluator.testOnTreebank(selfTrainTest);
        double baseLineInDomain = evaluator.testOnTreebank(inTest);
        // annotate unlabeled data
        System.out.println("Starting selftraining...");
        int i = 0;
        for (Tree goldTree : selfTrainTreebank) {
            List<? extends HasWord> sentence = Sentence.toCoreLabelList(goldTree.yieldWords());
            finalTrainTreebank.add(lp.parseTree(sentence));
            System.out.println("Self-training : " + (++i));
        }
        System.out.println("Finished creating the final dataset");
        GrammarCompactor compactor = null;
        if (op.trainOptions.compactGrammar() == 3) {
            compactor = new ExactGrammarCompactor(op, false, false);
        }
        op.testOptions.quietEvaluation = true;
        lp = getParserFromTreebank(finalTrainTreebank, null, 1.0, compactor, op, tuneTreebank, null);

        evaluator = new EvaluateTreebank(lp);
        double finalF1 = evaluator.testOnTreebank(selfTrainTest);

        System.out.println("------------------------");
        System.out.println("The results that matter:");
        System.out.println("------------------------");
        System.out.println("Baseline In Domain F1 : " + baseLineInDomain);
        System.out.println("Baseline Out Domain F1 : " + baseLineOutDomain);
        System.out.println("Self-Trained Out Domain F1 : " + finalF1);

    } else if (argIndex >= args.length) {
        // no more arguments, so we just parse our own test sentence
        PrintWriter pwOut = op.tlpParams.pw();
        PrintWriter pwErr = op.tlpParams.pw(System.err);
        ParserQuery pq = lp.parserQuery();
        if (pq.parse(op.tlpParams.defaultTestSentence())) {
            lp.getTreePrint().printTree(pq.getBestParse(), pwOut);
        } else {
            pwErr.println("Error. Can't parse test sentence: " + op.tlpParams.defaultTestSentence());
        }
    } else {
        // We parse filenames given by the remaining arguments
        ParseFiles.parseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter,
                escaper, tagDelimiter, op, lp.getTreePrint(), lp);
    }

}
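
The pattern above can be condensed: gold training trees go into a MemoryTreebank, then the parser's own output on further sentences is appended before retraining. A brief sketch of that idea; the method and parameter names are illustrative placeholders, not identifiers from the file above, and it assumes the usual Stanford imports (edu.stanford.nlp.trees.*, edu.stanford.nlp.ling.*, edu.stanford.nlp.parser.lexparser.LexicalizedParser):

static MemoryTreebank buildSelfTrainingTreebank(Treebank goldTreebank,
                                                Treebank unlabeledTreebank,
                                                LexicalizedParser parser) {
    MemoryTreebank combined = new MemoryTreebank();
    combined.addAll(goldTreebank); // keep the original gold training data

    for (Tree tree : unlabeledTreebank) {
        // Reduce each tree to its word sequence and let the parser reannotate it.
        List<? extends HasWord> sentence = Sentence.toCoreLabelList(tree.yieldWords());
        combined.add(parser.parseTree(sentence));
    }
    return combined; // feed back into training, as getParserFromTreebank(...) does above
}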

From source file:EddyRoseDomainAdaptation.java

public static void main(String[] args) {
    boolean trainF = false;
    boolean trainE = false;
    boolean bitrainE = false;
    boolean bitrainF = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    String treebankPathF = null;
    Treebank testTreebankF = null;
    Treebank seqTestTreebank = null;
    Treebank tuneTreebankF = null;
    String testPathF = null;
    FileFilter testFilterF = null;
    String treebankPathE = null;
    Treebank testTreebankE = null;
    Treebank tuneTreebankE = null;
    String testPathE = null;
    FileFilter testFilterE = null;
    String seqTestPath = null;
    FileFilter seqTestFilter = null;
    String tunePath = null;
    FileFilter tuneFilter = null;
    FileFilter trainFilterF = null;
    FileFilter trainFilterE = null;
    String secondaryTreebankPath = null;
    double secondaryTreebankWeight = 1.0;
    FileFilter secondaryTrainFilter = null;

    String trainAlignFile = null;
    String testAlignFile = null;
    String bitrainPathE = null;
    FileFilter bitrainFilterE = null;
    String bitrainPathF = null;
    FileFilter bitrainFilterF = null;
    Treebank bitrainTreebankF = null;
    Treebank bitrainTreebankE = null;

    // variables needed to process the files to be parsed
    TokenizerFactory<? extends HasWord> tokenizerFactory = null;
    String tokenizerOptions = null;
    String tokenizerFactoryClass = null;
    String tokenizerMethod = null;
    boolean tokenized = false; // whether or not the input file has already been tokenized
    Function<List<HasWord>, List<HasWord>> escaper = null;
    String tagDelimiter = null;
    String sentenceDelimiter = null;
    String elementDelimiter = null;
    int argIndex = 0;
    if (args.length < 1) {
        log.info(
                "Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
        return;
    }

    Options fOp = new Options();
    Options eOp = new Options();
    List<String> optionArgs = new ArrayList<>();
    String encodingF = null;

    // while loop through option arguments
    while (argIndex < args.length && !args[argIndex].equals("--") && args[argIndex].charAt(0) == '-') {
        if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
            trainF = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-train");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebankPathF = treebankDescription.first();
            trainFilterF = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-bitrain")
                || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) {
            bitrainF = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-bitrain");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            bitrainPathF = treebankDescription.first();
            bitrainFilterF = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
            try {
                fOp.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
            } catch (ClassNotFoundException e) {
                log.info("Class not found: " + args[argIndex + 1]);
                throw new RuntimeException(e);
            } catch (InstantiationException e) {
                log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
                throw new RuntimeException(e);
            } catch (IllegalAccessException e) {
                log.info("Illegal access" + e);
                throw new RuntimeException(e);
            }
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
            // sets encoding for TreebankLangParserParams
            // redone later to override any serialized parser one read in
            encodingF = args[argIndex + 1];
            fOp.tlpParams.setInputEncoding(encodingF);
            fOp.tlpParams.setOutputEncoding(encodingF);
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-treebank")
                || args[argIndex].equalsIgnoreCase("-testTreebank")
                || args[argIndex].equalsIgnoreCase("-test")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testPathF = treebankDescription.first();
            testFilterF = treebankDescription.second();
        } else {
            int oldIndex = argIndex;
            argIndex = fOp.setOptionOrWarn(args, argIndex);
            optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
        }

        System.out.println(argIndex + " " + args.length);
    } // end while loop through arguments for french

    argIndex++; // skip past "--" and move on to the English arguments

    while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
        if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
            trainE = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-train");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            treebankPathE = treebankDescription.first();
            trainFilterE = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-bitrain")
                || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) {
            bitrainE = true;
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-bitrain");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            bitrainPathE = treebankDescription.first();
            bitrainFilterE = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-treebank")
                || args[argIndex].equalsIgnoreCase("-testTreebank")
                || args[argIndex].equalsIgnoreCase("-test")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testPathE = treebankDescription.first();
            testFilterE = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-seqtest")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-test");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            seqTestPath = treebankDescription.first();
            seqTestFilter = treebankDescription.second();
        } else if (args[argIndex].equalsIgnoreCase("-trainAlignFile")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-trainAlignFile");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            trainAlignFile = treebankDescription.first();
        } else if (args[argIndex].equalsIgnoreCase("-testAlignFile")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex,
                    "-testAlignFile");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testAlignFile = treebankDescription.first();
        } else {
            int oldIndex = argIndex;
            argIndex = eOp.setOptionOrWarn(args, argIndex);
            optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
        }
    } // end while loop through arguments for english

    //    if (!train && fOp.testOptions.verbose) {
    //      StringUtils.logInvocationString(log, args);
    //    }

    LexicalizedParser lpF; // always initialized in next if-then-else block
    LexicalizedParser lpE;
    //TRAIN A PARSER
    // so we train a parser using the treebank

    GrammarCompactor compactorF = null;
    if (fOp.trainOptions.compactGrammar() == 3) {
        compactorF = new ExactGrammarCompactor(fOp, false, false);
    }
    Treebank trainTreebankF = makeTreebank(treebankPathF, fOp, trainFilterF);
    fOp.testOptions.quietEvaluation = true;

    GrammarCompactor compactorE = null;
    if (eOp.trainOptions.compactGrammar() == 3) {
        compactorE = new ExactGrammarCompactor(eOp, false, false);
    }

    Treebank trainTreebankE = makeTreebank(treebankPathE, eOp, trainFilterE);

    eOp.testOptions.quietEvaluation = true;

    lpF = getParserFromTreebank(trainTreebankF, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF,
            null);
    lpE = getParserFromTreebank(trainTreebankE, null, secondaryTreebankWeight, compactorE, eOp, tuneTreebankE,
            null);

    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING
    // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER
    if (encodingF != null) {
        fOp.tlpParams.setInputEncoding(encodingF);
        fOp.tlpParams.setOutputEncoding(encodingF);
    }

    if (bitrainFilterF != null || bitrainPathF != null) {
        if (bitrainPathF == null) {
            //?
            if (treebankPathF == null) {
                throw new RuntimeException("No bitrain treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathF + '\"');
                bitrainPathF = treebankPathF;
            }
        }
        bitrainTreebankF = fOp.tlpParams.testMemoryTreebank();
        bitrainTreebankF.loadPath(bitrainPathF, bitrainFilterF);
    }

    if (bitrainFilterE != null || bitrainPathE != null) {
        if (bitrainPathE == null) {
            if (treebankPathE == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathE + '\"');
                bitrainPathE = treebankPathE;
            }
        }
        bitrainTreebankE = eOp.tlpParams.testMemoryTreebank();
        bitrainTreebankE.loadPath(bitrainPathE, bitrainFilterE);
    }

    if (encodingF != null) {
        fOp.tlpParams.setInputEncoding(encodingF);
        fOp.tlpParams.setOutputEncoding(encodingF);
    }

    if (testFilterF != null || testPathF != null) {
        if (testPathF == null) {
            if (treebankPathF == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathF + '\"');
                testPathF = treebankPathF;
            }
        }
        testTreebankF = fOp.tlpParams.testMemoryTreebank();
        testTreebankF.loadPath(testPathF, testFilterF);
    }

    // generate Sequoia treebank
    seqTestTreebank = fOp.tlpParams.testMemoryTreebank();
    seqTestTreebank.loadPath(seqTestPath, seqTestFilter);

    fOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(fOp.tlpParams.sisterSplitters()));

    if (testFilterE != null || testPathE != null) {
        if (testPathE == null) {
            if (treebankPathE == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPathE + '\"');
                testPathE = treebankPathE;
            }
        }
        testTreebankE = eOp.tlpParams.testMemoryTreebank();
        testTreebankE.loadPath(testPathE, testFilterE);
    }

    eOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(eOp.tlpParams.sisterSplitters()));

    //////////////////////
    // 
    // Self-Training
    //
    //////////////////////

    MemoryTreebank selfTrainInitTreebank = new MemoryTreebank();
    MemoryTreebank selfTrainFinalTreebank = new MemoryTreebank();

    selfTrainInitTreebank.addAll(trainTreebankF);
    selfTrainFinalTreebank.addAll(trainTreebankF);

    LexicalizedParser fSelfTrainInit = getParserFromTreebank(selfTrainInitTreebank, null,
            secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);

    int z = 0;
    boolean runningAveragesF = Boolean.parseBoolean(fOp.testOptions.evals.getProperty("runningAverages"));
    AbstractEval factLBf = new Evalb("factor LP/LR", runningAveragesF);

    for (Tree goldTree : testTreebankF) {
        List<? extends HasWord> sentence = Sentence.toCoreLabelList(goldTree.yieldWords());
        Tree guessTree = fSelfTrainInit.parseTree(sentence);
        selfTrainFinalTreebank.add(guessTree);
        factLBf.evaluate(guessTree, goldTree);
        System.out.println("Self-training : " + (++z));
    }

    LexicalizedParser fSelfTrainFinal = getParserFromTreebank(selfTrainFinalTreebank, null,
            secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);
    EvaluateTreebank evaluatorH = new EvaluateTreebank(fSelfTrainFinal);
    double scoreF1 = evaluatorH.testOnTreebank(seqTestTreebank);

    System.out.println("------------------------");
    System.out.println("    Self Train Results  ");
    System.out.println("------------------------");
    System.out.println("Test set F1: " + scoreF1);
    System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent());

    //////////////////////
    //////////////////////

    //PARALLEL ALIGNMENT FEATURE CALCULATION, CALCULATION OF 'A' MATRIX

    double[] weights = new double[8];
    double diff;
    weights[0] = 0.01;
    weights[1] = -0.002;
    weights[2] = 0.002;
    weights[3] = 0.002;
    weights[4] = 0.002;
    weights[5] = 0.002;
    weights[6] = -0.002;
    weights[7] = -0.002;

    ArrayList<HashMap<Integer, ArrayList<Integer>>> bitrainAlignments = null;
    ArrayList<HashMap<Integer, ArrayList<Integer>>> testAlignments = null;
    //String alignFile="../../berkeleyaligner/output/test.align";
    try {

        AlignmentProcessor trainAP = new AlignmentProcessor(trainAlignFile);
        bitrainAlignments = trainAP.createAlignments();

        AlignmentProcessor testAP = new AlignmentProcessor(testAlignFile);
        testAlignments = testAP.createAlignments();

    } catch (FileNotFoundException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    int kE = 10;
    int kF = 10;
    int numFeatures = 8;
    int numBigSentences = 0;
    do {
        diff = 0.0;
        Iterator<Tree> eTrees = bitrainTreebankE.iterator();
        Iterator<Tree> fTrees = bitrainTreebankF.iterator();
        Iterator<HashMap<Integer, ArrayList<Integer>>> alignIterator = bitrainAlignments.iterator();
        numBigSentences = 0;

        //features are used in the order they are defined
        double A[][][][] = new double[bitrainTreebankE.size()][numFeatures][kE][kF];
        int ePsGold[] = new int[bitrainTreebankE.size()];
        int fPsGold[] = new int[bitrainTreebankF.size()];

        int i = 0;
        while (eTrees.hasNext() && fTrees.hasNext() && alignIterator.hasNext()) {
            HashMap<Integer, ArrayList<Integer>> alignMap = alignIterator.next();
            Tree fTree = fTrees.next();
            Tree eTree = eTrees.next();

            if (fTree.getLeaves().size() > 70 || eTree.getLeaves().size() > 70) {
                //System.out.println("Too big : " + i);
                numBigSentences++;

                fPsGold[i] = 3;
                ePsGold[i] = 3;

                i++;
                continue;
            }

            List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords());
            List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords());

            LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery();
            LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery();

            lpqE.parse(sentenceE);
            lpqF.parse(sentenceF);

            List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF);
            List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE);

            fPsGold[i] = 3;
            ePsGold[i] = 3;

            int j = 0;
            int k = 0;

            for (ScoredObject<Tree> eScoredObj : kBestE) {
                k = 0;
                for (ScoredObject<Tree> fScoredObj : kBestF) {
                    eScoredObj.object().setSpans();
                    fScoredObj.object().setSpans();
                    HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(),
                            fScoredObj.object(), weights, alignMap);

                    // likelihood scores are scaled down (divided by 1000) to keep the optimizer working
                    A[i][0][j][k] = eScoredObj.score() / 1000;
                    A[i][1][j][k] = fScoredObj.score() / 1000;

                    for (Map.Entry entry : alignment.entrySet()) {
                        Tree nodeF = (Tree) entry.getKey();
                        Tree nodeE = (Tree) entry.getValue();

                        A[i][2][j][k] += spanDiff(nodeF, nodeE);
                        A[i][3][j][k] += numChildren(nodeF, nodeE);
                        A[i][4][j][k] += insideBoth(nodeF, nodeE, alignMap);
                        A[i][5][j][k] += insideSrcOutsideTgt(nodeF, nodeE, alignMap);
                        A[i][6][j][k] += insideTgtOutsideSrc(nodeF, nodeE, alignMap);
                        A[i][7][j][k] += bias(nodeF, nodeE);
                    }

                    k++;
                }
                j++;
            }
            //System.out.println("Sentence " + i);
            i++;
        }

        ///////////////////////
        //
        //  MALLET optimizer
        //
        ///////////////////////
        System.out.println();
        System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*");
        System.out.println();
        System.out.println("Beginning convex optimization...");
        System.out.println();
        System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*");
        System.out.println();

        OptimizerExample optimizable = new OptimizerExample(weights, A, ePsGold, fPsGold);
        Optimizer optimizer = new LimitedMemoryBFGS(optimizable);

        boolean converged = false;

        try {
            converged = optimizer.optimize();
        } catch (IllegalArgumentException e) {
            // This exception may be thrown if L-BFGS
            //  cannot step in the current direction.
            // This condition does not necessarily mean that
            //  the optimizer has failed, but it doesn't want
            //  to claim to have succeeded...
        } catch (cc.mallet.optimize.OptimizationException e) {
            System.out.println(e.getMessage());
        }

        for (int x = 0; x < weights.length; x++) {
            diff += (optimizable.getParameter(x) - weights[x]) * (optimizable.getParameter(x) - weights[x]);
            weights[x] = optimizable.getParameter(x);
            System.out.print(weights[x] + ", ");
        }
        System.out.println();
        diff /= weights.length;

        System.out.println("Current difference: " + diff);
    } while (diff > 0.0005);

    //GENERATE TRAINING DATA USING KLEIN RERANKER
    //assumes the 'test' data from KleinBilingualParser.java is the unannotated data
    //that the reranker has to annotate.

    factLBf = new Evalb("factor LP/LR", runningAveragesF);
    MemoryTreebank eddyRoseFullTrainTreebank = new MemoryTreebank();
    eddyRoseFullTrainTreebank.addAll(trainTreebankF);
    eddyRoseFullTrainTreebank.addAll(bitrainTreebankF);

    Treebank unannotTreebankF = testTreebankF;
    Treebank annotTreebankE = testTreebankE;
    Iterator<Tree> eTreesBling = unannotTreebankF.iterator();
    Iterator<Tree> fTreesBling = annotTreebankE.iterator();

    int i = 0;
    Iterator<HashMap<Integer, ArrayList<Integer>>> alignIteratorTEST = testAlignments.iterator();
    while (eTreesBling.hasNext() && fTreesBling.hasNext() && alignIteratorTEST.hasNext()) {
        HashMap<Integer, ArrayList<Integer>> alignMap = alignIteratorTEST.next();

        Tree fTree = fTreesBling.next();
        Tree eTree = eTreesBling.next();

        List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords());
        LexicalizedParserQuery lpqF = (LexicalizedParserQuery) fSelfTrainFinal.parserQuery();
        lpqF.parse(sentenceF);
        List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF);

        int j = 0;
        int k = 0;

        double maxScore = -Double.MAX_VALUE;
        Tree bestFtree = null;

        for (ScoredObject<Tree> fScoredObj : kBestF) {
            eTree.setSpans();
            fScoredObj.object().setSpans();
            HashMap<Tree, Tree> alignment = getHungarianAlignment(eTree, fScoredObj.object(), weights,
                    alignMap);

            double currentScore = 0.0;

            for (Map.Entry entry : alignment.entrySet()) {
                Tree nodeF = (Tree) entry.getKey();
                Tree nodeE = (Tree) entry.getValue();

                currentScore += weights[0] * 0.0;//because gold standard tree is assumed to have probability 1
                currentScore += weights[1] * fScoredObj.score() / 1000;
                currentScore += weights[2] * spanDiff(nodeF, nodeE);
                currentScore += weights[3] * numChildren(nodeF, nodeE);
                currentScore += weights[4] * insideBoth(nodeF, nodeE, alignMap);
                currentScore += weights[5] * insideSrcOutsideTgt(nodeF, nodeE, alignMap);
                currentScore += weights[6] * insideTgtOutsideSrc(nodeF, nodeE, alignMap);
                currentScore += weights[7] * bias(nodeF, nodeE);
            }

            if (currentScore > maxScore) {
                maxScore = currentScore;

                bestFtree = fScoredObj.object();
            }

            k++;
        }
        i++;

        System.out.println("Reranker " + i);
        eddyRoseFullTrainTreebank.add(bestFtree);
        factLBf.evaluate(bestFtree, fTree);
    }

    LexicalizedParser lpEddyRose = getParserFromTreebank(eddyRoseFullTrainTreebank, null,
            secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);
    EvaluateTreebank evaluator = new EvaluateTreebank(lpEddyRose);
    double eddyRoseF1 = evaluator.testOnTreebank(seqTestTreebank);

    System.out.println("------------------------");
    System.out.println("    EddyRose Results    ");
    System.out.println("------------------------");
    System.out.println("Test set F1: " + eddyRoseF1);
    System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent());
}

From source file:Ceist.CeistView.java

License: Open Source License

/**
 * Begin a search.
 */
private void runSearch() {
    //setTregexState(true); Disable buttons while searching

    Thread searchThread = new Thread() {
        @Override
        public void run() {
            lblSearchStatus.setText("Searching...");

            // Initialise search patterns
            final TregexPattern primary = MatchPattern.getMatchPattern(txtCurrentPattern);

            if (primary == null) {
                lblSearchStatus.setText("Bad Pattern!");
                return;
            }

            SwingUtilities.invokeLater(new Runnable() {
                public void run() {

                    Treebank treebank = new MemoryTreebank();

                    // Add the test data set if selected and loaded
                    if (dataSet.testData.isLoaded() && btnUseTestData.isSelected())
                        treebank.addAll(dataSet.testData.getTreebank());

                    // Add the development data set if selected and loaded
                    if (dataSet.devData.isLoaded() && btnUseDevelopmentData.isSelected())
                        treebank.addAll(dataSet.devData.getTreebank());

                    int treeCount = treebank.size();
                    int count = 0;

                    // Copy the current matches
                    diffTrees.clear();
                    diffTrees.addAll(matchedTrees);

                    if (!chkShowPreview.isSelected()) {
                        matchedTrees.clear();

                        // Clear the table
                        ((DefaultTableModel) tableMatches.getModel()).setRowCount(0);
                    }

                    for (Tree testTree : treebank) {
                        count++;
                        lblSearchStatus.setText(String.format("Searching %d of %d", count, treeCount));
                        TregexMatcher m = primary.matcher(testTree);

                        //Tree lastMatchingRootNode = null;
                        boolean bMatchFound = false;

                        while (m.find() && !bMatchFound) {

                            matchedTrees.add(testTree);

                            if (chkShowTagged.isSelected())
                                ((DefaultTableModel) tableMatches.getModel())
                                        .addRow(getMatcherTableRow(m, testTree, true));
                            else
                                ((DefaultTableModel) tableMatches.getModel())
                                        .addRow(getMatcherTableRow(m, testTree, false));
                            bMatchFound = true;
                        }
                    }

                    if (matchedTrees.size() > 0)
                        lblSearchStatus.setText(String.format("Found %d matches.", matchedTrees.size()));
                    else
                        lblSearchStatus.setText(String.format("No matches found!"));
                }
            });
        }
    };

    searchThread.start();
}
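
The same match loop also works outside the GUI. A minimal sketch, assuming a placeholder tree path and an example Tregex pattern (neither comes from the source above):

import edu.stanford.nlp.trees.MemoryTreebank;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;

public class TregexOverMemoryTreebank {
    public static void main(String[] args) {
        MemoryTreebank treebank = new MemoryTreebank();
        treebank.loadPath("/path/to/trees"); // placeholder path

        // Example pattern: every NP that dominates a PP.
        TregexPattern pattern = TregexPattern.compile("NP << PP");

        int matches = 0;
        for (Tree tree : treebank) {
            TregexMatcher m = pattern.matcher(tree);
            if (m.find()) { // count each tree at most once, like the loop above
                matches++;
            }
        }
        System.out.printf("Found %d matching trees out of %d%n", matches, treebank.size());
    }
}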

From source file:info.mhaas.ma.Evaluation.ContrastiveConjunctions.java

public ContrastiveConjunctions(File predicted, File gold, File parses) {

    // Must be MemoryTreebank because others do not support
    // List interface for get()
    this.predicted = new MemoryTreebank();
    this.predicted.loadPath(predicted, null);
    this.gold = new MemoryTreebank();
    this.gold.loadPath(gold, null);
    this.parses = new MemoryTreebank();
    this.parses.loadPath(parses, null);
    assert this.parses.size() == this.predicted.size();
    assert this.predicted.size() == this.gold.size();
    this.matchedTrees = new HashSet<>();
    this.matches = new ArrayList<>();

}
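
Because MemoryTreebank implements the List interface, the three treebanks loaded above can be walked in parallel by index, which is exactly why the comment insists on MemoryTreebank. A brief sketch of what that enables; the fields mirror the constructor above (assumed to be typed MemoryTreebank), but the method itself is hypothetical:

// Hypothetical helper: compare trees at the same position across the three treebanks.
private void compareByIndex() {
    for (int i = 0; i < this.gold.size(); i++) {
        Tree goldTree = this.gold.get(i);           // get(int) needs the List view,
        Tree predictedTree = this.predicted.get(i); // which MemoryTreebank provides
        Tree parseTree = this.parses.get(i);
        // ... compare goldTree, predictedTree and parseTree here ...
    }
}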