List of usage examples for the edu.stanford.nlp.parser.lexparser.Options constructor Options()
public Options()
From source file: LexicalizedParserUnsupervisedDA.java
public static void main(String[] args) { boolean seed = false; boolean saveToSerializedFile = false; boolean saveToTextFile = false; String serializedInputFileOrUrl = null; String textInputFileOrUrl = null; String serializedOutputFileOrUrl = null; String textOutputFileOrUrl = null; String treebankPath = null;//w w w .j av a 2s. c o m Treebank selfTrainTreebank = null; MemoryTreebank finalTrainTreebank = null; Treebank tuneTreebank = null; String testPath = null; String inTestPath = null; String selfTrainPath = null; FileFilter testFilter = null; String tunePath = null; FileFilter tuneFilter = null; FileFilter trainFilter = null; String secondaryTreebankPath = null; double secondaryTreebankWeight = 1.0; FileFilter secondaryTrainFilter = null; // variables needed to process the files to be parsed TokenizerFactory<? extends HasWord> tokenizerFactory = null; String tokenizerOptions = null; String tokenizerFactoryClass = null; String tokenizerMethod = null; boolean tokenized = false; // whether or not the input file has already been tokenized Function<List<HasWord>, List<HasWord>> escaper = null; String tagDelimiter = null; String sentenceDelimiter = null; String elementDelimiter = null; int argIndex = 0; if (args.length < 1) { log.info( "Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*"); return; } Options op = new Options(); List<String> optionArgs = new ArrayList<>(); String encoding = null; // while loop through option arguments while (argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-inTest")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-inTest"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; inTestPath = treebankDescription.first(); } else if (args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, 
argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPath = treebankDescription.first(); } else if (args[argIndex].equalsIgnoreCase("-seed")) { seed = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-seed"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPath = treebankDescription.first(); trainFilter = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-train2")) { // train = true; // cdm july 2005: should require -train for this Triple<String, FileFilter, Double> treebankDescription = ArgUtils .getWeightedTreebankDescription(args, argIndex, "-train2"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; secondaryTreebankPath = treebankDescription.first(); secondaryTrainFilter = treebankDescription.second(); secondaryTreebankWeight = treebankDescription.third(); } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) { try { op.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance(); } catch (ClassNotFoundException e) { log.info("Class not found: " + args[argIndex + 1]); throw new RuntimeException(e); } catch (InstantiationException e) { log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString()); throw new RuntimeException(e); } catch (IllegalAccessException e) { log.info("Illegal access" + e); throw new RuntimeException(e); } argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-encoding")) { // sets encoding for TreebankLangParserParams // redone later to override any serialized parser one read in encoding = args[argIndex + 1]; op.tlpParams.setInputEncoding(encoding); op.tlpParams.setOutputEncoding(encoding); argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-tokenized")) { tokenized = true; argIndex += 1; } else if (args[argIndex].equalsIgnoreCase("-escaper")) { try { escaper = ReflectionLoading.loadByReflection(args[argIndex 
+ 1]); } catch (Exception e) { log.info("Couldn't instantiate escaper " + args[argIndex + 1] + ": " + e); } argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-tokenizerOptions")) { tokenizerOptions = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-tokenizerFactory")) { tokenizerFactoryClass = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-tokenizerMethod")) { tokenizerMethod = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-sentences")) { sentenceDelimiter = args[argIndex + 1]; if (sentenceDelimiter.equalsIgnoreCase("newline")) { sentenceDelimiter = "\n"; } argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-parseInside")) { elementDelimiter = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-tagSeparator")) { tagDelimiter = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile") || args[argIndex].equalsIgnoreCase("-model")) { // load the parser from a binary serialized file // the next argument must be the path to the parser file serializedInputFileOrUrl = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) { // load the parser from declarative text file // the next argument must be the path to the parser file textInputFileOrUrl = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) { saveToSerializedFile = true; if (ArgUtils.numSubArgs(args, argIndex) < 1) { log.info("Missing path: -saveToSerialized filename"); } else { serializedOutputFileOrUrl = args[argIndex + 1]; } argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) { // save the parser to declarative text file saveToTextFile = true; textOutputFileOrUrl = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-saveTrainTrees")) { // save the training trees to a binary file 
op.trainOptions.trainTreeFile = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-selfTrain")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-selfTrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; selfTrainPath = treebankDescription.first(); testFilter = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-tune")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-tune"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; tunePath = treebankDescription.first(); tuneFilter = treebankDescription.second(); } else { int oldIndex = argIndex; argIndex = op.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } } // end while loop through arguments // all other arguments are order dependent and // are processed in order below if (tuneFilter != null || tunePath != null) { if (tunePath == null) { if (treebankPath == null) { throw new RuntimeException("No tune treebank path specified..."); } else { log.info("No tune treebank path specified. 
Using train path: \"" + treebankPath + '\"'); tunePath = treebankPath; } } tuneTreebank = op.tlpParams.testMemoryTreebank(); tuneTreebank.loadPath(tunePath, tuneFilter); } // if (!train && op.testOptions.verbose) { // StringUtils.logInvocationString(log, args); // } LexicalizedParser lp; // always initialized in next if-then-else block if (seed) { //StringUtils.logInvocationString(log, args); // so we train a parser using the treebank GrammarCompactor compactor = null; if (op.trainOptions.compactGrammar() == 3) { compactor = new ExactGrammarCompactor(op, false, false); } Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter); finalTrainTreebank = new MemoryTreebank(); finalTrainTreebank.addAll(trainTreebank); Treebank secondaryTrainTreebank = null; if (secondaryTreebankPath != null) { secondaryTrainTreebank = makeSecondaryTreebank(secondaryTreebankPath, op, secondaryTrainFilter); } List<List<TaggedWord>> extraTaggedWords = null; if (op.trainOptions.taggedFiles != null) { extraTaggedWords = new ArrayList<>(); List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(new Properties(), op.trainOptions.taggedFiles); for (TaggedFileRecord record : fileRecords) { for (List<TaggedWord> sentence : record.reader()) { extraTaggedWords.add(sentence); } } } op.testOptions.quietEvaluation = true; lp = getParserFromTreebank(trainTreebank, secondaryTrainTreebank, secondaryTreebankWeight, compactor, op, tuneTreebank, extraTaggedWords); } else if (textInputFileOrUrl != null) { // so we load the parser from a text grammar file lp = getParserFromTextFile(textInputFileOrUrl, op); } else { // so we load a serialized parser - if (serializedInputFileOrUrl == null && argIndex < args.length) { // the next argument must be the path to the serialized parser serializedInputFileOrUrl = args[argIndex]; argIndex++; } if (serializedInputFileOrUrl == null) { log.info("No grammar specified, exiting..."); return; } String[] extraArgs = new String[optionArgs.size()]; extraArgs 
= optionArgs.toArray(extraArgs); try { lp = loadModel(serializedInputFileOrUrl, op, extraArgs); op.setOptions(extraArgs);//CHANGED } catch (IllegalArgumentException e) { log.info("Error loading parser, exiting..."); throw e; } } // set up tokenizerFactory with options if provided if (tokenizerFactoryClass != null || tokenizerOptions != null) { try { if (tokenizerFactoryClass != null) { Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils .uncheckedCast(Class.forName(tokenizerFactoryClass)); Method factoryMethod; if (tokenizerOptions != null) { factoryMethod = clazz.getMethod( tokenizerMethod != null ? tokenizerMethod : "newWordTokenizerFactory", String.class); tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions)); } else { factoryMethod = clazz .getMethod(tokenizerMethod != null ? tokenizerMethod : "newTokenizerFactory"); tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null)); } } else { // have options but no tokenizer factory. 
use the parser // langpack's factory and set its options tokenizerFactory = op.langpack().getTokenizerFactory(); tokenizerFactory.setOptions(tokenizerOptions); } } catch (IllegalAccessException | InvocationTargetException | ClassNotFoundException | NoSuchMethodException e) { log.info("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions); throw new RuntimeException(e); } } // the following has to go after reading parser to make sure // op and tlpParams are the same for train and test // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER if (encoding != null) { op.tlpParams.setInputEncoding(encoding); op.tlpParams.setOutputEncoding(encoding); } if (testFilter != null || selfTrainPath != null) { if (selfTrainPath == null) { if (treebankPath == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPath + '\"'); selfTrainPath = treebankPath; } } selfTrainTreebank = op.tlpParams.testMemoryTreebank(); selfTrainTreebank.loadPath(selfTrainPath, testFilter); } op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters())); // at this point we should be sure that op.tlpParams is // set appropriately (from command line, or from grammar file), // and will never change again. 
-- Roger // Now what do we do with the parser we've made if (saveToTextFile) { // save the parser to textGrammar format if (textOutputFileOrUrl != null) { lp.saveParserToTextFile(textOutputFileOrUrl); } else { log.info("Usage: must specify a text grammar output path"); } } if (saveToSerializedFile) { if (serializedOutputFileOrUrl != null) { lp.saveParserToSerialized(serializedOutputFileOrUrl); } else if (textOutputFileOrUrl == null && selfTrainTreebank == null) { // no saving/parsing request has been specified log.info("usage: " + "java edu.stanford.nlp.parser.lexparser.LexicalizedParser " + "-seed trainFilesPath [fileRange] -saveToSerializedFile serializedParserFilename"); } } if (op.testOptions.verbose || seed) { // Tell the user a little or a lot about what we have made // get lexicon size separately as it may have its own prints in it.... String lexNumRules = lp.lex != null ? Integer.toString(lp.lex.numRules()) : ""; log.info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings"); log.info("Grammar\t" + lp.stateIndex.size() + '\t' + lp.tagIndex.size() + '\t' + lp.wordIndex.size() + '\t' + (lp.ug != null ? lp.ug.numRules() : "") + '\t' + (lp.bg != null ? lp.bg.numRules() : "") + '\t' + lexNumRules); log.info("ParserPack is " + op.tlpParams.getClass().getName()); log.info("Lexicon is " + lp.lex.getClass().getName()); if (op.testOptions.verbose) { log.info("Tags are: " + lp.tagIndex); // log.info("States are: " + lp.pd.stateIndex); // This is too verbose. It was already printed out by the below printOptions command if the flag -printStates is given (at training time)! 
} printOptions(false, op); } if (selfTrainTreebank != null) { Treebank selfTrainTest = makeTreebank(testPath, op, null); Treebank inTest = makeTreebank(inTestPath, op, null); EvaluateTreebank evaluator = new EvaluateTreebank(lp); double baseLineOutDomain = evaluator.testOnTreebank(selfTrainTest); double baseLineInDomain = evaluator.testOnTreebank(inTest); // annotate unlabeled data System.out.println("Starting selftraining..."); int i = 0; for (Tree goldTree : selfTrainTreebank) { List<? extends HasWord> sentence = Sentence.toCoreLabelList(goldTree.yieldWords()); ; finalTrainTreebank.add(lp.parseTree(sentence)); System.out.println("Self-training : " + (++i)); } System.out.println("Finished creating the final dataset"); GrammarCompactor compactor = null; if (op.trainOptions.compactGrammar() == 3) { compactor = new ExactGrammarCompactor(op, false, false); } op.testOptions.quietEvaluation = true; lp = getParserFromTreebank(finalTrainTreebank, null, 1.0, compactor, op, tuneTreebank, null); evaluator = new EvaluateTreebank(lp); double finalF1 = evaluator.testOnTreebank(selfTrainTest); System.out.println("------------------------"); System.out.println("The results that matter:"); System.out.println("------------------------"); System.out.println("Baseline In Domain F1 : " + baseLineInDomain); System.out.println("Baseline Out Domain F1 : " + baseLineOutDomain); System.out.println("Self-Trained Out Domain F1 : " + finalF1); } else if (argIndex >= args.length) { // no more arguments, so we just parse our own test sentence PrintWriter pwOut = op.tlpParams.pw(); PrintWriter pwErr = op.tlpParams.pw(System.err); ParserQuery pq = lp.parserQuery(); if (pq.parse(op.tlpParams.defaultTestSentence())) { lp.getTreePrint().printTree(pq.getBestParse(), pwOut); } else { pwErr.println("Error. 
Can't parse test sentence: " + op.tlpParams.defaultTestSentence()); } } else { // We parse filenames given by the remaining arguments ParseFiles.parseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter, escaper, tagDelimiter, op, lp.getTreePrint(), lp); } }
From source file: KleinBilingualParser.java
public static void main(String[] args) { boolean trainF = false; boolean trainE = false; boolean bitrainE = false; boolean bitrainF = false; boolean saveToSerializedFile = false; boolean saveToTextFile = false; String serializedInputFileOrUrl = null; String textInputFileOrUrl = null; String serializedOutputFileOrUrl = null; String textOutputFileOrUrl = null; String treebankPathF = null;//from w ww.j av a2s .c o m Treebank testTreebankF = null; Treebank tuneTreebankF = null; String testPathF = null; FileFilter testFilterF = null; String treebankPathE = null; Treebank testTreebankE = null; Treebank tuneTreebankE = null; String testPathE = null; FileFilter testFilterE = null; String tunePath = null; FileFilter tuneFilter = null; FileFilter trainFilterF = null; FileFilter trainFilterE = null; String secondaryTreebankPath = null; double secondaryTreebankWeight = 1.0; FileFilter secondaryTrainFilter = null; String trainAlignFile = null; String testAlignFile = null; String bitrainPathE = null; FileFilter bitrainFilterE = null; String bitrainPathF = null; FileFilter bitrainFilterF = null; Treebank bitrainTreebankF = null; Treebank bitrainTreebankE = null; // variables needed to process the files to be parsed TokenizerFactory<? 
extends HasWord> tokenizerFactory = null; String tokenizerOptions = null; String tokenizerFactoryClass = null; String tokenizerMethod = null; boolean tokenized = false; // whether or not the input file has already been tokenized Function<List<HasWord>, List<HasWord>> escaper = null; String tagDelimiter = null; String sentenceDelimiter = null; String elementDelimiter = null; int argIndex = 0; if (args.length < 1) { log.info( "Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*"); return; } Options fOp = new Options(); Options eOp = new Options(); List<String> optionArgs = new ArrayList<>(); String encodingF = null; // while loop through option arguments while (!args[argIndex].equals("--") && argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { trainF = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPathF = treebankDescription.first(); trainFilterF = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) { bitrainF = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; bitrainPathF = treebankDescription.first(); bitrainFilterF = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) { try { fOp.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance(); } catch (ClassNotFoundException e) { log.info("Class not found: " + args[argIndex + 1]); throw new RuntimeException(e); } catch (InstantiationException e) { log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " 
+ e.toString()); throw new RuntimeException(e); } catch (IllegalAccessException e) { log.info("Illegal access" + e); throw new RuntimeException(e); } argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-encoding")) { // sets encoding for TreebankLangParserParams // redone later to override any serialized parser one read in encodingF = args[argIndex + 1]; fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPathF = treebankDescription.first(); testFilterF = treebankDescription.second(); } else { int oldIndex = argIndex; argIndex = fOp.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } System.out.println(argIndex + " " + args.length); } // end while loop through arguments for french argIndex++;//go to english arguments while (argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { trainE = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPathE = treebankDescription.first(); trainFilterE = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) { bitrainE = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; bitrainPathE = treebankDescription.first(); bitrainFilterE = treebankDescription.second(); } 
else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPathE = treebankDescription.first(); testFilterE = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-trainAlignFile")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-trainAlignFile"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; trainAlignFile = treebankDescription.first(); } else if (args[argIndex].equalsIgnoreCase("-testAlignFile")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testAlignFile"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testAlignFile = treebankDescription.first(); } else { int oldIndex = argIndex; argIndex = eOp.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } } // end while loop through arguments for english // if (!train && fOp.testOptions.verbose) { // StringUtils.logInvocationString(log, args); // } LexicalizedParser lpF; // always initialized in next if-then-else block LexicalizedParser lpE; //TRAIN A PARSER // so we train a parser using the treebank GrammarCompactor compactorF = null; GrammarCompactor compactorE = null; if (fOp.trainOptions.compactGrammar() == 3) { compactorF = new ExactGrammarCompactor(fOp, false, false); } if (eOp.trainOptions.compactGrammar() == 3) { compactorE = new ExactGrammarCompactor(eOp, false, false); } Treebank trainTreebankF = makeTreebank(treebankPathF, fOp, trainFilterF); Treebank trainTreebankE = makeTreebank(treebankPathE, eOp, trainFilterE); fOp.testOptions.quietEvaluation = true; eOp.testOptions.quietEvaluation = true; lpF = getParserFromTreebank(trainTreebankF, null, 
secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); lpE = getParserFromTreebank(trainTreebankE, null, secondaryTreebankWeight, compactorE, eOp, tuneTreebankE, null); // the following has to go after reading parser to make sure // op and tlpParams are the same for train and test // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER if (encodingF != null) { fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); } if (bitrainFilterF != null || bitrainPathF != null) { if (bitrainPathF == null) { //? if (treebankPathF == null) { throw new RuntimeException("No bitrain treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"'); bitrainPathF = treebankPathF; } } bitrainTreebankF = fOp.tlpParams.testMemoryTreebank(); bitrainTreebankF.loadPath(bitrainPathF, bitrainFilterF); } if (bitrainFilterE != null || bitrainPathE != null) { if (bitrainPathE == null) { if (treebankPathE == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathE + '\"'); bitrainPathE = treebankPathE; } } bitrainTreebankE = eOp.tlpParams.testMemoryTreebank(); bitrainTreebankE.loadPath(bitrainPathE, bitrainFilterE); } if (encodingF != null) { fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); } if (testFilterF != null || testPathF != null) { if (testPathF == null) { if (treebankPathF == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. 
Using train path: \"" + treebankPathF + '\"'); testPathF = treebankPathF; } } testTreebankF = fOp.tlpParams.testMemoryTreebank(); testTreebankF.loadPath(testPathF, testFilterF); } fOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(fOp.tlpParams.sisterSplitters())); if (testFilterE != null || testPathE != null) { if (testPathE == null) { if (treebankPathE == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathE + '\"'); testPathE = treebankPathE; } } testTreebankE = eOp.tlpParams.testMemoryTreebank(); testTreebankE.loadPath(testPathE, testFilterE); } eOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(eOp.tlpParams.sisterSplitters())); //PARALLEL ALIGNMENT FEATURE CALCULATION, CALCULATION OF 'A' MATRIX double[] weights = new double[8]; double diff; weights[0] = 0.01; weights[1] = -0.002; weights[2] = 0.002; weights[3] = 0.002; weights[4] = 0.002; weights[5] = 0.002; weights[6] = -0.002; weights[7] = -0.002; ArrayList<HashMap<Integer, ArrayList<Integer>>> bitrainAlignments = null; ArrayList<HashMap<Integer, ArrayList<Integer>>> testAlignments = null; //String alignFile="../../berkeleyaligner/output/test.align"; try { AlignmentProcessor trainAP = new AlignmentProcessor(trainAlignFile); bitrainAlignments = trainAP.createAlignments(); AlignmentProcessor testAP = new AlignmentProcessor(testAlignFile); testAlignments = testAP.createAlignments(); } catch (FileNotFoundException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } int kE = 10; int kF = 10; int numFeatures = 8; int numBigSentences = 0; do { diff = 0.0; Iterator<Tree> eTrees = bitrainTreebankE.iterator(); Iterator<Tree> fTrees = bitrainTreebankF.iterator(); Iterator<HashMap<Integer, ArrayList<Integer>>> alignIterator = bitrainAlignments.iterator(); numBigSentences = 0; //features are used in the order they are defined double 
A[][][][] = new double[bitrainTreebankE.size()][numFeatures][kE][kF]; int ePsGold[] = new int[bitrainTreebankE.size()]; int fPsGold[] = new int[bitrainTreebankF.size()]; int i = 0; while (eTrees.hasNext() && fTrees.hasNext() && alignIterator.hasNext()) { HashMap<Integer, ArrayList<Integer>> alignMap = alignIterator.next(); Tree fTree = fTrees.next(); Tree eTree = eTrees.next(); if (fTree.getLeaves().size() > 70 || fTree.getLeaves().size() > 70) { //System.out.println("Too big : " + i); numBigSentences++; fPsGold[i] = 3; ePsGold[i] = 3; i++; continue; } List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords()); List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords()); LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery(); LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery(); lpqE.parse(sentenceE); lpqF.parse(sentenceF); List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF); List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE); fPsGold[i] = 3; ePsGold[i] = 3; int j = 0; int k = 0; for (ScoredObject<Tree> eScoredObj : kBestE) { k = 0; for (ScoredObject<Tree> fScoredObj : kBestF) { eScoredObj.object().setSpans(); fScoredObj.object().setSpans(); HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(), weights, alignMap); //had to reduce likelihood scores by factor of 10 to keep the optimizer working A[i][0][j][k] = eScoredObj.score() / 1000; A[i][1][j][k] = fScoredObj.score() / 1000; for (Map.Entry entry : alignment.entrySet()) { Tree nodeF = (Tree) entry.getKey(); Tree nodeE = (Tree) entry.getValue(); A[i][2][j][k] += spanDiff(nodeF, nodeE); A[i][3][j][k] += numChildren(nodeF, nodeE); A[i][4][j][k] += insideBoth(nodeF, nodeE, alignMap); A[i][5][j][k] += insideSrcOutsideTgt(nodeF, nodeE, alignMap); A[i][6][j][k] += insideTgtOutsideSrc(nodeF, nodeE, alignMap); A[i][7][j][k] += bias(nodeF, nodeE); } k++; } j++; } 
//System.out.println("Sentence " + i); i++; } /////////////////////// // // MALLET optimizer // /////////////////////// System.out.println(); System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*"); System.out.println(); System.out.println("Beginning convex optimization..."); System.out.println(); System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*"); System.out.println(); OptimizerExample optimizable = new OptimizerExample(weights, A, ePsGold, fPsGold); Optimizer optimizer = new LimitedMemoryBFGS(optimizable); boolean converged = false; try { converged = optimizer.optimize(); } catch (IllegalArgumentException e) { // This exception may be thrown if L-BFGS // cannot step in the current direction. // This condition does not necessarily mean that // the optimizer has failed, but it doesn't want // to claim to have succeeded... } catch (cc.mallet.optimize.OptimizationException e) { System.out.println(e.getMessage()); } for (int x = 0; x < weights.length; x++) { diff += (optimizable.getParameter(x) - weights[x]) * (optimizable.getParameter(x) - weights[x]); weights[x] = optimizable.getParameter(x); System.out.print(weights[x] + ", "); } System.out.println(); diff /= weights.length; System.out.println("Current difference: " + diff); } while (diff > 0.0005); //TESTING BILINGUAL PARSER Treebank bilingTestTreebankF = testTreebankF; Treebank bilingTestTreebankE = testTreebankE; Iterator<Tree> eTreesBling = testTreebankE.iterator(); Iterator<Tree> fTreesBling = testTreebankF.iterator(); boolean runningAveragesF = Boolean.parseBoolean(fOp.testOptions.evals.getProperty("runningAverages")); boolean runningAveragesE = Boolean.parseBoolean(eOp.testOptions.evals.getProperty("runningAverages")); AbstractEval pcfgLBf = new Evalb("pcfg LP/LR", runningAveragesF); AbstractEval factLBf = new Evalb("factor LP/LR", runningAveragesF); AbstractEval pcfgLBe = new Evalb("pcfg LP/LR", runningAveragesE); AbstractEval factLBe = new Evalb("factor LP/LR", runningAveragesE); int i = 0; 
Iterator<HashMap<Integer, ArrayList<Integer>>> alignIteratorTEST = testAlignments.iterator(); while (eTreesBling.hasNext() && fTreesBling.hasNext() && alignIteratorTEST.hasNext()) { HashMap<Integer, ArrayList<Integer>> alignMap = alignIteratorTEST.next(); Tree fTree = fTreesBling.next(); Tree eTree = eTreesBling.next(); List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords()); List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords()); LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery(); LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery(); lpqE.parse(sentenceE); lpqF.parse(sentenceF); List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF); List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE); int j = 0; int k = 0; double maxScore = -Double.MAX_VALUE; Tree bestFtree = null; Tree bestEtree = null; for (ScoredObject<Tree> eScoredObj : kBestE) { k = 0; for (ScoredObject<Tree> fScoredObj : kBestF) { eScoredObj.object().setSpans(); fScoredObj.object().setSpans(); HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(), weights, alignMap); double currentScore = 0.0; for (Map.Entry entry : alignment.entrySet()) { Tree nodeF = (Tree) entry.getKey(); Tree nodeE = (Tree) entry.getValue(); currentScore += weights[0] * eScoredObj.score() / 1000; currentScore += weights[1] * fScoredObj.score() / 1000; currentScore += weights[2] * spanDiff(nodeF, nodeE); currentScore += weights[3] * numChildren(nodeF, nodeE); currentScore += weights[4] * insideBoth(nodeF, nodeE, alignMap); currentScore += weights[5] * insideSrcOutsideTgt(nodeF, nodeE, alignMap); currentScore += weights[6] * insideTgtOutsideSrc(nodeF, nodeE, alignMap); currentScore += weights[7] * bias(nodeF, nodeE); } if (currentScore > maxScore) { maxScore = currentScore; bestFtree = fScoredObj.object(); bestEtree = eScoredObj.object(); } k++; } j++; } i++; pcfgLBe.evaluate(bestEtree, 
eTree); factLBe.evaluate(bestEtree, eTree); pcfgLBf.evaluate(bestFtree, fTree); factLBf.evaluate(bestFtree, fTree); } System.out.println("------------------------"); System.out.println(" English Results "); System.out.println("------------------------"); System.out.println("PCFG labeled f1: " + pcfgLBe.getEvalbF1Percent()); System.out.println("Factored labeled f1: " + factLBe.getEvalbF1Percent()); System.out.println("------------------------"); System.out.println(" French Results "); System.out.println("------------------------"); System.out.println("PCFG labeled f1: " + pcfgLBf.getEvalbF1Percent()); System.out.println("Factored labeled f1: " + factLBf.getEvalbF1Percent()); System.out.println("------------------------"); System.out.println("Number of sentences too big: " + numBigSentences); }
From source file:EddyRoseDomainAdaptation.java
public static void main(String[] args) { boolean trainF = false; boolean trainE = false; boolean bitrainE = false; boolean bitrainF = false; boolean saveToSerializedFile = false; boolean saveToTextFile = false; String serializedInputFileOrUrl = null; String textInputFileOrUrl = null; String serializedOutputFileOrUrl = null; String textOutputFileOrUrl = null; String treebankPathF = null;//from ww w . ja v a2 s .c om Treebank testTreebankF = null; Treebank seqTestTreebank = null; Treebank tuneTreebankF = null; String testPathF = null; FileFilter testFilterF = null; String treebankPathE = null; Treebank testTreebankE = null; Treebank tuneTreebankE = null; String testPathE = null; FileFilter testFilterE = null; String seqTestPath = null; FileFilter seqTestFilter = null; String tunePath = null; FileFilter tuneFilter = null; FileFilter trainFilterF = null; FileFilter trainFilterE = null; String secondaryTreebankPath = null; double secondaryTreebankWeight = 1.0; FileFilter secondaryTrainFilter = null; String trainAlignFile = null; String testAlignFile = null; String bitrainPathE = null; FileFilter bitrainFilterE = null; String bitrainPathF = null; FileFilter bitrainFilterF = null; Treebank bitrainTreebankF = null; Treebank bitrainTreebankE = null; // variables needed to process the files to be parsed TokenizerFactory<? 
extends HasWord> tokenizerFactory = null; String tokenizerOptions = null; String tokenizerFactoryClass = null; String tokenizerMethod = null; boolean tokenized = false; // whether or not the input file has already been tokenized Function<List<HasWord>, List<HasWord>> escaper = null; String tagDelimiter = null; String sentenceDelimiter = null; String elementDelimiter = null; int argIndex = 0; if (args.length < 1) { log.info( "Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*"); return; } Options fOp = new Options(); Options eOp = new Options(); List<String> optionArgs = new ArrayList<>(); String encodingF = null; // while loop through option arguments while (!args[argIndex].equals("--") && argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { trainF = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPathF = treebankDescription.first(); trainFilterF = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) { bitrainF = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; bitrainPathF = treebankDescription.first(); bitrainFilterF = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) { try { fOp.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance(); } catch (ClassNotFoundException e) { log.info("Class not found: " + args[argIndex + 1]); throw new RuntimeException(e); } catch (InstantiationException e) { log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " 
+ e.toString()); throw new RuntimeException(e); } catch (IllegalAccessException e) { log.info("Illegal access" + e); throw new RuntimeException(e); } argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-encoding")) { // sets encoding for TreebankLangParserParams // redone later to override any serialized parser one read in encodingF = args[argIndex + 1]; fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPathF = treebankDescription.first(); testFilterF = treebankDescription.second(); } else { int oldIndex = argIndex; argIndex = fOp.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } System.out.println(argIndex + " " + args.length); } // end while loop through arguments for french argIndex++;//go to english arguments while (argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { trainE = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPathE = treebankDescription.first(); trainFilterE = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) { bitrainE = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; bitrainPathE = treebankDescription.first(); bitrainFilterE = treebankDescription.second(); } 
else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPathE = treebankDescription.first(); testFilterE = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-seqtest")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; seqTestPath = treebankDescription.first(); seqTestFilter = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-trainAlignFile")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-trainAlignFile"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; trainAlignFile = treebankDescription.first(); } else if (args[argIndex].equalsIgnoreCase("-testAlignFile")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testAlignFile"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testAlignFile = treebankDescription.first(); } else { int oldIndex = argIndex; argIndex = eOp.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } } // end while loop through arguments for english // if (!train && fOp.testOptions.verbose) { // StringUtils.logInvocationString(log, args); // } LexicalizedParser lpF; // always initialized in next if-then-else block LexicalizedParser lpE; //TRAIN A PARSER // so we train a parser using the treebank GrammarCompactor compactorF = null; if (fOp.trainOptions.compactGrammar() == 3) { compactorF = new ExactGrammarCompactor(fOp, false, false); } Treebank trainTreebankF = makeTreebank(treebankPathF, fOp, trainFilterF); fOp.testOptions.quietEvaluation = 
true; GrammarCompactor compactorE = null; if (eOp.trainOptions.compactGrammar() == 3) { compactorE = new ExactGrammarCompactor(eOp, false, false); } Treebank trainTreebankE = makeTreebank(treebankPathE, eOp, trainFilterE); eOp.testOptions.quietEvaluation = true; lpF = getParserFromTreebank(trainTreebankF, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); lpE = getParserFromTreebank(trainTreebankE, null, secondaryTreebankWeight, compactorE, eOp, tuneTreebankE, null); // the following has to go after reading parser to make sure // op and tlpParams are the same for train and test // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER if (encodingF != null) { fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); } if (bitrainFilterF != null || bitrainPathF != null) { if (bitrainPathF == null) { //? if (treebankPathF == null) { throw new RuntimeException("No bitrain treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"'); bitrainPathF = treebankPathF; } } bitrainTreebankF = fOp.tlpParams.testMemoryTreebank(); bitrainTreebankF.loadPath(bitrainPathF, bitrainFilterF); } if (bitrainFilterE != null || bitrainPathE != null) { if (bitrainPathE == null) { if (treebankPathE == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. 
Using train path: \"" + treebankPathE + '\"'); bitrainPathE = treebankPathE; } } bitrainTreebankE = eOp.tlpParams.testMemoryTreebank(); bitrainTreebankE.loadPath(bitrainPathE, bitrainFilterE); } if (encodingF != null) { fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); } if (testFilterF != null || testPathF != null) { if (testPathF == null) { if (treebankPathF == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"'); testPathF = treebankPathF; } } testTreebankF = fOp.tlpParams.testMemoryTreebank(); testTreebankF.loadPath(testPathF, testFilterF); } //generate sequioa treebank seqTestTreebank = fOp.tlpParams.testMemoryTreebank(); seqTestTreebank.loadPath(seqTestPath, seqTestFilter); fOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(fOp.tlpParams.sisterSplitters())); if (testFilterE != null || testPathE != null) { if (testPathE == null) { if (treebankPathE == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. 
Using train path: \"" + treebankPathE + '\"'); testPathE = treebankPathE; } } testTreebankE = eOp.tlpParams.testMemoryTreebank(); testTreebankE.loadPath(testPathE, testFilterE); } eOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(eOp.tlpParams.sisterSplitters())); ////////////////////// // // Self-Training // ////////////////////// MemoryTreebank selfTrainInitTreebank = new MemoryTreebank(); MemoryTreebank selfTrainFinalTreebank = new MemoryTreebank(); selfTrainInitTreebank.addAll(trainTreebankF); selfTrainFinalTreebank.addAll(trainTreebankF); LexicalizedParser fSelfTrainInit = getParserFromTreebank(selfTrainInitTreebank, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); int z = 0; boolean runningAveragesF = Boolean.parseBoolean(fOp.testOptions.evals.getProperty("runningAverages")); AbstractEval factLBf = new Evalb("factor LP/LR", runningAveragesF); for (Tree goldTree : testTreebankF) { List<? extends HasWord> sentence = Sentence.toCoreLabelList(goldTree.yieldWords()); Tree guessTree = fSelfTrainInit.parseTree(sentence); selfTrainFinalTreebank.add(guessTree); factLBf.evaluate(guessTree, goldTree); System.out.println("Self-training : " + (++z)); } LexicalizedParser fSelfTrainFinal = getParserFromTreebank(selfTrainFinalTreebank, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); EvaluateTreebank evaluatorH = new EvaluateTreebank(fSelfTrainFinal); double scoreF1 = evaluatorH.testOnTreebank(seqTestTreebank); System.out.println("------------------------"); System.out.println(" Self Train Results "); System.out.println("------------------------"); System.out.println("Test set F1: " + scoreF1); System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent()); ////////////////////// ////////////////////// //PARALLEL ALIGNMENT FEATURE CALCULATION, CALCULATION OF 'A' MATRIX double[] weights = new double[8]; double diff; weights[0] = 0.01; weights[1] = -0.002; weights[2] = 0.002; weights[3] = 0.002; 
weights[4] = 0.002; weights[5] = 0.002; weights[6] = -0.002; weights[7] = -0.002; ArrayList<HashMap<Integer, ArrayList<Integer>>> bitrainAlignments = null; ArrayList<HashMap<Integer, ArrayList<Integer>>> testAlignments = null; //String alignFile="../../berkeleyaligner/output/test.align"; try { AlignmentProcessor trainAP = new AlignmentProcessor(trainAlignFile); bitrainAlignments = trainAP.createAlignments(); AlignmentProcessor testAP = new AlignmentProcessor(testAlignFile); testAlignments = testAP.createAlignments(); } catch (FileNotFoundException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } int kE = 10; int kF = 10; int numFeatures = 8; int numBigSentences = 0; do { diff = 0.0; Iterator<Tree> eTrees = bitrainTreebankE.iterator(); Iterator<Tree> fTrees = bitrainTreebankF.iterator(); Iterator<HashMap<Integer, ArrayList<Integer>>> alignIterator = bitrainAlignments.iterator(); numBigSentences = 0; //features are used in the order they are defined double A[][][][] = new double[bitrainTreebankE.size()][numFeatures][kE][kF]; int ePsGold[] = new int[bitrainTreebankE.size()]; int fPsGold[] = new int[bitrainTreebankF.size()]; int i = 0; while (eTrees.hasNext() && fTrees.hasNext() && alignIterator.hasNext()) { HashMap<Integer, ArrayList<Integer>> alignMap = alignIterator.next(); Tree fTree = fTrees.next(); Tree eTree = eTrees.next(); if (fTree.getLeaves().size() > 70 || fTree.getLeaves().size() > 70) { //System.out.println("Too big : " + i); numBigSentences++; fPsGold[i] = 3; ePsGold[i] = 3; i++; continue; } List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords()); List<? 
extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords()); LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery(); LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery(); lpqE.parse(sentenceE); lpqF.parse(sentenceF); List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF); List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE); fPsGold[i] = 3; ePsGold[i] = 3; int j = 0; int k = 0; for (ScoredObject<Tree> eScoredObj : kBestE) { k = 0; for (ScoredObject<Tree> fScoredObj : kBestF) { eScoredObj.object().setSpans(); fScoredObj.object().setSpans(); HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(), weights, alignMap); //had to reduce likelihood scores by factor of 10 to keep the optimizer working A[i][0][j][k] = eScoredObj.score() / 1000; A[i][1][j][k] = fScoredObj.score() / 1000; for (Map.Entry entry : alignment.entrySet()) { Tree nodeF = (Tree) entry.getKey(); Tree nodeE = (Tree) entry.getValue(); A[i][2][j][k] += spanDiff(nodeF, nodeE); A[i][3][j][k] += numChildren(nodeF, nodeE); A[i][4][j][k] += insideBoth(nodeF, nodeE, alignMap); A[i][5][j][k] += insideSrcOutsideTgt(nodeF, nodeE, alignMap); A[i][6][j][k] += insideTgtOutsideSrc(nodeF, nodeE, alignMap); A[i][7][j][k] += bias(nodeF, nodeE); } k++; } j++; } //System.out.println("Sentence " + i); i++; } /////////////////////// // // MALLET optimizer // /////////////////////// System.out.println(); System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*"); System.out.println(); System.out.println("Beginning convex optimization..."); System.out.println(); System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*"); System.out.println(); OptimizerExample optimizable = new OptimizerExample(weights, A, ePsGold, fPsGold); Optimizer optimizer = new LimitedMemoryBFGS(optimizable); boolean converged = false; try { converged = optimizer.optimize(); } catch (IllegalArgumentException e) { // This exception may be thrown if 
L-BFGS // cannot step in the current direction. // This condition does not necessarily mean that // the optimizer has failed, but it doesn't want // to claim to have succeeded... } catch (cc.mallet.optimize.OptimizationException e) { System.out.println(e.getMessage()); } for (int x = 0; x < weights.length; x++) { diff += (optimizable.getParameter(x) - weights[x]) * (optimizable.getParameter(x) - weights[x]); weights[x] = optimizable.getParameter(x); System.out.print(weights[x] + ", "); } System.out.println(); diff /= weights.length; System.out.println("Current difference: " + diff); } while (diff > 0.0005); //GENERATE TRAINING DATA USING KLEIN RERANKER //assumes the 'test' data from KleinBilingualParser.java is the unannotated data //that the reranker has to annotate. factLBf = new Evalb("factor LP/LR", runningAveragesF); MemoryTreebank eddyRoseFullTrainTreebank = new MemoryTreebank(); eddyRoseFullTrainTreebank.addAll(trainTreebankF); eddyRoseFullTrainTreebank.addAll(bitrainTreebankF); Treebank unannotTreebankF = testTreebankF; Treebank annotTreebankE = testTreebankE; Iterator<Tree> eTreesBling = unannotTreebankF.iterator(); Iterator<Tree> fTreesBling = annotTreebankE.iterator(); int i = 0; Iterator<HashMap<Integer, ArrayList<Integer>>> alignIteratorTEST = testAlignments.iterator(); while (eTreesBling.hasNext() && fTreesBling.hasNext() && alignIteratorTEST.hasNext()) { HashMap<Integer, ArrayList<Integer>> alignMap = alignIteratorTEST.next(); Tree fTree = fTreesBling.next(); Tree eTree = eTreesBling.next(); List<? 
extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords()); LexicalizedParserQuery lpqF = (LexicalizedParserQuery) fSelfTrainFinal.parserQuery(); lpqF.parse(sentenceF); List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF); int j = 0; int k = 0; double maxScore = -Double.MAX_VALUE; Tree bestFtree = null; for (ScoredObject<Tree> fScoredObj : kBestF) { eTree.setSpans(); fScoredObj.object().setSpans(); HashMap<Tree, Tree> alignment = getHungarianAlignment(eTree, fScoredObj.object(), weights, alignMap); double currentScore = 0.0; for (Map.Entry entry : alignment.entrySet()) { Tree nodeF = (Tree) entry.getKey(); Tree nodeE = (Tree) entry.getValue(); currentScore += weights[0] * 0.0;//because gold standard tree is assumed to have probability 1 currentScore += weights[1] * fScoredObj.score() / 1000; currentScore += weights[2] * spanDiff(nodeF, nodeE); currentScore += weights[3] * numChildren(nodeF, nodeE); currentScore += weights[4] * insideBoth(nodeF, nodeE, alignMap); currentScore += weights[5] * insideSrcOutsideTgt(nodeF, nodeE, alignMap); currentScore += weights[6] * insideTgtOutsideSrc(nodeF, nodeE, alignMap); currentScore += weights[7] * bias(nodeF, nodeE); } if (currentScore > maxScore) { maxScore = currentScore; bestFtree = fScoredObj.object(); } k++; } i++; System.out.println("Reranker " + i); eddyRoseFullTrainTreebank.add(bestFtree); factLBf.evaluate(bestFtree, fTree); } LexicalizedParser lpEddyRose = getParserFromTreebank(eddyRoseFullTrainTreebank, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); EvaluateTreebank evaluator = new EvaluateTreebank(lpEddyRose); double eddyRoseF1 = evaluator.testOnTreebank(seqTestTreebank); System.out.println("------------------------"); System.out.println(" EddyRose Results "); System.out.println("------------------------"); System.out.println("Test set F1: " + eddyRoseF1); System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent()); }
From source file:edu.cmu.ark.AnalysisUtilities.java
License:Open Source License
public ParseResult parseSentence(String sentence) { String result = ""; //System.err.println(sentence); //see if a parser socket server is available int port = new Integer(GlobalProperties.getProperties().getProperty("parserServerPort", "5556")); String host = "127.0.0.1"; Socket client;// ww w . j ava 2 s . com PrintWriter pw; BufferedReader br; String line; Tree parse = null; double parseScore = Double.MIN_VALUE; try { client = new Socket(host, port); pw = new PrintWriter(client.getOutputStream()); br = new BufferedReader(new InputStreamReader(client.getInputStream())); pw.println(sentence); pw.flush(); //flush to complete the transmission while ((line = br.readLine()) != null) { //if(!line.matches(".*\\S.*")){ // System.out.println(); //} if (br.ready()) { line = line.replaceAll("\n", ""); line = line.replaceAll("\\s+", " "); result += line + " "; } else { parseScore = new Double(line); } } br.close(); pw.close(); client.close(); if (parse == null) { parse = readTreeFromString("(ROOT (. 
.))"); parseScore = -99999.0; } if (GlobalProperties.getDebug()) System.err.println("result (parse):" + result); parse = readTreeFromString(result); return new ParseResult(true, parse, parseScore); } catch (Exception ex) { if (GlobalProperties.getDebug()) System.err.println("Could not connect to parser server."); //ex.printStackTrace(); } System.err.println("parsing:" + sentence); //if socket server not available, then use a local parser object if (parser == null) { try { Options op = new Options(); String serializedInputFileOrUrl = GlobalProperties.getProperties().getProperty("parserGrammarFile", "config" + File.separator + "englishFactored.ser.gz"); parser = new LexicalizedParser(serializedInputFileOrUrl, op); int maxLength = new Integer(GlobalProperties.getProperties().getProperty("parserMaxLength", "40")) .intValue(); parser.setMaxLength(maxLength); parser.setOptionFlags("-outputFormat", "oneline"); } catch (Exception e) { e.printStackTrace(); } } try { if (parser.parse(sentence)) { parse = parser.getBestParse(); //remove all the parent annotations (this is a hacky way to do it) String ps = parse.toString().replaceAll("\\[[^\\]]+/[^\\]]+\\]", ""); parse = AnalysisUtilities.getInstance().readTreeFromString(ps); parseScore = parser.getPCFGScore(); return new ParseResult(true, parse, parseScore); } } catch (Exception e) { } parse = readTreeFromString("(ROOT (. .))"); parseScore = -99999.0; return new ParseResult(false, parse, parseScore); }
From source file:edu.cmu.cs.in.hoop.visualizers.HoopParseTreeViewer.java
License:Open Source License
/** * /*from w w w .jav a2 s . co m*/ */ public HoopParseTreeViewer() { //this.setLayout(new BoxLayout (this,BoxLayout.Y_AXIS)); Options theseOptions = new Options(); theseOptions.setOptions(new String[] {}); String theseExtraFlags[] = new String[] { "-maxLength", "256", "-retainTmpSubcategories" }; //theLexicalizedParser = LexicalizedParser.loadModel(theLexicalizedParserName, theseOptions, theseExtraFlags); theLexicalizedParser = LexicalizedParser.loadModel(); model = new DefaultListModel(); sentenceList = new JList(model); sentenceList.setOpaque(true); sentenceList.setBackground(new Color(220, 220, 220)); sentenceList.addMouseListener(this); JScrollPane scroller = new JScrollPane(sentenceList); scroller.setVerticalScrollBarPolicy(JScrollPane.VERTICAL_SCROLLBAR_ALWAYS); Box controlBox = Box.createHorizontalBox(); entry = new JTextArea(); entry.setFont(new Font("Dialog", 1, 10)); //entry.setBorder(blackborder); entry.setMinimumSize(new Dimension(100, 25)); entry.setPreferredSize(new Dimension(200, 25)); entry.setMaximumSize(new Dimension(200, 25)); //entry.setText("The quick brown fox jumps over the lazy dog."); parseButton = new JButton(); //parseButton.setIcon(HoopLink.imageIcons [8]); parseButton.setMargin(new Insets(1, 1, 1, 1)); parseButton.setText("Parse"); parseButton.setFont(new Font("Courier", 1, 8)); parseButton.setPreferredSize(new Dimension(20, 16)); parseButton.addActionListener(this); controlBox.add(entry); controlBox.add(parseButton); controlBox.add(Box.createHorizontalGlue()); treePanel = new TreeJPanel(); JPanel content = (JPanel) getContentPane(); content.setLayout(new BoxLayout(content, BoxLayout.Y_AXIS)); //this.add(controlBox); //this.add(scroller); //this.add (treePanel); content.add(controlBox); content.add(scroller); content.add(treePanel); }
From source file:MedArkRef.AnalysisUtilities.java
License:Open Source License
public arkref.parsestuff.AnalysisUtilities.ParseResult parseSentence(String sentence) { String result = ""; //System.err.println(sentence); //see if a parser socket server is available int port = new Integer(GlobalProperties.getProperties().getProperty("parserServerPort", "5556")); String host = "127.0.0.1"; Socket client;//from w ww . j a va 2s . c om PrintWriter pw; BufferedReader br; String line; Tree parse = null; double parseScore = Double.MIN_VALUE; try { client = new Socket(host, port); pw = new PrintWriter(client.getOutputStream()); br = new BufferedReader(new InputStreamReader(client.getInputStream())); pw.println(sentence); pw.flush(); //flush to complete the transmission while ((line = br.readLine()) != null) { //if(!line.matches(".*\\S.*")){ // System.out.println(); //} if (br.ready()) { line = line.replaceAll("\n", ""); line = line.replaceAll("\\s+", " "); result += line + " "; } else { parseScore = new Double(line); } } br.close(); pw.close(); client.close(); if (parse == null) { parse = readTreeFromString("(ROOT (. 
.))"); parseScore = -99999.0; } if (GlobalProperties.getDebug()) System.err.println("result (parse):" + result); parse = readTreeFromString(result); return new arkref.parsestuff.AnalysisUtilities.ParseResult(true, parse, parseScore); } catch (Exception ex) { if (GlobalProperties.getDebug()) System.err.println("Could not connect to parser server."); //ex.printStackTrace(); } System.err.println("parsing:" + sentence); //if socket server not available, then use a local parser object if (parser == null) { try { Options op = new Options(); String serializedInputFileOrUrl = GlobalProperties.getProperties().getProperty("parserGrammarFile", "config" + File.separator + "englishFactored.ser.gz"); parser = new LexicalizedParser(serializedInputFileOrUrl, op); int maxLength = new Integer(GlobalProperties.getProperties().getProperty("parserMaxLength", "40")) .intValue(); parser.setMaxLength(maxLength); parser.setOptionFlags("-outputFormat", "oneline"); } catch (Exception e) { e.printStackTrace(); } } try { if (parser.parse(sentence)) { parse = parser.getBestParse(); //remove all the parent annotations (this is a hacky way to do it) String ps = parse.toString().replaceAll("\\[[^\\]]+/[^\\]]+\\]", ""); parse = AnalysisUtilities.getInstance().readTreeFromString(ps); parseScore = parser.getPCFGScore(); return new arkref.parsestuff.AnalysisUtilities.ParseResult(true, parse, parseScore); } } catch (Exception e) { } parse = readTreeFromString("(ROOT (. .))"); parseScore = -99999.0; return new arkref.parsestuff.AnalysisUtilities.ParseResult(false, parse, parseScore); }
From source file:reck.parser.lexparser.RECKLexicalizedParser.java
License:Open Source License
/**
 * Creates a parser by passing a freshly constructed, default {@link Options} object to
 * the superclass constructor.
 *
 * <p>NOTE(review): the earlier comment described loading a previously serialized grammar
 * named by the property {@code edu.stanford.nlp.SerializedLexicalizedParser} or found at a
 * default file location; nothing in this constructor does that itself, so confirm against
 * the superclass before relying on it.
 */
public RECKLexicalizedParser() {
    super(new Options());
}