List of usage examples for edu.stanford.nlp.parser.lexparser LexicalizedParserQuery getKBestPCFGParses
public List<ScoredObject<Tree>> getKBestPCFGParses(int k)
From source file:KleinBilingualParser.java
public static void main(String[] args) { boolean trainF = false; boolean trainE = false; boolean bitrainE = false; boolean bitrainF = false; boolean saveToSerializedFile = false; boolean saveToTextFile = false; String serializedInputFileOrUrl = null; String textInputFileOrUrl = null; String serializedOutputFileOrUrl = null; String textOutputFileOrUrl = null; String treebankPathF = null;//from w w w. j ava2s. co m Treebank testTreebankF = null; Treebank tuneTreebankF = null; String testPathF = null; FileFilter testFilterF = null; String treebankPathE = null; Treebank testTreebankE = null; Treebank tuneTreebankE = null; String testPathE = null; FileFilter testFilterE = null; String tunePath = null; FileFilter tuneFilter = null; FileFilter trainFilterF = null; FileFilter trainFilterE = null; String secondaryTreebankPath = null; double secondaryTreebankWeight = 1.0; FileFilter secondaryTrainFilter = null; String trainAlignFile = null; String testAlignFile = null; String bitrainPathE = null; FileFilter bitrainFilterE = null; String bitrainPathF = null; FileFilter bitrainFilterF = null; Treebank bitrainTreebankF = null; Treebank bitrainTreebankE = null; // variables needed to process the files to be parsed TokenizerFactory<? 
extends HasWord> tokenizerFactory = null; String tokenizerOptions = null; String tokenizerFactoryClass = null; String tokenizerMethod = null; boolean tokenized = false; // whether or not the input file has already been tokenized Function<List<HasWord>, List<HasWord>> escaper = null; String tagDelimiter = null; String sentenceDelimiter = null; String elementDelimiter = null; int argIndex = 0; if (args.length < 1) { log.info( "Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*"); return; } Options fOp = new Options(); Options eOp = new Options(); List<String> optionArgs = new ArrayList<>(); String encodingF = null; // while loop through option arguments while (!args[argIndex].equals("--") && argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { trainF = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPathF = treebankDescription.first(); trainFilterF = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) { bitrainF = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; bitrainPathF = treebankDescription.first(); bitrainFilterF = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) { try { fOp.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance(); } catch (ClassNotFoundException e) { log.info("Class not found: " + args[argIndex + 1]); throw new RuntimeException(e); } catch (InstantiationException e) { log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " 
+ e.toString()); throw new RuntimeException(e); } catch (IllegalAccessException e) { log.info("Illegal access" + e); throw new RuntimeException(e); } argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-encoding")) { // sets encoding for TreebankLangParserParams // redone later to override any serialized parser one read in encodingF = args[argIndex + 1]; fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPathF = treebankDescription.first(); testFilterF = treebankDescription.second(); } else { int oldIndex = argIndex; argIndex = fOp.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } System.out.println(argIndex + " " + args.length); } // end while loop through arguments for french argIndex++;//go to english arguments while (argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { trainE = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPathE = treebankDescription.first(); trainFilterE = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) { bitrainE = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; bitrainPathE = treebankDescription.first(); bitrainFilterE = treebankDescription.second(); } 
else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPathE = treebankDescription.first(); testFilterE = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-trainAlignFile")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-trainAlignFile"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; trainAlignFile = treebankDescription.first(); } else if (args[argIndex].equalsIgnoreCase("-testAlignFile")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testAlignFile"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testAlignFile = treebankDescription.first(); } else { int oldIndex = argIndex; argIndex = eOp.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } } // end while loop through arguments for english // if (!train && fOp.testOptions.verbose) { // StringUtils.logInvocationString(log, args); // } LexicalizedParser lpF; // always initialized in next if-then-else block LexicalizedParser lpE; //TRAIN A PARSER // so we train a parser using the treebank GrammarCompactor compactorF = null; GrammarCompactor compactorE = null; if (fOp.trainOptions.compactGrammar() == 3) { compactorF = new ExactGrammarCompactor(fOp, false, false); } if (eOp.trainOptions.compactGrammar() == 3) { compactorE = new ExactGrammarCompactor(eOp, false, false); } Treebank trainTreebankF = makeTreebank(treebankPathF, fOp, trainFilterF); Treebank trainTreebankE = makeTreebank(treebankPathE, eOp, trainFilterE); fOp.testOptions.quietEvaluation = true; eOp.testOptions.quietEvaluation = true; lpF = getParserFromTreebank(trainTreebankF, null, 
secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); lpE = getParserFromTreebank(trainTreebankE, null, secondaryTreebankWeight, compactorE, eOp, tuneTreebankE, null); // the following has to go after reading parser to make sure // op and tlpParams are the same for train and test // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER if (encodingF != null) { fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); } if (bitrainFilterF != null || bitrainPathF != null) { if (bitrainPathF == null) { //? if (treebankPathF == null) { throw new RuntimeException("No bitrain treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"'); bitrainPathF = treebankPathF; } } bitrainTreebankF = fOp.tlpParams.testMemoryTreebank(); bitrainTreebankF.loadPath(bitrainPathF, bitrainFilterF); } if (bitrainFilterE != null || bitrainPathE != null) { if (bitrainPathE == null) { if (treebankPathE == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathE + '\"'); bitrainPathE = treebankPathE; } } bitrainTreebankE = eOp.tlpParams.testMemoryTreebank(); bitrainTreebankE.loadPath(bitrainPathE, bitrainFilterE); } if (encodingF != null) { fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); } if (testFilterF != null || testPathF != null) { if (testPathF == null) { if (treebankPathF == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. 
Using train path: \"" + treebankPathF + '\"'); testPathF = treebankPathF; } } testTreebankF = fOp.tlpParams.testMemoryTreebank(); testTreebankF.loadPath(testPathF, testFilterF); } fOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(fOp.tlpParams.sisterSplitters())); if (testFilterE != null || testPathE != null) { if (testPathE == null) { if (treebankPathE == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathE + '\"'); testPathE = treebankPathE; } } testTreebankE = eOp.tlpParams.testMemoryTreebank(); testTreebankE.loadPath(testPathE, testFilterE); } eOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(eOp.tlpParams.sisterSplitters())); //PARALLEL ALIGNMENT FEATURE CALCULATION, CALCULATION OF 'A' MATRIX double[] weights = new double[8]; double diff; weights[0] = 0.01; weights[1] = -0.002; weights[2] = 0.002; weights[3] = 0.002; weights[4] = 0.002; weights[5] = 0.002; weights[6] = -0.002; weights[7] = -0.002; ArrayList<HashMap<Integer, ArrayList<Integer>>> bitrainAlignments = null; ArrayList<HashMap<Integer, ArrayList<Integer>>> testAlignments = null; //String alignFile="../../berkeleyaligner/output/test.align"; try { AlignmentProcessor trainAP = new AlignmentProcessor(trainAlignFile); bitrainAlignments = trainAP.createAlignments(); AlignmentProcessor testAP = new AlignmentProcessor(testAlignFile); testAlignments = testAP.createAlignments(); } catch (FileNotFoundException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } int kE = 10; int kF = 10; int numFeatures = 8; int numBigSentences = 0; do { diff = 0.0; Iterator<Tree> eTrees = bitrainTreebankE.iterator(); Iterator<Tree> fTrees = bitrainTreebankF.iterator(); Iterator<HashMap<Integer, ArrayList<Integer>>> alignIterator = bitrainAlignments.iterator(); numBigSentences = 0; //features are used in the order they are defined double 
A[][][][] = new double[bitrainTreebankE.size()][numFeatures][kE][kF]; int ePsGold[] = new int[bitrainTreebankE.size()]; int fPsGold[] = new int[bitrainTreebankF.size()]; int i = 0; while (eTrees.hasNext() && fTrees.hasNext() && alignIterator.hasNext()) { HashMap<Integer, ArrayList<Integer>> alignMap = alignIterator.next(); Tree fTree = fTrees.next(); Tree eTree = eTrees.next(); if (fTree.getLeaves().size() > 70 || fTree.getLeaves().size() > 70) { //System.out.println("Too big : " + i); numBigSentences++; fPsGold[i] = 3; ePsGold[i] = 3; i++; continue; } List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords()); List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords()); LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery(); LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery(); lpqE.parse(sentenceE); lpqF.parse(sentenceF); List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF); List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE); fPsGold[i] = 3; ePsGold[i] = 3; int j = 0; int k = 0; for (ScoredObject<Tree> eScoredObj : kBestE) { k = 0; for (ScoredObject<Tree> fScoredObj : kBestF) { eScoredObj.object().setSpans(); fScoredObj.object().setSpans(); HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(), weights, alignMap); //had to reduce likelihood scores by factor of 10 to keep the optimizer working A[i][0][j][k] = eScoredObj.score() / 1000; A[i][1][j][k] = fScoredObj.score() / 1000; for (Map.Entry entry : alignment.entrySet()) { Tree nodeF = (Tree) entry.getKey(); Tree nodeE = (Tree) entry.getValue(); A[i][2][j][k] += spanDiff(nodeF, nodeE); A[i][3][j][k] += numChildren(nodeF, nodeE); A[i][4][j][k] += insideBoth(nodeF, nodeE, alignMap); A[i][5][j][k] += insideSrcOutsideTgt(nodeF, nodeE, alignMap); A[i][6][j][k] += insideTgtOutsideSrc(nodeF, nodeE, alignMap); A[i][7][j][k] += bias(nodeF, nodeE); } k++; } j++; } 
//System.out.println("Sentence " + i); i++; } /////////////////////// // // MALLET optimizer // /////////////////////// System.out.println(); System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*"); System.out.println(); System.out.println("Beginning convex optimization..."); System.out.println(); System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*"); System.out.println(); OptimizerExample optimizable = new OptimizerExample(weights, A, ePsGold, fPsGold); Optimizer optimizer = new LimitedMemoryBFGS(optimizable); boolean converged = false; try { converged = optimizer.optimize(); } catch (IllegalArgumentException e) { // This exception may be thrown if L-BFGS // cannot step in the current direction. // This condition does not necessarily mean that // the optimizer has failed, but it doesn't want // to claim to have succeeded... } catch (cc.mallet.optimize.OptimizationException e) { System.out.println(e.getMessage()); } for (int x = 0; x < weights.length; x++) { diff += (optimizable.getParameter(x) - weights[x]) * (optimizable.getParameter(x) - weights[x]); weights[x] = optimizable.getParameter(x); System.out.print(weights[x] + ", "); } System.out.println(); diff /= weights.length; System.out.println("Current difference: " + diff); } while (diff > 0.0005); //TESTING BILINGUAL PARSER Treebank bilingTestTreebankF = testTreebankF; Treebank bilingTestTreebankE = testTreebankE; Iterator<Tree> eTreesBling = testTreebankE.iterator(); Iterator<Tree> fTreesBling = testTreebankF.iterator(); boolean runningAveragesF = Boolean.parseBoolean(fOp.testOptions.evals.getProperty("runningAverages")); boolean runningAveragesE = Boolean.parseBoolean(eOp.testOptions.evals.getProperty("runningAverages")); AbstractEval pcfgLBf = new Evalb("pcfg LP/LR", runningAveragesF); AbstractEval factLBf = new Evalb("factor LP/LR", runningAveragesF); AbstractEval pcfgLBe = new Evalb("pcfg LP/LR", runningAveragesE); AbstractEval factLBe = new Evalb("factor LP/LR", runningAveragesE); int i = 0; 
Iterator<HashMap<Integer, ArrayList<Integer>>> alignIteratorTEST = testAlignments.iterator(); while (eTreesBling.hasNext() && fTreesBling.hasNext() && alignIteratorTEST.hasNext()) { HashMap<Integer, ArrayList<Integer>> alignMap = alignIteratorTEST.next(); Tree fTree = fTreesBling.next(); Tree eTree = eTreesBling.next(); List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords()); List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords()); LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery(); LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery(); lpqE.parse(sentenceE); lpqF.parse(sentenceF); List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF); List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE); int j = 0; int k = 0; double maxScore = -Double.MAX_VALUE; Tree bestFtree = null; Tree bestEtree = null; for (ScoredObject<Tree> eScoredObj : kBestE) { k = 0; for (ScoredObject<Tree> fScoredObj : kBestF) { eScoredObj.object().setSpans(); fScoredObj.object().setSpans(); HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(), weights, alignMap); double currentScore = 0.0; for (Map.Entry entry : alignment.entrySet()) { Tree nodeF = (Tree) entry.getKey(); Tree nodeE = (Tree) entry.getValue(); currentScore += weights[0] * eScoredObj.score() / 1000; currentScore += weights[1] * fScoredObj.score() / 1000; currentScore += weights[2] * spanDiff(nodeF, nodeE); currentScore += weights[3] * numChildren(nodeF, nodeE); currentScore += weights[4] * insideBoth(nodeF, nodeE, alignMap); currentScore += weights[5] * insideSrcOutsideTgt(nodeF, nodeE, alignMap); currentScore += weights[6] * insideTgtOutsideSrc(nodeF, nodeE, alignMap); currentScore += weights[7] * bias(nodeF, nodeE); } if (currentScore > maxScore) { maxScore = currentScore; bestFtree = fScoredObj.object(); bestEtree = eScoredObj.object(); } k++; } j++; } i++; pcfgLBe.evaluate(bestEtree, 
eTree); factLBe.evaluate(bestEtree, eTree); pcfgLBf.evaluate(bestFtree, fTree); factLBf.evaluate(bestFtree, fTree); } System.out.println("------------------------"); System.out.println(" English Results "); System.out.println("------------------------"); System.out.println("PCFG labeled f1: " + pcfgLBe.getEvalbF1Percent()); System.out.println("Factored labeled f1: " + factLBe.getEvalbF1Percent()); System.out.println("------------------------"); System.out.println(" French Results "); System.out.println("------------------------"); System.out.println("PCFG labeled f1: " + pcfgLBf.getEvalbF1Percent()); System.out.println("Factored labeled f1: " + factLBf.getEvalbF1Percent()); System.out.println("------------------------"); System.out.println("Number of sentences too big: " + numBigSentences); }
From source file:EddyRoseDomainAdaptation.java
public static void main(String[] args) { boolean trainF = false; boolean trainE = false; boolean bitrainE = false; boolean bitrainF = false; boolean saveToSerializedFile = false; boolean saveToTextFile = false; String serializedInputFileOrUrl = null; String textInputFileOrUrl = null; String serializedOutputFileOrUrl = null; String textOutputFileOrUrl = null; String treebankPathF = null;// ww w .jav a2s. co m Treebank testTreebankF = null; Treebank seqTestTreebank = null; Treebank tuneTreebankF = null; String testPathF = null; FileFilter testFilterF = null; String treebankPathE = null; Treebank testTreebankE = null; Treebank tuneTreebankE = null; String testPathE = null; FileFilter testFilterE = null; String seqTestPath = null; FileFilter seqTestFilter = null; String tunePath = null; FileFilter tuneFilter = null; FileFilter trainFilterF = null; FileFilter trainFilterE = null; String secondaryTreebankPath = null; double secondaryTreebankWeight = 1.0; FileFilter secondaryTrainFilter = null; String trainAlignFile = null; String testAlignFile = null; String bitrainPathE = null; FileFilter bitrainFilterE = null; String bitrainPathF = null; FileFilter bitrainFilterF = null; Treebank bitrainTreebankF = null; Treebank bitrainTreebankE = null; // variables needed to process the files to be parsed TokenizerFactory<? 
extends HasWord> tokenizerFactory = null; String tokenizerOptions = null; String tokenizerFactoryClass = null; String tokenizerMethod = null; boolean tokenized = false; // whether or not the input file has already been tokenized Function<List<HasWord>, List<HasWord>> escaper = null; String tagDelimiter = null; String sentenceDelimiter = null; String elementDelimiter = null; int argIndex = 0; if (args.length < 1) { log.info( "Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*"); return; } Options fOp = new Options(); Options eOp = new Options(); List<String> optionArgs = new ArrayList<>(); String encodingF = null; // while loop through option arguments while (!args[argIndex].equals("--") && argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { trainF = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPathF = treebankDescription.first(); trainFilterF = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) { bitrainF = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; bitrainPathF = treebankDescription.first(); bitrainFilterF = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) { try { fOp.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance(); } catch (ClassNotFoundException e) { log.info("Class not found: " + args[argIndex + 1]); throw new RuntimeException(e); } catch (InstantiationException e) { log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " 
+ e.toString()); throw new RuntimeException(e); } catch (IllegalAccessException e) { log.info("Illegal access" + e); throw new RuntimeException(e); } argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-encoding")) { // sets encoding for TreebankLangParserParams // redone later to override any serialized parser one read in encodingF = args[argIndex + 1]; fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPathF = treebankDescription.first(); testFilterF = treebankDescription.second(); } else { int oldIndex = argIndex; argIndex = fOp.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } System.out.println(argIndex + " " + args.length); } // end while loop through arguments for french argIndex++;//go to english arguments while (argIndex < args.length && args[argIndex].charAt(0) == '-') { if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { trainE = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; treebankPathE = treebankDescription.first(); trainFilterE = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) { bitrainE = true; Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; bitrainPathE = treebankDescription.first(); bitrainFilterE = treebankDescription.second(); } 
else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank") || args[argIndex].equalsIgnoreCase("-test")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testPathE = treebankDescription.first(); testFilterE = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-seqtest")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; seqTestPath = treebankDescription.first(); seqTestFilter = treebankDescription.second(); } else if (args[argIndex].equalsIgnoreCase("-trainAlignFile")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-trainAlignFile"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; trainAlignFile = treebankDescription.first(); } else if (args[argIndex].equalsIgnoreCase("-testAlignFile")) { Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testAlignFile"); argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1; testAlignFile = treebankDescription.first(); } else { int oldIndex = argIndex; argIndex = eOp.setOptionOrWarn(args, argIndex); optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex)); } } // end while loop through arguments for english // if (!train && fOp.testOptions.verbose) { // StringUtils.logInvocationString(log, args); // } LexicalizedParser lpF; // always initialized in next if-then-else block LexicalizedParser lpE; //TRAIN A PARSER // so we train a parser using the treebank GrammarCompactor compactorF = null; if (fOp.trainOptions.compactGrammar() == 3) { compactorF = new ExactGrammarCompactor(fOp, false, false); } Treebank trainTreebankF = makeTreebank(treebankPathF, fOp, trainFilterF); fOp.testOptions.quietEvaluation = 
true; GrammarCompactor compactorE = null; if (eOp.trainOptions.compactGrammar() == 3) { compactorE = new ExactGrammarCompactor(eOp, false, false); } Treebank trainTreebankE = makeTreebank(treebankPathE, eOp, trainFilterE); eOp.testOptions.quietEvaluation = true; lpF = getParserFromTreebank(trainTreebankF, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); lpE = getParserFromTreebank(trainTreebankE, null, secondaryTreebankWeight, compactorE, eOp, tuneTreebankE, null); // the following has to go after reading parser to make sure // op and tlpParams are the same for train and test // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER if (encodingF != null) { fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); } if (bitrainFilterF != null || bitrainPathF != null) { if (bitrainPathF == null) { //? if (treebankPathF == null) { throw new RuntimeException("No bitrain treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"'); bitrainPathF = treebankPathF; } } bitrainTreebankF = fOp.tlpParams.testMemoryTreebank(); bitrainTreebankF.loadPath(bitrainPathF, bitrainFilterF); } if (bitrainFilterE != null || bitrainPathE != null) { if (bitrainPathE == null) { if (treebankPathE == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. 
Using train path: \"" + treebankPathE + '\"'); bitrainPathE = treebankPathE; } } bitrainTreebankE = eOp.tlpParams.testMemoryTreebank(); bitrainTreebankE.loadPath(bitrainPathE, bitrainFilterE); } if (encodingF != null) { fOp.tlpParams.setInputEncoding(encodingF); fOp.tlpParams.setOutputEncoding(encodingF); } if (testFilterF != null || testPathF != null) { if (testPathF == null) { if (treebankPathF == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"'); testPathF = treebankPathF; } } testTreebankF = fOp.tlpParams.testMemoryTreebank(); testTreebankF.loadPath(testPathF, testFilterF); } //generate sequioa treebank seqTestTreebank = fOp.tlpParams.testMemoryTreebank(); seqTestTreebank.loadPath(seqTestPath, seqTestFilter); fOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(fOp.tlpParams.sisterSplitters())); if (testFilterE != null || testPathE != null) { if (testPathE == null) { if (treebankPathE == null) { throw new RuntimeException("No test treebank path specified..."); } else { log.info("No test treebank path specified. 
Using train path: \"" + treebankPathE + '\"'); testPathE = treebankPathE; } } testTreebankE = eOp.tlpParams.testMemoryTreebank(); testTreebankE.loadPath(testPathE, testFilterE); } eOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(eOp.tlpParams.sisterSplitters())); ////////////////////// // // Self-Training // ////////////////////// MemoryTreebank selfTrainInitTreebank = new MemoryTreebank(); MemoryTreebank selfTrainFinalTreebank = new MemoryTreebank(); selfTrainInitTreebank.addAll(trainTreebankF); selfTrainFinalTreebank.addAll(trainTreebankF); LexicalizedParser fSelfTrainInit = getParserFromTreebank(selfTrainInitTreebank, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); int z = 0; boolean runningAveragesF = Boolean.parseBoolean(fOp.testOptions.evals.getProperty("runningAverages")); AbstractEval factLBf = new Evalb("factor LP/LR", runningAveragesF); for (Tree goldTree : testTreebankF) { List<? extends HasWord> sentence = Sentence.toCoreLabelList(goldTree.yieldWords()); Tree guessTree = fSelfTrainInit.parseTree(sentence); selfTrainFinalTreebank.add(guessTree); factLBf.evaluate(guessTree, goldTree); System.out.println("Self-training : " + (++z)); } LexicalizedParser fSelfTrainFinal = getParserFromTreebank(selfTrainFinalTreebank, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); EvaluateTreebank evaluatorH = new EvaluateTreebank(fSelfTrainFinal); double scoreF1 = evaluatorH.testOnTreebank(seqTestTreebank); System.out.println("------------------------"); System.out.println(" Self Train Results "); System.out.println("------------------------"); System.out.println("Test set F1: " + scoreF1); System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent()); ////////////////////// ////////////////////// //PARALLEL ALIGNMENT FEATURE CALCULATION, CALCULATION OF 'A' MATRIX double[] weights = new double[8]; double diff; weights[0] = 0.01; weights[1] = -0.002; weights[2] = 0.002; weights[3] = 0.002; 
weights[4] = 0.002; weights[5] = 0.002; weights[6] = -0.002; weights[7] = -0.002; ArrayList<HashMap<Integer, ArrayList<Integer>>> bitrainAlignments = null; ArrayList<HashMap<Integer, ArrayList<Integer>>> testAlignments = null; //String alignFile="../../berkeleyaligner/output/test.align"; try { AlignmentProcessor trainAP = new AlignmentProcessor(trainAlignFile); bitrainAlignments = trainAP.createAlignments(); AlignmentProcessor testAP = new AlignmentProcessor(testAlignFile); testAlignments = testAP.createAlignments(); } catch (FileNotFoundException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } int kE = 10; int kF = 10; int numFeatures = 8; int numBigSentences = 0; do { diff = 0.0; Iterator<Tree> eTrees = bitrainTreebankE.iterator(); Iterator<Tree> fTrees = bitrainTreebankF.iterator(); Iterator<HashMap<Integer, ArrayList<Integer>>> alignIterator = bitrainAlignments.iterator(); numBigSentences = 0; //features are used in the order they are defined double A[][][][] = new double[bitrainTreebankE.size()][numFeatures][kE][kF]; int ePsGold[] = new int[bitrainTreebankE.size()]; int fPsGold[] = new int[bitrainTreebankF.size()]; int i = 0; while (eTrees.hasNext() && fTrees.hasNext() && alignIterator.hasNext()) { HashMap<Integer, ArrayList<Integer>> alignMap = alignIterator.next(); Tree fTree = fTrees.next(); Tree eTree = eTrees.next(); if (fTree.getLeaves().size() > 70 || fTree.getLeaves().size() > 70) { //System.out.println("Too big : " + i); numBigSentences++; fPsGold[i] = 3; ePsGold[i] = 3; i++; continue; } List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords()); List<? 
extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords()); LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery(); LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery(); lpqE.parse(sentenceE); lpqF.parse(sentenceF); List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF); List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE); fPsGold[i] = 3; ePsGold[i] = 3; int j = 0; int k = 0; for (ScoredObject<Tree> eScoredObj : kBestE) { k = 0; for (ScoredObject<Tree> fScoredObj : kBestF) { eScoredObj.object().setSpans(); fScoredObj.object().setSpans(); HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(), weights, alignMap); //had to reduce likelihood scores by factor of 10 to keep the optimizer working A[i][0][j][k] = eScoredObj.score() / 1000; A[i][1][j][k] = fScoredObj.score() / 1000; for (Map.Entry entry : alignment.entrySet()) { Tree nodeF = (Tree) entry.getKey(); Tree nodeE = (Tree) entry.getValue(); A[i][2][j][k] += spanDiff(nodeF, nodeE); A[i][3][j][k] += numChildren(nodeF, nodeE); A[i][4][j][k] += insideBoth(nodeF, nodeE, alignMap); A[i][5][j][k] += insideSrcOutsideTgt(nodeF, nodeE, alignMap); A[i][6][j][k] += insideTgtOutsideSrc(nodeF, nodeE, alignMap); A[i][7][j][k] += bias(nodeF, nodeE); } k++; } j++; } //System.out.println("Sentence " + i); i++; } /////////////////////// // // MALLET optimizer // /////////////////////// System.out.println(); System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*"); System.out.println(); System.out.println("Beginning convex optimization..."); System.out.println(); System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*"); System.out.println(); OptimizerExample optimizable = new OptimizerExample(weights, A, ePsGold, fPsGold); Optimizer optimizer = new LimitedMemoryBFGS(optimizable); boolean converged = false; try { converged = optimizer.optimize(); } catch (IllegalArgumentException e) { // This exception may be thrown if 
L-BFGS // cannot step in the current direction. // This condition does not necessarily mean that // the optimizer has failed, but it doesn't want // to claim to have succeeded... } catch (cc.mallet.optimize.OptimizationException e) { System.out.println(e.getMessage()); } for (int x = 0; x < weights.length; x++) { diff += (optimizable.getParameter(x) - weights[x]) * (optimizable.getParameter(x) - weights[x]); weights[x] = optimizable.getParameter(x); System.out.print(weights[x] + ", "); } System.out.println(); diff /= weights.length; System.out.println("Current difference: " + diff); } while (diff > 0.0005); //GENERATE TRAINING DATA USING KLEIN RERANKER //assumes the 'test' data from KleinBilingualParser.java is the unannotated data //that the reranker has to annotate. factLBf = new Evalb("factor LP/LR", runningAveragesF); MemoryTreebank eddyRoseFullTrainTreebank = new MemoryTreebank(); eddyRoseFullTrainTreebank.addAll(trainTreebankF); eddyRoseFullTrainTreebank.addAll(bitrainTreebankF); Treebank unannotTreebankF = testTreebankF; Treebank annotTreebankE = testTreebankE; Iterator<Tree> eTreesBling = unannotTreebankF.iterator(); Iterator<Tree> fTreesBling = annotTreebankE.iterator(); int i = 0; Iterator<HashMap<Integer, ArrayList<Integer>>> alignIteratorTEST = testAlignments.iterator(); while (eTreesBling.hasNext() && fTreesBling.hasNext() && alignIteratorTEST.hasNext()) { HashMap<Integer, ArrayList<Integer>> alignMap = alignIteratorTEST.next(); Tree fTree = fTreesBling.next(); Tree eTree = eTreesBling.next(); List<? 
extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords()); LexicalizedParserQuery lpqF = (LexicalizedParserQuery) fSelfTrainFinal.parserQuery(); lpqF.parse(sentenceF); List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF); int j = 0; int k = 0; double maxScore = -Double.MAX_VALUE; Tree bestFtree = null; for (ScoredObject<Tree> fScoredObj : kBestF) { eTree.setSpans(); fScoredObj.object().setSpans(); HashMap<Tree, Tree> alignment = getHungarianAlignment(eTree, fScoredObj.object(), weights, alignMap); double currentScore = 0.0; for (Map.Entry entry : alignment.entrySet()) { Tree nodeF = (Tree) entry.getKey(); Tree nodeE = (Tree) entry.getValue(); currentScore += weights[0] * 0.0;//because gold standard tree is assumed to have probability 1 currentScore += weights[1] * fScoredObj.score() / 1000; currentScore += weights[2] * spanDiff(nodeF, nodeE); currentScore += weights[3] * numChildren(nodeF, nodeE); currentScore += weights[4] * insideBoth(nodeF, nodeE, alignMap); currentScore += weights[5] * insideSrcOutsideTgt(nodeF, nodeE, alignMap); currentScore += weights[6] * insideTgtOutsideSrc(nodeF, nodeE, alignMap); currentScore += weights[7] * bias(nodeF, nodeE); } if (currentScore > maxScore) { maxScore = currentScore; bestFtree = fScoredObj.object(); } k++; } i++; System.out.println("Reranker " + i); eddyRoseFullTrainTreebank.add(bestFtree); factLBf.evaluate(bestFtree, fTree); } LexicalizedParser lpEddyRose = getParserFromTreebank(eddyRoseFullTrainTreebank, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null); EvaluateTreebank evaluator = new EvaluateTreebank(lpEddyRose); double eddyRoseF1 = evaluator.testOnTreebank(seqTestTreebank); System.out.println("------------------------"); System.out.println(" EddyRose Results "); System.out.println("------------------------"); System.out.println("Test set F1: " + eddyRoseF1); System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent()); }
From source file:com.github.kutschkem.Qgen.QuestionRankerByParseProbability.java
License:Open Source License
/**
 * Ranks the given questions by parser score, highest score first.
 *
 * <p>For each question, the first sentence of {@code q.Question} is tokenized,
 * parsed, and scored as the average over the 3 best PCFG parses (via the
 * {@code average} helper). The input list is not modified; a newly sorted copy
 * is returned.
 *
 * @param questions candidate questions to rank
 * @return a new list containing the same questions, sorted by descending score
 */
public List<Question> rank(List<Question> questions) {
    System.out.println("afterPipeline:" + questions.size());
    final Map<Question, Double> scores = new HashMap<Question, Double>();
    for (Question q : questions) {
        // Take only the first sentence of the question text.
        // NOTE(review): iterator().next() throws NoSuchElementException if the
        // preprocessor yields no sentences (e.g. blank text) — confirm inputs
        // are guaranteed non-empty upstream.
        List<HasWord> tokens = new DocumentPreprocessor(new StringReader(q.Question)).iterator().next();
        LexicalizedParserQuery query = parser.parserQuery();
        // NOTE(review): parse() returns a success flag that is ignored here;
        // a failed parse may make getKBestPCFGParses misbehave — verify.
        query.parse(tokens);
        scores.put(q, average(query.getKBestPCFGParses(3)));
    }
    List<Question> result = new ArrayList<Question>(questions);
    // Sort descending by score. Compare o2 against o1 instead of negating
    // compareTo's result: negation is fragile because compare() may return any
    // negative/positive int, and -Integer.MIN_VALUE overflows.
    Collections.sort(result, new Comparator<Question>() {
        @Override
        public int compare(Question o1, Question o2) {
            return scores.get(o2).compareTo(scores.get(o1));
        }
    });
    for (Question q : result) {
        System.out.println(q.Question + " " + scores.get(q));
    }
    return result;
}