List of usage examples for edu.stanford.nlp.trees Tree yieldWords
public ArrayList<Word> yieldWords()
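Before the longer examples below, a minimal sketch of the call in isolation. The model path and the sample sentence are illustrative, not taken from any of the examples:

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;
import java.util.List;

public class YieldWordsDemo {
  public static void main(String[] args) {
    // assumed model path; any serialized grammar on the classpath or disk works
    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    Tree tree = lp.parse("The quick brown fox jumps over the lazy dog .");
    // yieldWords() flattens the tree back into its leaf words, in order
    List<Word> words = tree.yieldWords();
    for (Word w : words) {
      System.out.print(w.word() + " ");
    }
  }
}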
From source file:LexicalizedParserUnsupervisedDA.java
public static void main(String[] args) {
  boolean seed = false;
  boolean saveToSerializedFile = false;
  boolean saveToTextFile = false;
  String serializedInputFileOrUrl = null;
  String textInputFileOrUrl = null;
  String serializedOutputFileOrUrl = null;
  String textOutputFileOrUrl = null;
  String treebankPath = null;
  Treebank selfTrainTreebank = null;
  MemoryTreebank finalTrainTreebank = null;
  Treebank tuneTreebank = null;
  String testPath = null;
  String inTestPath = null;
  String selfTrainPath = null;
  FileFilter testFilter = null;
  String tunePath = null;
  FileFilter tuneFilter = null;
  FileFilter trainFilter = null;
  String secondaryTreebankPath = null;
  double secondaryTreebankWeight = 1.0;
  FileFilter secondaryTrainFilter = null;
  // variables needed to process the files to be parsed
  TokenizerFactory<? extends HasWord> tokenizerFactory = null;
  String tokenizerOptions = null;
  String tokenizerFactoryClass = null;
  String tokenizerMethod = null;
  boolean tokenized = false; // whether or not the input file has already been tokenized
  Function<List<HasWord>, List<HasWord>> escaper = null;
  String tagDelimiter = null;
  String sentenceDelimiter = null;
  String elementDelimiter = null;
  int argIndex = 0;
  if (args.length < 1) {
    log.info("Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
    return;
  }
  Options op = new Options();
  List<String> optionArgs = new ArrayList<>();
  String encoding = null;
  // while loop through option arguments
  while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
    if (args[argIndex].equalsIgnoreCase("-inTest")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-inTest");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      inTestPath = treebankDescription.first();
    } else if (args[argIndex].equalsIgnoreCase("-test")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      testPath = treebankDescription.first();
    } else if (args[argIndex].equalsIgnoreCase("-seed")) {
      seed = true;
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-seed");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      treebankPath = treebankDescription.first();
      trainFilter = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-train2")) {
      // train = true;  // cdm july 2005: should require -train for this
      Triple<String, FileFilter, Double> treebankDescription = ArgUtils.getWeightedTreebankDescription(args, argIndex, "-train2");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      secondaryTreebankPath = treebankDescription.first();
      secondaryTrainFilter = treebankDescription.second();
      secondaryTreebankWeight = treebankDescription.third();
    } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
      try {
        op.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
      } catch (ClassNotFoundException e) {
        log.info("Class not found: " + args[argIndex + 1]);
        throw new RuntimeException(e);
      } catch (InstantiationException e) {
        log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
        throw new RuntimeException(e);
      } catch (IllegalAccessException e) {
        log.info("Illegal access: " + e);
        throw new RuntimeException(e);
      }
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
      // sets encoding for TreebankLangParserParams
      // redone later to override any serialized parser one read in
      encoding = args[argIndex + 1];
      op.tlpParams.setInputEncoding(encoding);
      op.tlpParams.setOutputEncoding(encoding);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokenized")) {
      tokenized = true;
      argIndex += 1;
    } else if (args[argIndex].equalsIgnoreCase("-escaper")) {
      try {
        escaper = ReflectionLoading.loadByReflection(args[argIndex + 1]);
      } catch (Exception e) {
        log.info("Couldn't instantiate escaper " + args[argIndex + 1] + ": " + e);
      }
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokenizerOptions")) {
      tokenizerOptions = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokenizerFactory")) {
      tokenizerFactoryClass = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokenizerMethod")) {
      tokenizerMethod = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-sentences")) {
      sentenceDelimiter = args[argIndex + 1];
      if (sentenceDelimiter.equalsIgnoreCase("newline")) {
        sentenceDelimiter = "\n";
      }
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-parseInside")) {
      elementDelimiter = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tagSeparator")) {
      tagDelimiter = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile") || args[argIndex].equalsIgnoreCase("-model")) {
      // load the parser from a binary serialized file
      // the next argument must be the path to the parser file
      serializedInputFileOrUrl = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
      // load the parser from declarative text file
      // the next argument must be the path to the parser file
      textInputFileOrUrl = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
      saveToSerializedFile = true;
      if (ArgUtils.numSubArgs(args, argIndex) < 1) {
        log.info("Missing path: -saveToSerialized filename");
      } else {
        serializedOutputFileOrUrl = args[argIndex + 1];
      }
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
      // save the parser to declarative text file
      saveToTextFile = true;
      textOutputFileOrUrl = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-saveTrainTrees")) {
      // save the training trees to a binary file
      op.trainOptions.trainTreeFile = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-selfTrain")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-selfTrain");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      selfTrainPath = treebankDescription.first();
      testFilter = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-tune")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-tune");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      tunePath = treebankDescription.first();
      tuneFilter = treebankDescription.second();
    } else {
      int oldIndex = argIndex;
      argIndex = op.setOptionOrWarn(args, argIndex);
      optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
    }
  } // end while loop through arguments

  // all other arguments are order dependent and are processed in order below
  if (tuneFilter != null || tunePath != null) {
    if (tunePath == null) {
      if (treebankPath == null) {
        throw new RuntimeException("No tune treebank path specified...");
      } else {
        log.info("No tune treebank path specified. Using train path: \"" + treebankPath + '\"');
        tunePath = treebankPath;
      }
    }
    tuneTreebank = op.tlpParams.testMemoryTreebank();
    tuneTreebank.loadPath(tunePath, tuneFilter);
  }

  // if (!train && op.testOptions.verbose) {
  //   StringUtils.logInvocationString(log, args);
  // }
  LexicalizedParser lp; // always initialized in next if-then-else block
  if (seed) {
    //StringUtils.logInvocationString(log, args);
    // so we train a parser using the treebank
    GrammarCompactor compactor = null;
    if (op.trainOptions.compactGrammar() == 3) {
      compactor = new ExactGrammarCompactor(op, false, false);
    }
    Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
    finalTrainTreebank = new MemoryTreebank();
    finalTrainTreebank.addAll(trainTreebank);
    Treebank secondaryTrainTreebank = null;
    if (secondaryTreebankPath != null) {
      secondaryTrainTreebank = makeSecondaryTreebank(secondaryTreebankPath, op, secondaryTrainFilter);
    }
    List<List<TaggedWord>> extraTaggedWords = null;
    if (op.trainOptions.taggedFiles != null) {
      extraTaggedWords = new ArrayList<>();
      List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(new Properties(), op.trainOptions.taggedFiles);
      for (TaggedFileRecord record : fileRecords) {
        for (List<TaggedWord> sentence : record.reader()) {
          extraTaggedWords.add(sentence);
        }
      }
    }
    op.testOptions.quietEvaluation = true;
    lp = getParserFromTreebank(trainTreebank, secondaryTrainTreebank, secondaryTreebankWeight, compactor, op, tuneTreebank, extraTaggedWords);
  } else if (textInputFileOrUrl != null) {
    // so we load the parser from a text grammar file
    lp = getParserFromTextFile(textInputFileOrUrl, op);
  } else {
    // so we load a serialized parser
    if (serializedInputFileOrUrl == null && argIndex < args.length) {
      // the next argument must be the path to the serialized parser
      serializedInputFileOrUrl = args[argIndex];
      argIndex++;
    }
    if (serializedInputFileOrUrl == null) {
      log.info("No grammar specified, exiting...");
      return;
    }
    String[] extraArgs = new String[optionArgs.size()];
    extraArgs = optionArgs.toArray(extraArgs);
    try {
      lp = loadModel(serializedInputFileOrUrl, op, extraArgs);
      op.setOptions(extraArgs); //CHANGED
    } catch (IllegalArgumentException e) {
      log.info("Error loading parser, exiting...");
      throw e;
    }
  }

  // set up tokenizerFactory with options if provided
  if (tokenizerFactoryClass != null || tokenizerOptions != null) {
    try {
      if (tokenizerFactoryClass != null) {
        Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils.uncheckedCast(Class.forName(tokenizerFactoryClass));
        Method factoryMethod;
        if (tokenizerOptions != null) {
          factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newWordTokenizerFactory", String.class);
          tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions));
        } else {
          factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newTokenizerFactory");
          tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null));
        }
      } else {
        // have options but no tokenizer factory; use the parser
        // langpack's factory and set its options
        tokenizerFactory = op.langpack().getTokenizerFactory();
        tokenizerFactory.setOptions(tokenizerOptions);
      }
    } catch (IllegalAccessException | InvocationTargetException | ClassNotFoundException | NoSuchMethodException e) {
      log.info("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
      throw new RuntimeException(e);
    }
  }

  // the following has to go after reading parser to make sure
  // op and tlpParams are the same for train and test
  // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING
  // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER
  if (encoding != null) {
    op.tlpParams.setInputEncoding(encoding);
    op.tlpParams.setOutputEncoding(encoding);
  }

  if (testFilter != null || selfTrainPath != null) {
    if (selfTrainPath == null) {
      if (treebankPath == null) {
        throw new RuntimeException("No test treebank path specified...");
      } else {
        log.info("No test treebank path specified. Using train path: \"" + treebankPath + '\"');
        selfTrainPath = treebankPath;
      }
    }
    selfTrainTreebank = op.tlpParams.testMemoryTreebank();
    selfTrainTreebank.loadPath(selfTrainPath, testFilter);
  }

  op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));

  // at this point we should be sure that op.tlpParams is
  // set appropriately (from command line, or from grammar file),
  // and will never change again. -- Roger

  // Now what do we do with the parser we've made
  if (saveToTextFile) {
    // save the parser to textGrammar format
    if (textOutputFileOrUrl != null) {
      lp.saveParserToTextFile(textOutputFileOrUrl);
    } else {
      log.info("Usage: must specify a text grammar output path");
    }
  }
  if (saveToSerializedFile) {
    if (serializedOutputFileOrUrl != null) {
      lp.saveParserToSerialized(serializedOutputFileOrUrl);
    } else if (textOutputFileOrUrl == null && selfTrainTreebank == null) {
      // no saving/parsing request has been specified
      log.info("usage: java edu.stanford.nlp.parser.lexparser.LexicalizedParser -seed trainFilesPath [fileRange] -saveToSerializedFile serializedParserFilename");
    }
  }

  if (op.testOptions.verbose || seed) {
    // Tell the user a little or a lot about what we have made
    // get lexicon size separately as it may have its own prints in it....
    String lexNumRules = lp.lex != null ? Integer.toString(lp.lex.numRules()) : "";
    log.info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
    log.info("Grammar\t" + lp.stateIndex.size() + '\t' + lp.tagIndex.size() + '\t' + lp.wordIndex.size() + '\t'
        + (lp.ug != null ? lp.ug.numRules() : "") + '\t' + (lp.bg != null ? lp.bg.numRules() : "") + '\t' + lexNumRules);
    log.info("ParserPack is " + op.tlpParams.getClass().getName());
    log.info("Lexicon is " + lp.lex.getClass().getName());
    if (op.testOptions.verbose) {
      log.info("Tags are: " + lp.tagIndex);
      // log.info("States are: " + lp.pd.stateIndex); // This is too verbose. It was already printed out
      // by the below printOptions command if the flag -printStates is given (at training time)!
    }
    printOptions(false, op);
  }

  if (selfTrainTreebank != null) {
    Treebank selfTrainTest = makeTreebank(testPath, op, null);
    Treebank inTest = makeTreebank(inTestPath, op, null);
    EvaluateTreebank evaluator = new EvaluateTreebank(lp);
    double baseLineOutDomain = evaluator.testOnTreebank(selfTrainTest);
    double baseLineInDomain = evaluator.testOnTreebank(inTest);
    // annotate unlabeled data
    System.out.println("Starting selftraining...");
    int i = 0;
    for (Tree goldTree : selfTrainTreebank) {
      // recover the sentence from the gold tree and let the parser label it itself
      List<? extends HasWord> sentence = Sentence.toCoreLabelList(goldTree.yieldWords());
      finalTrainTreebank.add(lp.parseTree(sentence));
      System.out.println("Self-training : " + (++i));
    }
    System.out.println("Finished creating the final dataset");
    GrammarCompactor compactor = null;
    if (op.trainOptions.compactGrammar() == 3) {
      compactor = new ExactGrammarCompactor(op, false, false);
    }
    op.testOptions.quietEvaluation = true;
    lp = getParserFromTreebank(finalTrainTreebank, null, 1.0, compactor, op, tuneTreebank, null);
    evaluator = new EvaluateTreebank(lp);
    double finalF1 = evaluator.testOnTreebank(selfTrainTest);
    System.out.println("------------------------");
    System.out.println("The results that matter:");
    System.out.println("------------------------");
    System.out.println("Baseline In Domain F1 : " + baseLineInDomain);
    System.out.println("Baseline Out Domain F1 : " + baseLineOutDomain);
    System.out.println("Self-Trained Out Domain F1 : " + finalF1);
  } else if (argIndex >= args.length) {
    // no more arguments, so we just parse our own test sentence
    PrintWriter pwOut = op.tlpParams.pw();
    PrintWriter pwErr = op.tlpParams.pw(System.err);
    ParserQuery pq = lp.parserQuery();
    if (pq.parse(op.tlpParams.defaultTestSentence())) {
      lp.getTreePrint().printTree(pq.getBestParse(), pwOut);
    } else {
      pwErr.println("Error. Can't parse test sentence: " + op.tlpParams.defaultTestSentence());
    }
  } else {
    // We parse filenames given by the remaining arguments
    ParseFiles.parseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter,
        escaper, tagDelimiter, op, lp.getTreePrint(), lp);
  }
}
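The yieldWords() call is easy to lose in the option handling above; the part that matters is the self-training loop. A distilled sketch of just that step, reusing the names from the example (lp is the trained parser; note parseTree may return null for unparsable input, which the full example does not guard against):

for (Tree goldTree : selfTrainTreebank) {
  // recover the raw sentence from the gold tree, then let the parser relabel it
  List<? extends HasWord> sentence = Sentence.toCoreLabelList(goldTree.yieldWords());
  finalTrainTreebank.add(lp.parseTree(sentence));
}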
From source file:KleinBilingualParser.java
public static void main(String[] args) {
  boolean trainF = false;
  boolean trainE = false;
  boolean bitrainE = false;
  boolean bitrainF = false;
  boolean saveToSerializedFile = false;
  boolean saveToTextFile = false;
  String serializedInputFileOrUrl = null;
  String textInputFileOrUrl = null;
  String serializedOutputFileOrUrl = null;
  String textOutputFileOrUrl = null;
  String treebankPathF = null;
  Treebank testTreebankF = null;
  Treebank tuneTreebankF = null;
  String testPathF = null;
  FileFilter testFilterF = null;
  String treebankPathE = null;
  Treebank testTreebankE = null;
  Treebank tuneTreebankE = null;
  String testPathE = null;
  FileFilter testFilterE = null;
  String tunePath = null;
  FileFilter tuneFilter = null;
  FileFilter trainFilterF = null;
  FileFilter trainFilterE = null;
  String secondaryTreebankPath = null;
  double secondaryTreebankWeight = 1.0;
  FileFilter secondaryTrainFilter = null;
  String trainAlignFile = null;
  String testAlignFile = null;
  String bitrainPathE = null;
  FileFilter bitrainFilterE = null;
  String bitrainPathF = null;
  FileFilter bitrainFilterF = null;
  Treebank bitrainTreebankF = null;
  Treebank bitrainTreebankE = null;
  // variables needed to process the files to be parsed
  TokenizerFactory<? extends HasWord> tokenizerFactory = null;
  String tokenizerOptions = null;
  String tokenizerFactoryClass = null;
  String tokenizerMethod = null;
  boolean tokenized = false; // whether or not the input file has already been tokenized
  Function<List<HasWord>, List<HasWord>> escaper = null;
  String tagDelimiter = null;
  String sentenceDelimiter = null;
  String elementDelimiter = null;
  int argIndex = 0;
  if (args.length < 1) {
    log.info("Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
    return;
  }
  Options fOp = new Options();
  Options eOp = new Options();
  List<String> optionArgs = new ArrayList<>();
  String encodingF = null;
  // while loop through option arguments
  // (bounds check first, so args[argIndex] is never read past the end;
  // the original tested equals("--") before the length check)
  while (argIndex < args.length && !args[argIndex].equals("--") && args[argIndex].charAt(0) == '-') {
    if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
      trainF = true;
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      treebankPathF = treebankDescription.first();
      trainFilterF = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) {
      bitrainF = true;
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      bitrainPathF = treebankDescription.first();
      bitrainFilterF = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
      try {
        fOp.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
      } catch (ClassNotFoundException e) {
        log.info("Class not found: " + args[argIndex + 1]);
        throw new RuntimeException(e);
      } catch (InstantiationException e) {
        log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
        throw new RuntimeException(e);
      } catch (IllegalAccessException e) {
        log.info("Illegal access: " + e);
        throw new RuntimeException(e);
      }
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
      // sets encoding for TreebankLangParserParams
      // redone later to override any serialized parser one read in
      encodingF = args[argIndex + 1];
      fOp.tlpParams.setInputEncoding(encodingF);
      fOp.tlpParams.setOutputEncoding(encodingF);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank")
        || args[argIndex].equalsIgnoreCase("-test")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      testPathF = treebankDescription.first();
      testFilterF = treebankDescription.second();
    } else {
      int oldIndex = argIndex;
      argIndex = fOp.setOptionOrWarn(args, argIndex);
      optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
    }
    System.out.println(argIndex + " " + args.length);
  } // end while loop through arguments for french
  argIndex++; // go to english arguments
  while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
    if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
      trainE = true;
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      treebankPathE = treebankDescription.first();
      trainFilterE = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) {
      bitrainE = true;
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      bitrainPathE = treebankDescription.first();
      bitrainFilterE = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank")
        || args[argIndex].equalsIgnoreCase("-test")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      testPathE = treebankDescription.first();
      testFilterE = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-trainAlignFile")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-trainAlignFile");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      trainAlignFile = treebankDescription.first();
    } else if (args[argIndex].equalsIgnoreCase("-testAlignFile")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testAlignFile");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      testAlignFile = treebankDescription.first();
    } else {
      int oldIndex = argIndex;
      argIndex = eOp.setOptionOrWarn(args, argIndex);
      optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
    }
  } // end while loop through arguments for english

  // if (!train && fOp.testOptions.verbose) {
  //   StringUtils.logInvocationString(log, args);
  // }
  LexicalizedParser lpF; // always initialized in next if-then-else block
  LexicalizedParser lpE;

  //TRAIN A PARSER
  // so we train a parser using the treebank
  GrammarCompactor compactorF = null;
  GrammarCompactor compactorE = null;
  if (fOp.trainOptions.compactGrammar() == 3) {
    compactorF = new ExactGrammarCompactor(fOp, false, false);
  }
  if (eOp.trainOptions.compactGrammar() == 3) {
    compactorE = new ExactGrammarCompactor(eOp, false, false);
  }
  Treebank trainTreebankF = makeTreebank(treebankPathF, fOp, trainFilterF);
  Treebank trainTreebankE = makeTreebank(treebankPathE, eOp, trainFilterE);
  fOp.testOptions.quietEvaluation = true;
  eOp.testOptions.quietEvaluation = true;
  lpF = getParserFromTreebank(trainTreebankF, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);
  lpE = getParserFromTreebank(trainTreebankE, null, secondaryTreebankWeight, compactorE, eOp, tuneTreebankE, null);

  // the following has to go after reading parser to make sure
  // op and tlpParams are the same for train and test
  // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING
  // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER
  if (encodingF != null) {
    fOp.tlpParams.setInputEncoding(encodingF);
    fOp.tlpParams.setOutputEncoding(encodingF);
  }

  if (bitrainFilterF != null || bitrainPathF != null) {
    if (bitrainPathF == null) { //?
      if (treebankPathF == null) {
        throw new RuntimeException("No bitrain treebank path specified...");
      } else {
        log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"');
        bitrainPathF = treebankPathF;
      }
    }
    bitrainTreebankF = fOp.tlpParams.testMemoryTreebank();
    bitrainTreebankF.loadPath(bitrainPathF, bitrainFilterF);
  }
  if (bitrainFilterE != null || bitrainPathE != null) {
    if (bitrainPathE == null) {
      if (treebankPathE == null) {
        throw new RuntimeException("No test treebank path specified...");
      } else {
        log.info("No test treebank path specified. Using train path: \"" + treebankPathE + '\"');
        bitrainPathE = treebankPathE;
      }
    }
    bitrainTreebankE = eOp.tlpParams.testMemoryTreebank();
    bitrainTreebankE.loadPath(bitrainPathE, bitrainFilterE);
  }
  if (encodingF != null) {
    fOp.tlpParams.setInputEncoding(encodingF);
    fOp.tlpParams.setOutputEncoding(encodingF);
  }
  if (testFilterF != null || testPathF != null) {
    if (testPathF == null) {
      if (treebankPathF == null) {
        throw new RuntimeException("No test treebank path specified...");
      } else {
        log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"');
        testPathF = treebankPathF;
      }
    }
    testTreebankF = fOp.tlpParams.testMemoryTreebank();
    testTreebankF.loadPath(testPathF, testFilterF);
  }
  fOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(fOp.tlpParams.sisterSplitters()));
  if (testFilterE != null || testPathE != null) {
    if (testPathE == null) {
      if (treebankPathE == null) {
        throw new RuntimeException("No test treebank path specified...");
      } else {
        log.info("No test treebank path specified. Using train path: \"" + treebankPathE + '\"');
        testPathE = treebankPathE;
      }
    }
    testTreebankE = eOp.tlpParams.testMemoryTreebank();
    testTreebankE.loadPath(testPathE, testFilterE);
  }
  eOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(eOp.tlpParams.sisterSplitters()));

  //PARALLEL ALIGNMENT FEATURE CALCULATION, CALCULATION OF 'A' MATRIX
  double[] weights = new double[8];
  double diff;
  weights[0] = 0.01;
  weights[1] = -0.002;
  weights[2] = 0.002;
  weights[3] = 0.002;
  weights[4] = 0.002;
  weights[5] = 0.002;
  weights[6] = -0.002;
  weights[7] = -0.002;
  ArrayList<HashMap<Integer, ArrayList<Integer>>> bitrainAlignments = null;
  ArrayList<HashMap<Integer, ArrayList<Integer>>> testAlignments = null;
  //String alignFile="../../berkeleyaligner/output/test.align";
  try {
    AlignmentProcessor trainAP = new AlignmentProcessor(trainAlignFile);
    bitrainAlignments = trainAP.createAlignments();
    AlignmentProcessor testAP = new AlignmentProcessor(testAlignFile);
    testAlignments = testAP.createAlignments();
  } catch (FileNotFoundException e) {
    throw new RuntimeException(e);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  int kE = 10;
  int kF = 10;
  int numFeatures = 8;
  int numBigSentences = 0;
  do {
    diff = 0.0;
    Iterator<Tree> eTrees = bitrainTreebankE.iterator();
    Iterator<Tree> fTrees = bitrainTreebankF.iterator();
    Iterator<HashMap<Integer, ArrayList<Integer>>> alignIterator = bitrainAlignments.iterator();
    numBigSentences = 0;
    //features are used in the order they are defined
    double A[][][][] = new double[bitrainTreebankE.size()][numFeatures][kE][kF];
    int ePsGold[] = new int[bitrainTreebankE.size()];
    int fPsGold[] = new int[bitrainTreebankF.size()];
    int i = 0;
    while (eTrees.hasNext() && fTrees.hasNext() && alignIterator.hasNext()) {
      HashMap<Integer, ArrayList<Integer>> alignMap = alignIterator.next();
      Tree fTree = fTrees.next();
      Tree eTree = eTrees.next();
      // skip overlong sentence pairs (the original tested fTree twice, surely a typo)
      if (fTree.getLeaves().size() > 70 || eTree.getLeaves().size() > 70) {
        //System.out.println("Too big : " + i);
        numBigSentences++;
        fPsGold[i] = 3;
        ePsGold[i] = 3;
        i++;
        continue;
      }
      List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords());
      List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords());
      LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery();
      LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery();
      lpqE.parse(sentenceE);
      lpqF.parse(sentenceF);
      List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF);
      List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE);
      fPsGold[i] = 3;
      ePsGold[i] = 3;
      int j = 0;
      int k = 0;
      for (ScoredObject<Tree> eScoredObj : kBestE) {
        k = 0;
        for (ScoredObject<Tree> fScoredObj : kBestF) {
          eScoredObj.object().setSpans();
          fScoredObj.object().setSpans();
          HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(), weights, alignMap);
          // likelihood scores are scaled down (divided by 1000) to keep the optimizer working
          A[i][0][j][k] = eScoredObj.score() / 1000;
          A[i][1][j][k] = fScoredObj.score() / 1000;
          for (Map.Entry entry : alignment.entrySet()) {
            Tree nodeF = (Tree) entry.getKey();
            Tree nodeE = (Tree) entry.getValue();
            A[i][2][j][k] += spanDiff(nodeF, nodeE);
            A[i][3][j][k] += numChildren(nodeF, nodeE);
            A[i][4][j][k] += insideBoth(nodeF, nodeE, alignMap);
            A[i][5][j][k] += insideSrcOutsideTgt(nodeF, nodeE, alignMap);
            A[i][6][j][k] += insideTgtOutsideSrc(nodeF, nodeE, alignMap);
            A[i][7][j][k] += bias(nodeF, nodeE);
          }
          k++;
        }
        j++;
      }
      //System.out.println("Sentence " + i);
      i++;
    }

    ///////////////////////
    //
    // MALLET optimizer
    //
    ///////////////////////
    System.out.println();
    System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*");
    System.out.println();
    System.out.println("Beginning convex optimization...");
    System.out.println();
    System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*");
    System.out.println();
    OptimizerExample optimizable = new OptimizerExample(weights, A, ePsGold, fPsGold);
    Optimizer optimizer = new LimitedMemoryBFGS(optimizable);
    boolean converged = false;
    try {
      converged = optimizer.optimize();
    } catch (IllegalArgumentException e) {
      // This exception may be thrown if L-BFGS
      // cannot step in the current direction.
      // This condition does not necessarily mean that
      // the optimizer has failed, but it doesn't want
      // to claim to have succeeded...
    } catch (cc.mallet.optimize.OptimizationException e) {
      System.out.println(e.getMessage());
    }
    for (int x = 0; x < weights.length; x++) {
      diff += (optimizable.getParameter(x) - weights[x]) * (optimizable.getParameter(x) - weights[x]);
      weights[x] = optimizable.getParameter(x);
      System.out.print(weights[x] + ", ");
    }
    System.out.println();
    diff /= weights.length;
    System.out.println("Current difference: " + diff);
  } while (diff > 0.0005);

  //TESTING BILINGUAL PARSER
  Treebank bilingTestTreebankF = testTreebankF;
  Treebank bilingTestTreebankE = testTreebankE;
  Iterator<Tree> eTreesBling = testTreebankE.iterator();
  Iterator<Tree> fTreesBling = testTreebankF.iterator();
  boolean runningAveragesF = Boolean.parseBoolean(fOp.testOptions.evals.getProperty("runningAverages"));
  boolean runningAveragesE = Boolean.parseBoolean(eOp.testOptions.evals.getProperty("runningAverages"));
  AbstractEval pcfgLBf = new Evalb("pcfg LP/LR", runningAveragesF);
  AbstractEval factLBf = new Evalb("factor LP/LR", runningAveragesF);
  AbstractEval pcfgLBe = new Evalb("pcfg LP/LR", runningAveragesE);
  AbstractEval factLBe = new Evalb("factor LP/LR", runningAveragesE);
  int i = 0;
  Iterator<HashMap<Integer, ArrayList<Integer>>> alignIteratorTEST = testAlignments.iterator();
  while (eTreesBling.hasNext() && fTreesBling.hasNext() && alignIteratorTEST.hasNext()) {
    HashMap<Integer, ArrayList<Integer>> alignMap = alignIteratorTEST.next();
    Tree fTree = fTreesBling.next();
    Tree eTree = eTreesBling.next();
    List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords());
    List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords());
    LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery();
    LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery();
    lpqE.parse(sentenceE);
    lpqF.parse(sentenceF);
    List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF);
    List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE);
    int j = 0;
    int k = 0;
    double maxScore = -Double.MAX_VALUE;
    Tree bestFtree = null;
    Tree bestEtree = null;
    for (ScoredObject<Tree> eScoredObj : kBestE) {
      k = 0;
      for (ScoredObject<Tree> fScoredObj : kBestF) {
        eScoredObj.object().setSpans();
        fScoredObj.object().setSpans();
        HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(), weights, alignMap);
        double currentScore = 0.0;
        for (Map.Entry entry : alignment.entrySet()) {
          Tree nodeF = (Tree) entry.getKey();
          Tree nodeE = (Tree) entry.getValue();
          currentScore += weights[0] * eScoredObj.score() / 1000;
          currentScore += weights[1] * fScoredObj.score() / 1000;
          currentScore += weights[2] * spanDiff(nodeF, nodeE);
          currentScore += weights[3] * numChildren(nodeF, nodeE);
          currentScore += weights[4] * insideBoth(nodeF, nodeE, alignMap);
          currentScore += weights[5] * insideSrcOutsideTgt(nodeF, nodeE, alignMap);
          currentScore += weights[6] * insideTgtOutsideSrc(nodeF, nodeE, alignMap);
          currentScore += weights[7] * bias(nodeF, nodeE);
        }
        if (currentScore > maxScore) {
          maxScore = currentScore;
          bestFtree = fScoredObj.object();
          bestEtree = eScoredObj.object();
        }
        k++;
      }
      j++;
    }
    i++;
    pcfgLBe.evaluate(bestEtree, eTree);
    factLBe.evaluate(bestEtree, eTree);
    pcfgLBf.evaluate(bestFtree, fTree);
    factLBf.evaluate(bestFtree, fTree);
  }
  System.out.println("------------------------");
  System.out.println(" English Results ");
  System.out.println("------------------------");
  System.out.println("PCFG labeled f1: " + pcfgLBe.getEvalbF1Percent());
  System.out.println("Factored labeled f1: " + factLBe.getEvalbF1Percent());
  System.out.println("------------------------");
  System.out.println(" French Results ");
  System.out.println("------------------------");
  System.out.println("PCFG labeled f1: " + pcfgLBf.getEvalbF1Percent());
  System.out.println("Factored labeled f1: " + factLBf.getEvalbF1Percent());
  System.out.println("------------------------");
  System.out.println("Number of sentences too big: " + numBigSentences);
}
From source file:EddyRoseDomainAdaptation.java
public static void main(String[] args) {
  boolean trainF = false;
  boolean trainE = false;
  boolean bitrainE = false;
  boolean bitrainF = false;
  boolean saveToSerializedFile = false;
  boolean saveToTextFile = false;
  String serializedInputFileOrUrl = null;
  String textInputFileOrUrl = null;
  String serializedOutputFileOrUrl = null;
  String textOutputFileOrUrl = null;
  String treebankPathF = null;
  Treebank testTreebankF = null;
  Treebank seqTestTreebank = null;
  Treebank tuneTreebankF = null;
  String testPathF = null;
  FileFilter testFilterF = null;
  String treebankPathE = null;
  Treebank testTreebankE = null;
  Treebank tuneTreebankE = null;
  String testPathE = null;
  FileFilter testFilterE = null;
  String seqTestPath = null;
  FileFilter seqTestFilter = null;
  String tunePath = null;
  FileFilter tuneFilter = null;
  FileFilter trainFilterF = null;
  FileFilter trainFilterE = null;
  String secondaryTreebankPath = null;
  double secondaryTreebankWeight = 1.0;
  FileFilter secondaryTrainFilter = null;
  String trainAlignFile = null;
  String testAlignFile = null;
  String bitrainPathE = null;
  FileFilter bitrainFilterE = null;
  String bitrainPathF = null;
  FileFilter bitrainFilterF = null;
  Treebank bitrainTreebankF = null;
  Treebank bitrainTreebankE = null;
  // variables needed to process the files to be parsed
  TokenizerFactory<? extends HasWord> tokenizerFactory = null;
  String tokenizerOptions = null;
  String tokenizerFactoryClass = null;
  String tokenizerMethod = null;
  boolean tokenized = false; // whether or not the input file has already been tokenized
  Function<List<HasWord>, List<HasWord>> escaper = null;
  String tagDelimiter = null;
  String sentenceDelimiter = null;
  String elementDelimiter = null;
  int argIndex = 0;
  if (args.length < 1) {
    log.info("Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
    return;
  }
  Options fOp = new Options();
  Options eOp = new Options();
  List<String> optionArgs = new ArrayList<>();
  String encodingF = null;
  // while loop through option arguments
  // (bounds check first, so args[argIndex] is never read past the end;
  // the original tested equals("--") before the length check)
  while (argIndex < args.length && !args[argIndex].equals("--") && args[argIndex].charAt(0) == '-') {
    if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
      trainF = true;
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      treebankPathF = treebankDescription.first();
      trainFilterF = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) {
      bitrainF = true;
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      bitrainPathF = treebankDescription.first();
      bitrainFilterF = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
      try {
        fOp.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
      } catch (ClassNotFoundException e) {
        log.info("Class not found: " + args[argIndex + 1]);
        throw new RuntimeException(e);
      } catch (InstantiationException e) {
        log.info("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
        throw new RuntimeException(e);
      } catch (IllegalAccessException e) {
        log.info("Illegal access: " + e);
        throw new RuntimeException(e);
      }
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
      // sets encoding for TreebankLangParserParams
      // redone later to override any serialized parser one read in
      encodingF = args[argIndex + 1];
      fOp.tlpParams.setInputEncoding(encodingF);
      fOp.tlpParams.setOutputEncoding(encodingF);
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank")
        || args[argIndex].equalsIgnoreCase("-test")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      testPathF = treebankDescription.first();
      testFilterF = treebankDescription.second();
    } else {
      int oldIndex = argIndex;
      argIndex = fOp.setOptionOrWarn(args, argIndex);
      optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
    }
    System.out.println(argIndex + " " + args.length);
  } // end while loop through arguments for french
  argIndex++; // go to english arguments
  while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
    if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {
      trainE = true;
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      treebankPathE = treebankDescription.first();
      trainFilterE = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-bitrain") || args[argIndex].equalsIgnoreCase("-bitrainTreebank")) {
      bitrainE = true;
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-bitrain");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      bitrainPathE = treebankDescription.first();
      bitrainFilterE = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-treebank") || args[argIndex].equalsIgnoreCase("-testTreebank")
        || args[argIndex].equalsIgnoreCase("-test")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      testPathE = treebankDescription.first();
      testFilterE = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-seqtest")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      seqTestPath = treebankDescription.first();
      seqTestFilter = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-trainAlignFile")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-trainAlignFile");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      trainAlignFile = treebankDescription.first();
    } else if (args[argIndex].equalsIgnoreCase("-testAlignFile")) {
      Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testAlignFile");
      argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
      testAlignFile = treebankDescription.first();
    } else {
      int oldIndex = argIndex;
      argIndex = eOp.setOptionOrWarn(args, argIndex);
      optionArgs.addAll(Arrays.asList(args).subList(oldIndex, argIndex));
    }
  } // end while loop through arguments for english

  // if (!train && fOp.testOptions.verbose) {
  //   StringUtils.logInvocationString(log, args);
  // }
  LexicalizedParser lpF; // always initialized in next if-then-else block
  LexicalizedParser lpE;

  //TRAIN A PARSER
  // so we train a parser using the treebank
  GrammarCompactor compactorF = null;
  if (fOp.trainOptions.compactGrammar() == 3) {
    compactorF = new ExactGrammarCompactor(fOp, false, false);
  }
  Treebank trainTreebankF = makeTreebank(treebankPathF, fOp, trainFilterF);
  fOp.testOptions.quietEvaluation = true;
  GrammarCompactor compactorE = null;
  if (eOp.trainOptions.compactGrammar() == 3) {
    compactorE = new ExactGrammarCompactor(eOp, false, false);
  }
  Treebank trainTreebankE = makeTreebank(treebankPathE, eOp, trainFilterE);
  eOp.testOptions.quietEvaluation = true;
  lpF = getParserFromTreebank(trainTreebankF, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);
  lpE = getParserFromTreebank(trainTreebankE, null, secondaryTreebankWeight, compactorE, eOp, tuneTreebankE, null);

  // the following has to go after reading parser to make sure
  // op and tlpParams are the same for train and test
  // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING
  // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER
  if (encodingF != null) {
    fOp.tlpParams.setInputEncoding(encodingF);
    fOp.tlpParams.setOutputEncoding(encodingF);
  }

  if (bitrainFilterF != null || bitrainPathF != null) {
    if (bitrainPathF == null) { //?
      if (treebankPathF == null) {
        throw new RuntimeException("No bitrain treebank path specified...");
      } else {
        log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"');
        bitrainPathF = treebankPathF;
      }
    }
    bitrainTreebankF = fOp.tlpParams.testMemoryTreebank();
    bitrainTreebankF.loadPath(bitrainPathF, bitrainFilterF);
  }
  if (bitrainFilterE != null || bitrainPathE != null) {
    if (bitrainPathE == null) {
      if (treebankPathE == null) {
        throw new RuntimeException("No test treebank path specified...");
      } else {
        log.info("No test treebank path specified. Using train path: \"" + treebankPathE + '\"');
        bitrainPathE = treebankPathE;
      }
    }
    bitrainTreebankE = eOp.tlpParams.testMemoryTreebank();
    bitrainTreebankE.loadPath(bitrainPathE, bitrainFilterE);
  }
  if (encodingF != null) {
    fOp.tlpParams.setInputEncoding(encodingF);
    fOp.tlpParams.setOutputEncoding(encodingF);
  }
  if (testFilterF != null || testPathF != null) {
    if (testPathF == null) {
      if (treebankPathF == null) {
        throw new RuntimeException("No test treebank path specified...");
      } else {
        log.info("No test treebank path specified. Using train path: \"" + treebankPathF + '\"');
        testPathF = treebankPathF;
      }
    }
    testTreebankF = fOp.tlpParams.testMemoryTreebank();
    testTreebankF.loadPath(testPathF, testFilterF);
  }
  // generate Sequoia treebank
  seqTestTreebank = fOp.tlpParams.testMemoryTreebank();
  seqTestTreebank.loadPath(seqTestPath, seqTestFilter);
  fOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(fOp.tlpParams.sisterSplitters()));
  if (testFilterE != null || testPathE != null) {
    if (testPathE == null) {
      if (treebankPathE == null) {
        throw new RuntimeException("No test treebank path specified...");
      } else {
        log.info("No test treebank path specified. Using train path: \"" + treebankPathE + '\"');
        testPathE = treebankPathE;
      }
    }
    testTreebankE = eOp.tlpParams.testMemoryTreebank();
    testTreebankE.loadPath(testPathE, testFilterE);
  }
  eOp.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(eOp.tlpParams.sisterSplitters()));

  //////////////////////
  //
  // Self-Training
  //
  //////////////////////
  MemoryTreebank selfTrainInitTreebank = new MemoryTreebank();
  MemoryTreebank selfTrainFinalTreebank = new MemoryTreebank();
  selfTrainInitTreebank.addAll(trainTreebankF);
  selfTrainFinalTreebank.addAll(trainTreebankF);
  LexicalizedParser fSelfTrainInit = getParserFromTreebank(selfTrainInitTreebank, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);
  int z = 0;
  boolean runningAveragesF = Boolean.parseBoolean(fOp.testOptions.evals.getProperty("runningAverages"));
  AbstractEval factLBf = new Evalb("factor LP/LR", runningAveragesF);
  for (Tree goldTree : testTreebankF) {
    // recover the sentence from the gold tree, re-parse it, and add the guess as training data
    List<? extends HasWord> sentence = Sentence.toCoreLabelList(goldTree.yieldWords());
    Tree guessTree = fSelfTrainInit.parseTree(sentence);
    selfTrainFinalTreebank.add(guessTree);
    factLBf.evaluate(guessTree, goldTree);
    System.out.println("Self-training : " + (++z));
  }
  LexicalizedParser fSelfTrainFinal = getParserFromTreebank(selfTrainFinalTreebank, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);
  EvaluateTreebank evaluatorH = new EvaluateTreebank(fSelfTrainFinal);
  double scoreF1 = evaluatorH.testOnTreebank(seqTestTreebank);
  System.out.println("------------------------");
  System.out.println("   Self Train Results   ");
  System.out.println("------------------------");
  System.out.println("Test set F1: " + scoreF1);
  System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent());
  //////////////////////
  //////////////////////

  //PARALLEL ALIGNMENT FEATURE CALCULATION, CALCULATION OF 'A' MATRIX
  double[] weights = new double[8];
  double diff;
  weights[0] = 0.01;
  weights[1] = -0.002;
  weights[2] = 0.002;
  weights[3] = 0.002;
  weights[4] = 0.002;
  weights[5] = 0.002;
  weights[6] = -0.002;
  weights[7] = -0.002;
  ArrayList<HashMap<Integer, ArrayList<Integer>>> bitrainAlignments = null;
  ArrayList<HashMap<Integer, ArrayList<Integer>>> testAlignments = null;
  //String alignFile="../../berkeleyaligner/output/test.align";
  try {
    AlignmentProcessor trainAP = new AlignmentProcessor(trainAlignFile);
    bitrainAlignments = trainAP.createAlignments();
    AlignmentProcessor testAP = new AlignmentProcessor(testAlignFile);
    testAlignments = testAP.createAlignments();
  } catch (FileNotFoundException e) {
    throw new RuntimeException(e);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  int kE = 10;
  int kF = 10;
  int numFeatures = 8;
  int numBigSentences = 0;
  do {
    diff = 0.0;
    Iterator<Tree> eTrees = bitrainTreebankE.iterator();
    Iterator<Tree> fTrees = bitrainTreebankF.iterator();
    Iterator<HashMap<Integer, ArrayList<Integer>>> alignIterator = bitrainAlignments.iterator();
    numBigSentences = 0;
    //features are used in the order they are defined
    double A[][][][] = new double[bitrainTreebankE.size()][numFeatures][kE][kF];
    int ePsGold[] = new int[bitrainTreebankE.size()];
    int fPsGold[] = new int[bitrainTreebankF.size()];
    int i = 0;
    while (eTrees.hasNext() && fTrees.hasNext() && alignIterator.hasNext()) {
      HashMap<Integer, ArrayList<Integer>> alignMap = alignIterator.next();
      Tree fTree = fTrees.next();
      Tree eTree = eTrees.next();
      // skip overlong sentence pairs (the original tested fTree twice, surely a typo)
      if (fTree.getLeaves().size() > 70 || eTree.getLeaves().size() > 70) {
        //System.out.println("Too big : " + i);
        numBigSentences++;
        fPsGold[i] = 3;
        ePsGold[i] = 3;
        i++;
        continue;
      }
      List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords());
      List<? extends HasWord> sentenceE = Sentence.toCoreLabelList(eTree.yieldWords());
      LexicalizedParserQuery lpqE = (LexicalizedParserQuery) lpE.parserQuery();
      LexicalizedParserQuery lpqF = (LexicalizedParserQuery) lpF.parserQuery();
      lpqE.parse(sentenceE);
      lpqF.parse(sentenceF);
      List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF);
      List<ScoredObject<Tree>> kBestE = lpqE.getKBestPCFGParses(kE);
      fPsGold[i] = 3;
      ePsGold[i] = 3;
      int j = 0;
      int k = 0;
      for (ScoredObject<Tree> eScoredObj : kBestE) {
        k = 0;
        for (ScoredObject<Tree> fScoredObj : kBestF) {
          eScoredObj.object().setSpans();
          fScoredObj.object().setSpans();
          HashMap<Tree, Tree> alignment = getHungarianAlignment(eScoredObj.object(), fScoredObj.object(), weights, alignMap);
          // likelihood scores are scaled down (divided by 1000) to keep the optimizer working
          A[i][0][j][k] = eScoredObj.score() / 1000;
          A[i][1][j][k] = fScoredObj.score() / 1000;
          for (Map.Entry entry : alignment.entrySet()) {
            Tree nodeF = (Tree) entry.getKey();
            Tree nodeE = (Tree) entry.getValue();
            A[i][2][j][k] += spanDiff(nodeF, nodeE);
            A[i][3][j][k] += numChildren(nodeF, nodeE);
            A[i][4][j][k] += insideBoth(nodeF, nodeE, alignMap);
            A[i][5][j][k] += insideSrcOutsideTgt(nodeF, nodeE, alignMap);
            A[i][6][j][k] += insideTgtOutsideSrc(nodeF, nodeE, alignMap);
            A[i][7][j][k] += bias(nodeF, nodeE);
          }
          k++;
        }
        j++;
      }
      //System.out.println("Sentence " + i);
      i++;
    }

    ///////////////////////
    //
    // MALLET optimizer
    //
    ///////////////////////
    System.out.println();
    System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*");
    System.out.println();
    System.out.println("Beginning convex optimization...");
    System.out.println();
    System.out.println("*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*");
    System.out.println();
    OptimizerExample optimizable = new OptimizerExample(weights, A, ePsGold, fPsGold);
    Optimizer optimizer = new LimitedMemoryBFGS(optimizable);
    boolean converged = false;
    try {
      converged = optimizer.optimize();
    } catch (IllegalArgumentException e) {
      // This exception may be thrown if L-BFGS
      // cannot step in the current direction.
      // This condition does not necessarily mean that
      // the optimizer has failed, but it doesn't want
      // to claim to have succeeded...
    } catch (cc.mallet.optimize.OptimizationException e) {
      System.out.println(e.getMessage());
    }
    for (int x = 0; x < weights.length; x++) {
      diff += (optimizable.getParameter(x) - weights[x]) * (optimizable.getParameter(x) - weights[x]);
      weights[x] = optimizable.getParameter(x);
      System.out.print(weights[x] + ", ");
    }
    System.out.println();
    diff /= weights.length;
    System.out.println("Current difference: " + diff);
  } while (diff > 0.0005);

  //GENERATE TRAINING DATA USING KLEIN RERANKER
  //assumes the 'test' data from KleinBilingualParser.java is the unannotated data
  //that the reranker has to annotate.
  factLBf = new Evalb("factor LP/LR", runningAveragesF);
  MemoryTreebank eddyRoseFullTrainTreebank = new MemoryTreebank();
  eddyRoseFullTrainTreebank.addAll(trainTreebankF);
  eddyRoseFullTrainTreebank.addAll(bitrainTreebankF);
  Treebank unannotTreebankF = testTreebankF;
  Treebank annotTreebankE = testTreebankE;
  // NOTE: these iterator names are kept from the original, even though each
  // draws from the other language's treebank
  Iterator<Tree> eTreesBling = unannotTreebankF.iterator();
  Iterator<Tree> fTreesBling = annotTreebankE.iterator();
  int i = 0;
  Iterator<HashMap<Integer, ArrayList<Integer>>> alignIteratorTEST = testAlignments.iterator();
  while (eTreesBling.hasNext() && fTreesBling.hasNext() && alignIteratorTEST.hasNext()) {
    HashMap<Integer, ArrayList<Integer>> alignMap = alignIteratorTEST.next();
    Tree fTree = fTreesBling.next();
    Tree eTree = eTreesBling.next();
    List<? extends HasWord> sentenceF = Sentence.toCoreLabelList(fTree.yieldWords());
    LexicalizedParserQuery lpqF = (LexicalizedParserQuery) fSelfTrainFinal.parserQuery();
    lpqF.parse(sentenceF);
    List<ScoredObject<Tree>> kBestF = lpqF.getKBestPCFGParses(kF);
    int j = 0;
    int k = 0;
    double maxScore = -Double.MAX_VALUE;
    Tree bestFtree = null;
    for (ScoredObject<Tree> fScoredObj : kBestF) {
      eTree.setSpans();
      fScoredObj.object().setSpans();
      HashMap<Tree, Tree> alignment = getHungarianAlignment(eTree, fScoredObj.object(), weights, alignMap);
      double currentScore = 0.0;
      for (Map.Entry entry : alignment.entrySet()) {
        Tree nodeF = (Tree) entry.getKey();
        Tree nodeE = (Tree) entry.getValue();
        currentScore += weights[0] * 0.0; // because gold standard tree is assumed to have probability 1
        currentScore += weights[1] * fScoredObj.score() / 1000;
        currentScore += weights[2] * spanDiff(nodeF, nodeE);
        currentScore += weights[3] * numChildren(nodeF, nodeE);
        currentScore += weights[4] * insideBoth(nodeF, nodeE, alignMap);
        currentScore += weights[5] * insideSrcOutsideTgt(nodeF, nodeE, alignMap);
        currentScore += weights[6] * insideTgtOutsideSrc(nodeF, nodeE, alignMap);
        currentScore += weights[7] * bias(nodeF, nodeE);
      }
      if (currentScore > maxScore) {
        maxScore = currentScore;
        bestFtree = fScoredObj.object();
      }
      k++;
    }
    i++;
    System.out.println("Reranker " + i);
    eddyRoseFullTrainTreebank.add(bestFtree);
    factLBf.evaluate(bestFtree, fTree);
  }
  LexicalizedParser lpEddyRose = getParserFromTreebank(eddyRoseFullTrainTreebank, null, secondaryTreebankWeight, compactorF, fOp, tuneTreebankF, null);
  EvaluateTreebank evaluator = new EvaluateTreebank(lpEddyRose);
  double eddyRoseF1 = evaluator.testOnTreebank(seqTestTreebank);
  System.out.println("------------------------");
  System.out.println("    EddyRose Results    ");
  System.out.println("------------------------");
  System.out.println("Test set F1: " + eddyRoseF1);
  System.out.println("F1 on projected training data: " + factLBf.getEvalbF1Percent());
}
From source file:edu.cornell.law.entitylinking.utils.Utility.java
public static List<String> getAllNounPhrases(String paragraph) {
  List<String> nounPhrases = new ArrayList<String>();
  try {
    StringTokenizer tokenizer = new StringTokenizer(paragraph, "\\.;?:,");
    while (tokenizer.hasMoreTokens()) {
      Annotation document = new Annotation(tokenizer.nextToken());
      pipeline.annotate(document);
      Tree tree = null;
      List<CoreMap> sentences = document.get(SentencesAnnotation.class);
      for (CoreMap sentence : sentences) {
        // this is the parse tree of the current sentence
        tree = sentence.get(TreeAnnotation.class);
        for (Tree subtree : tree) {
          if ((subtree.label().value().equals("NP")) || (subtree.label().value().equals("WHNP"))) {
            String phraseString = Sentence.listToString(subtree.yieldWords())
                .replace(" -LRB- ", "(").replace(" -RRB- ", ")");
            String temp = phraseString.trim();
            // strip a leading article, case-insensitively; the original used
            // startsWith("(?i)the") etc., which treats the regex flag as literal
            // text and so never matched
            if (temp.matches("(?i)the .*"))
              temp = temp.replaceFirst("(?i)the ", "");
            else if (temp.matches("(?i)an .*"))
              temp = temp.replaceFirst("(?i)an ", "");
            else if (temp.matches("(?i)a .*"))
              temp = temp.replaceFirst("(?i)a ", "");
            if (subtree.getChildrenAsList().contains(tree.label().value().equals("NN"))) {
              // NOTE: contains(...) receives a Boolean here, so this branch never fires
              //System.out.println("PHRASE");
            }
            if (temp.contains(" or ")) {
              String[] nptokens = temp.split(" or ");
              for (String s : nptokens) {
                nounPhrases.add(s);
              }
            } else {
              nounPhrases.add(temp);
            }
          }
        }
      }
    }
  } catch (OutOfMemoryError e) {
    System.out.println("Result too long to read into memory");
  }
  return nounPhrases;
}
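Stripped of the article-trimming and splitting logic, the core of the method above is a subtree walk that flattens each NP yield into a string. A minimal sketch, assuming tree already holds a parsed sentence:

List<String> nounPhrases = new ArrayList<>();
for (Tree subtree : tree) {
  String label = subtree.label().value();
  if (label.equals("NP") || label.equals("WHNP")) {
    // yieldWords() returns the subtree's leaves; listToString joins them with spaces
    nounPhrases.add(Sentence.listToString(subtree.yieldWords()));
  }
}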
From source file:org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule.java
License:Open Source License
private static List<Word> appendWordsFromTree(List<Word> words, Tree tree) {
  List<Word> res = new ArrayList<Word>();
  res.addAll(words);
  TregexPattern p = TregexPattern.compile(tree.value() + " <<, NNP|NNPS");
  TregexMatcher matcher = p.matcher(tree);
  boolean isFirst = true;
  for (Word word : tree.yieldWords()) {
    if ((isFirst) && (!matcher.findAt(tree))) {
      res.add(WordsUtils.lowercaseWord(word));
    } else {
      res.add(word);
    }
    isFirst = false;
  }
  return res;
}
From source file:org.lambda3.text.simplification.discourse.utils.ner.NERStringParser.java
License:Open Source License
public static TNERString parse(Tree parseTree) throws NERStringParseException {
  List<TNERToken> tokens = new ArrayList<>();
  List<Integer> parseTreeLeafNumbers = ParseTreeExtractionUtils.getLeafNumbers(parseTree, parseTree);
  String nerString = NER_CLASSIFIER.classifyToString(WordsUtils.wordsToString(parseTree.yieldWords()));
  String[] nerTokens = nerString.split(" ");
  if (parseTreeLeafNumbers.size() != nerTokens.length) {
    throw new NERStringParseException("Could not map NER string to parseTree");
  }
  int idx = 0;
  for (String nerToken : nerTokens) {
    int sep_idx = nerToken.lastIndexOf("/");
    // create token
    String text = nerToken.substring(0, sep_idx);
    String category = nerToken.substring(sep_idx + 1);
    TNERToken token = new TNERToken(idx, text, category, parseTree.getNodeNumber(parseTreeLeafNumbers.get(idx)));
    tokens.add(token);
    ++idx;
  }
  return new TNERString(tokens, parseTree);
}
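The step worth noting here is flattening the tree's yield into a plain string before handing it to the NER classifier. WordsUtils.wordsToString is specific to this project; with stock Stanford classes the equivalent move is roughly the following (nerClassifier is an assumed, already-loaded CRFClassifier):

// hedged sketch: tag the parse tree's yield with an NER model
String sentenceText = Sentence.listToString(parseTree.yieldWords());
String tagged = nerClassifier.classifyToString(sentenceText); // e.g. "John/PERSON went/O ..."

This is also why the method checks sizes: classifyToString returns one slash-delimited token per word, so the number of NER tokens should match the number of tree leaves.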
From source file:phrasesentimentextractor.PhraseSentimentExtractor.java
/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
  // Initialize all the models.
  // Tokenizer model for the sentence from OpenNLP, tokenizes the sentence
  // InputStream is = new FileInputStream("en-token.bin");
  // TokenizerModel model = new TokenizerModel(is);
  // Tokenizer tokenizer = new TokenizerME(model);
  //
  // POS model from OpenNLP, gives the POS tags
  // POSModel posmodel = new POSModelLoader().load(new File("en-pos-maxent.bin"));
  // POSTaggerME tagger = new POSTaggerME(posmodel);

  DependencyTreeGenerator dr = DependencyTreeGenerator.getInstance();
  TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");

  // chunker
  Path filepath = Paths.get("models/en-chunker.bin");
  InputStream is = new FileInputStream(filepath.toFile());
  ChunkerModel cModel = new ChunkerModel(is);
  ChunkerME chunkerME = new ChunkerME(cModel);

  // Output file
  File output_phrases = new File(args[2]);
  FileWriter fout = new FileWriter(output_phrases);
  PrintWriter out = new PrintWriter(fout);

  // Start processing the review file
  // Extract all the features
  Set<String> features = new HashSet<>();
  HashMap<String, List<String>> featuresPhrases = new HashMap<>();
  File feat_input = new File(args[0]);
  Scanner scanner = new Scanner(feat_input);
  int feat_counter = 0;
  String feat = "";
  while (scanner.hasNext()) {
    feat = scanner.nextLine().trim();
    features.add(feat);
    List<String> f_phrases = new ArrayList<>();
    featuresPhrases.put(feat, f_phrases);
    feat_counter++;
  }

  String sentence = "";
  File review_text = new File(args[1]);
  FileReader fileReader = new FileReader(review_text);
  DocumentPreprocessor dp = new DocumentPreprocessor(fileReader);
  dp.setTokenizerFactory(tokenizerFactory);
  int num_lines = 0;
  for (List<HasWord> line : dp) {
    boolean feature_exists = false;
    // this assignment was commented out in the original, but without it the
    // sentence stays empty and no feature can ever match
    sentence = Sentence.listToString(line);
    Set<String> check_features = new HashSet<>();
    for (String feature : features) {
      Pattern pattern = Pattern.compile("\\b" + feature.toLowerCase() + "\\b", Pattern.CASE_INSENSITIVE);
      Matcher matcher = pattern.matcher(sentence.toLowerCase());
      while (matcher.find()) {
        feature_exists = true;
        check_features.add(feature);
      }
    }
    if (!feature_exists) {
      //System.out.println("\n" + sentence);
      //System.out.println("No feature present!\n");
      continue;
    }
    // Features present
    //System.out.println("\nFeatures present\n");
    // for (String feature : check_features) {
    //   System.out.print(feature + " ");
    // }

    // get parse tree and construct dependency tree
    Tree tr = dr.parse(sentence);
    DependencyTree depTree = dr.getTypedDependencyTree(tr);

    // get tokenized words
    //System.out.println("\nTokenized Words\n");
    List<Word> word_list = tr.yieldWords();
    List<String> word_tokens = new ArrayList<>();
    for (Word word : word_list) {
      word_tokens.add(word.word());
      //System.out.print(word.word() + " ");
    }
    String[] words = new String[word_tokens.size()];
    words = word_tokens.toArray(words);

    //System.out.println("\nPOS Tags\n");
    List<TaggedWord> postags = tr.taggedYield();
    List<String> tag_tokens = new ArrayList<>();
    for (TaggedWord postag : postags) {
      tag_tokens.add(postag.tag());
      System.out.print(postag.tag() + " ");
    }
    String[] tags = new String[tag_tokens.size()];
    tags = tag_tokens.toArray(tags);

    //System.out.println("\nBIO Encoding\n");
    // BIO encoding for sentence
    String[] result = chunkerME.chunk(words, tags);
    for (String r : result) {
      System.out.print(r + " ");
    }

    //System.out.println("\nPhrases\n");
    // Outputs spans of BIO-NP
    HashMap<Integer, Integer> span_map = new HashMap<>();
    Span[] span = chunkerME.chunkAsSpans(words, tags);
    int j = 0;
    ArrayList<PhraseSet> pSets = new ArrayList<>();
    for (Span s : span) {
      ArrayList<String> phrase_words = new ArrayList<>();
      //System.out.print("\n" + s.toString() + " ");
      int n = 0;
      for (int i = s.getStart(); i < s.getEnd(); i++) {
        System.out.print(words[i] + " ");
        span_map.put(i, j);
        phrase_words.add(words[i]);
        n++;
      }
      PhraseSet pSet = new PhraseSet(j, s.toString(), phrase_words);
      pSets.add(pSet);
      j++;
    }

    // RootWord
    // Actual root is dummy
    DependencyTreeNode rootNode = depTree.getVertex(0).edges.get(0).target;
    Queue<DependencyTreeNode> queue = new LinkedList<>();
    rootNode.parent = null;
    queue.add(rootNode);
    while (!queue.isEmpty()) {
      DependencyTreeNode u = queue.remove();
      u.pos = tags[u.index - 1];
      if (span_map.get(u.index - 1) != null) {
        u.phrase_index = span_map.get(u.index - 1);
      } else {
        u.phrase_index = -1;
      }
      //System.out.println("\n" + u.word + "-" + u.phrase_index + "-" + tags[u.index - 1]);
      for (DependencyTreeEdge e : u.edges) {
        e.target.parent = u;
        queue.add(e.target);
        //System.out.print(e.target.word + " ");
      }
    }

    HashMap<String, List<String>> featurePhrases = SentimentExtract.getSentimentPhrases(check_features, pSets, depTree);
    for (String chk_feat : check_features) {
      featuresPhrases.get(chk_feat).addAll(featurePhrases.get(chk_feat));
    }
    num_lines++;
  }
  System.out.println(num_lines);
  for (String f : features) {
    out.print(f + " ");
    out.print(String.join(" ", featuresPhrases.get(f)));
    out.println();
  }
  System.out.println("Success");
  out.close();
}