List of usage examples for edu.stanford.nlp.io IOUtils readerFromString
public static BufferedReader readerFromString(String textFileOrUrl) throws IOException
From source file:ilcc.ccgparser.nnparser.IncNNParser.java
/**
 * Reads a whitespace-separated embedding file into a dense matrix.
 *
 * <p>Every line is split on runs of whitespace: the first token is the entry's key and the
 * following tokens are parsed as its vector components. The number of components is taken
 * from the first line and must equal {@code config.embeddingSize}. The row index of each key
 * is recorded in {@code embedID}.
 *
 * @param embedFile location passed to {@code IOUtils.readerFromString}; may be {@code null}
 * @param embedID   output map that receives key -> row index in the returned matrix
 * @return the embedding matrix (one row per line), or {@code null} when {@code embedFile}
 *         is {@code null}
 * @throws IllegalArgumentException if the file is empty, if its dimension differs from
 *         {@code config.embeddingSize}, or if a line carries fewer values than the first line
 * @throws RuntimeIOException if reading the file fails
 */
private double[][] readEmbedFile(String embedFile, Map<String, Integer> embedID) {
    if (embedFile == null) {
        return null;
    }

    double[][] embeddings;
    BufferedReader input = null;
    try {
        input = IOUtils.readerFromString(embedFile);

        List<String> lines = new ArrayList<String>();
        for (String s; (s = input.readLine()) != null;) {
            lines.add(s);
        }
        // Without this guard an empty file fails on lines.get(0) with an unhelpful
        // IndexOutOfBoundsException.
        if (lines.isEmpty()) {
            throw new IllegalArgumentException("Embedding file " + embedFile + " is empty");
        }

        int nWords = lines.size();
        String[] splits = lines.get(0).split("\\s+");
        // The first column is the key, everything after it is the vector.
        int dim = splits.length - 1;
        embeddings = new double[nWords][dim];
        System.err.println("Embedding File " + embedFile + ": #Words = " + nWords + ", dim = " + dim);

        if (dim != config.embeddingSize) {
            throw new IllegalArgumentException(
                    "The dimension of embedding file does not match config.embeddingSize");
        }

        for (int i = 0; i < nWords; ++i) {
            splits = lines.get(i).split("\\s+");
            // Report short rows with file/line context instead of a bare
            // ArrayIndexOutOfBoundsException.
            if (splits.length < dim + 1) {
                throw new IllegalArgumentException("Embedding file " + embedFile + ": line " + (i + 1)
                        + " has " + (splits.length - 1) + " values, expected " + dim);
            }
            embedID.put(splits[0], i);
            for (int j = 0; j < dim; ++j) {
                embeddings[i][j] = Double.parseDouble(splits[j + 1]);
            }
        }
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    } finally {
        IOUtils.closeIgnoringExceptions(input);
    }
    return embeddings;
}
From source file:ilcc.ccgparser.nnparser.IncNNParser.java
private void setupClassifierForTraining(List<CCGJSentence> trainSents, List<CCGJTreeNode> trainTrees, String embedFile, String preModel) throws IOException { double[][] E = new double[knownWords.size() + knownPos.size() + knownCCGCats.size()][config.embeddingSize]; double[][] W1 = new double[config.hiddenSize][config.embeddingSize * config.numTokens]; double[] b1 = new double[config.hiddenSize]; double[][] W2 = new double[actsList.size()][config.hiddenSize]; // Randomly initialize weight matrices / vectors Random random = Util.getRandom(); for (int i = 0; i < W1.length; ++i) for (int j = 0; j < W1[i].length; ++j) W1[i][j] = random.nextDouble() * 2 * config.initRange - config.initRange; for (int i = 0; i < b1.length; ++i) b1[i] = random.nextDouble() * 2 * config.initRange - config.initRange; for (int i = 0; i < W2.length; ++i) for (int j = 0; j < W2[i].length; ++j) W2[i][j] = random.nextDouble() * 2 * config.initRange - config.initRange; // Read embeddings into `embedID`, `embeddings` Map<String, Integer> embedID = new HashMap<String, Integer>(); double[][] embeddings = readEmbedFile(embedFile, embedID); // Try to match loaded embeddings with words in dictionary int foundEmbed = 0; for (int i = 0; i < E.length; ++i) { int index = -1; if (i < knownWords.size()) { String str = knownWords.get(i); //NOTE: exact match first, and then try lower case.. if (embedID.containsKey(str)) index = embedID.get(str); else if (embedID.containsKey(str.toLowerCase())) index = embedID.get(str.toLowerCase()); }//w w w . j ava2s .c o m if (index >= 0) { ++foundEmbed; for (int j = 0; j < E[i].length; ++j) E[i][j] = embeddings[index][j]; } else { for (int j = 0; j < E[i].length; ++j) E[i][j] = random.nextDouble() * 0.02 - 0.01; } } System.err.println("Found embeddings: " + foundEmbed + " / " + knownWords.size()); if (preModel != null) { try { System.err.println("Loading pre-trained model file: " + preModel + " ... 
"); String s; BufferedReader input = IOUtils.readerFromString(preModel); s = input.readLine(); int nDict = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nPOS = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nLabel = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int eSize = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int hSize = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nTokens = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); String[] splits; for (int k = 0; k < nDict; ++k) { s = input.readLine(); splits = s.split(" "); if (wordIDs.containsKey(splits[0]) && eSize == config.embeddingSize) { int index = getWordID(splits[0]); for (int i = 0; i < eSize; ++i) E[index][i] = Double.parseDouble(splits[i + 1]); } } for (int k = 0; k < nPOS; ++k) { s = input.readLine(); splits = s.split(" "); if (posIDs.containsKey(splits[0]) && eSize == config.embeddingSize) { int index = getPosID(splits[0]); for (int i = 0; i < eSize; ++i) E[index][i] = Double.parseDouble(splits[i + 1]); } } for (int k = 0; k < nLabel; ++k) { s = input.readLine(); splits = s.split(" "); if (ccgcatIDs.containsKey(splits[0]) && eSize == config.embeddingSize) { int index = getCCGCatID(splits[0]); for (int i = 0; i < eSize; ++i) E[index][i] = Double.parseDouble(splits[i + 1]); } } boolean copyLayer1 = hSize == config.hiddenSize && config.embeddingSize == eSize && config.numTokens == nTokens; if (copyLayer1) { System.err.println("Copying parameters W1 && b1..."); } for (int j = 0; j < eSize * nTokens; ++j) { s = input.readLine(); if (copyLayer1) { splits = s.split(" "); for (int i = 0; i < hSize; ++i) W1[i][j] = Double.parseDouble(splits[i]); } } s = input.readLine(); if (copyLayer1) { splits = s.split(" "); for (int i = 0; i < hSize; ++i) b1[i] = Double.parseDouble(splits[i]); } boolean copyLayer2 = (nLabel * 2 - 1 == actsList.size()) 
&& hSize == config.hiddenSize; if (copyLayer2) System.err.println("Copying parameters W2..."); for (int j = 0; j < hSize; ++j) { s = input.readLine(); if (copyLayer2) { splits = s.split(" "); for (int i = 0; i < nLabel * 2 - 1; ++i) W2[i][j] = Double.parseDouble(splits[i]); } } input.close(); } catch (IOException e) { throw new RuntimeIOException(e); } } Dataset trainSet = genTrainExamples(trainSents, trainTrees); classifier = new Classifier(config, trainSet, E, W1, b1, W2, preComputed); }
From source file:ilcc.ccgparser.nnparser.IncNNParser.java
private void loadModelFile(String modelFile, boolean verbose) throws IOException { Timing t = new Timing(); try {/* w w w . j a va 2 s. c o m*/ System.err.println("Loading ccg parser model file: " + modelFile + " ... "); String s; BufferedReader input = IOUtils.readerFromString(modelFile); s = input.readLine(); int nDict = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nPOS = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nccgCat = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int eSize = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int hSize = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nTokens = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nPreComputed = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int classes = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nuRules = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nbRules = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nrRules = Integer.parseInt(s.substring(s.indexOf('=') + 1)); actsMap = new HashMap<>(); knownWords = new ArrayList<>(); knownPos = new ArrayList<>(); knownCCGCats = new ArrayList<>(); srparser = new NonInc(); double[][] E = new double[nDict + nPOS + nccgCat][eSize]; String[] splits; int index = 0; for (int k = 0; k < classes; k++) { s = input.readLine().trim(); splits = s.split("--"); actsMap.put(ArcJAction.make(SRAction.valueOf(splits[0]), Integer.parseInt(splits[1]), splits[2], RuleType.valueOf(splits[3])), k); } for (int k = 0; k < nuRules; k++) { s = input.readLine().trim(); splits = s.split(" "); String key = splits[0]; for (int i = 1; i < splits.length; i++) { String[] parts = splits[i].split("--"); CCGJRuleInfo info = new CCGJRuleInfo(CCGcat.ccgCatFromString(parts[0]), null, 
CCGcat.ccgCatFromString(parts[2]), parts[3].equals("true"), RuleType.valueOf(parts[4]), Integer.parseInt(parts[5]), 0); srparser.treebankRules.addUnaryRuleInfo(info, key); } } for (int k = 0; k < nbRules; k++) { s = input.readLine().trim(); splits = s.split(" "); String key = splits[0]; for (int i = 1; i < splits.length; i++) { String[] parts = splits[i].split("--"); CCGJRuleInfo info = new CCGJRuleInfo(CCGcat.ccgCatFromString(parts[0]), CCGcat.ccgCatFromString(parts[1]), CCGcat.ccgCatFromString(parts[2]), parts[3].equals("true"), RuleType.valueOf(parts[4]), Integer.parseInt(parts[5]), 0); srparser.treebankRules.addBinaryRuleInfo(info, key); } } for (int k = 0; k < nrRules; k++) { s = input.readLine().trim(); splits = s.split(" "); String key = splits[0]; for (int i = 1; i < splits.length; i++) { String[] parts = splits[i].split("--"); CCGJRuleInfo info = new CCGJRuleInfo(CCGcat.ccgCatFromString(parts[0]), CCGcat.ccgCatFromString(parts[1]), CCGcat.ccgCatFromString(parts[2]), parts[3].equals("true"), RuleType.valueOf(parts[4]), Integer.parseInt(parts[5]), 0); srparser.treebankRules.addRevealRuleInfo(info, key); } } for (int k = 0; k < nDict; ++k) { s = input.readLine(); splits = s.split(" "); knownWords.add(splits[0]); for (int i = 0; i < eSize; ++i) E[index][i] = Double.parseDouble(splits[i + 1]); index = index + 1; } for (int k = 0; k < nPOS; ++k) { s = input.readLine(); splits = s.split(" "); knownPos.add(splits[0]); for (int i = 0; i < eSize; ++i) E[index][i] = Double.parseDouble(splits[i + 1]); index = index + 1; } for (int k = 0; k < nccgCat; ++k) { s = input.readLine(); splits = s.split(" "); knownCCGCats.add(splits[0]); for (int i = 0; i < eSize; ++i) E[index][i] = Double.parseDouble(splits[i + 1]); index = index + 1; } generateIDs(); double[][] W1 = new double[hSize][eSize * nTokens]; for (int j = 0; j < W1[0].length; ++j) { s = input.readLine(); splits = s.split(" "); for (int i = 0; i < W1.length; ++i) W1[i][j] = Double.parseDouble(splits[i]); } double[] 
b1 = new double[hSize]; s = input.readLine(); splits = s.split(" "); for (int i = 0; i < b1.length; ++i) b1[i] = Double.parseDouble(splits[i]); double[][] W2 = new double[classes][hSize]; for (int j = 0; j < W2[0].length; ++j) { s = input.readLine(); splits = s.split(" "); for (int i = 0; i < W2.length; ++i) W2[i][j] = Double.parseDouble(splits[i]); } preComputed = new ArrayList<Integer>(); while (preComputed.size() < nPreComputed) { s = input.readLine(); splits = s.split(" "); for (String split : splits) { preComputed.add(Integer.parseInt(split)); } } input.close(); classifier = new Classifier(config, E, W1, b1, W2, preComputed); } catch (IOException e) { throw new RuntimeIOException(e); } // initialize the loaded parser // Pre-compute matrix multiplications if (config.numPreComputed > 0) { classifier.preCompute(); } t.done("Initializing ccg parser"); }
From source file:ilcc.ccgparser.test.IncExtractProb.java
private void loadModelFile(String modelFile, boolean verbose) throws IOException { Timing t = new Timing(); try {// ww w .ja v a2 s . c om System.err.println("Loading ccg parser model file: " + modelFile + " ... "); String s; BufferedReader input = IOUtils.readerFromString(modelFile); s = input.readLine(); int nDict = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nPOS = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nccgCat = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int eSize = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int hSize = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nTokens = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nPreComputed = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int classes = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nuRules = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nbRules = Integer.parseInt(s.substring(s.indexOf('=') + 1)); s = input.readLine(); int nrRules = Integer.parseInt(s.substring(s.indexOf('=') + 1)); actsMap = new HashMap<>(); knownWords = new ArrayList<>(); knownPos = new ArrayList<>(); knownCCGCats = new ArrayList<>(); double[][] E = new double[nDict + nPOS + nccgCat][eSize]; String[] splits; int index = 0; for (int k = 0; k < classes; k++) { s = input.readLine().trim(); splits = s.split("--"); actsMap.put(ArcJAction.make(SRAction.valueOf(splits[0]), Integer.parseInt(splits[1]), splits[2], RuleType.valueOf(splits[3])), k); } for (int k = 0; k < nuRules; k++) { s = input.readLine().trim(); splits = s.split(" "); String key = splits[0]; for (int i = 1; i < splits.length; i++) { String[] parts = splits[i].split("--"); CCGJRuleInfo info = new CCGJRuleInfo(CCGcat.ccgCatFromString(parts[0]), null, CCGcat.ccgCatFromString(parts[2]), 
parts[3].equals("true"), RuleType.valueOf(parts[4]), Integer.parseInt(parts[5]), 0); srparser.treebankRules.addUnaryRuleInfo(info, key); } } for (int k = 0; k < nbRules; k++) { s = input.readLine().trim(); splits = s.split(" "); String key = splits[0]; for (int i = 1; i < splits.length; i++) { String[] parts = splits[i].split("--"); CCGJRuleInfo info = new CCGJRuleInfo(CCGcat.ccgCatFromString(parts[0]), CCGcat.ccgCatFromString(parts[1]), CCGcat.ccgCatFromString(parts[2]), parts[3].equals("true"), RuleType.valueOf(parts[4]), Integer.parseInt(parts[5]), 0); srparser.treebankRules.addBinaryRuleInfo(info, key); } } for (int k = 0; k < nrRules; k++) { s = input.readLine().trim(); splits = s.split(" "); String key = splits[0]; for (int i = 1; i < splits.length; i++) { String[] parts = splits[i].split("--"); CCGJRuleInfo info = new CCGJRuleInfo(CCGcat.ccgCatFromString(parts[0]), CCGcat.ccgCatFromString(parts[1]), CCGcat.ccgCatFromString(parts[2]), parts[3].equals("true"), RuleType.valueOf(parts[4]), Integer.parseInt(parts[5]), 0); srparser.treebankRules.addRevealRuleInfo(info, key); } } for (int k = 0; k < nDict; ++k) { s = input.readLine(); splits = s.split(" "); knownWords.add(splits[0]); for (int i = 0; i < eSize; ++i) E[index][i] = Double.parseDouble(splits[i + 1]); index = index + 1; } for (int k = 0; k < nPOS; ++k) { s = input.readLine(); splits = s.split(" "); knownPos.add(splits[0]); for (int i = 0; i < eSize; ++i) E[index][i] = Double.parseDouble(splits[i + 1]); index = index + 1; } for (int k = 0; k < nccgCat; ++k) { s = input.readLine(); splits = s.split(" "); knownCCGCats.add(splits[0]); for (int i = 0; i < eSize; ++i) E[index][i] = Double.parseDouble(splits[i + 1]); index = index + 1; } generateIDs(); double[][] W1 = new double[hSize][eSize * nTokens]; for (int j = 0; j < W1[0].length; ++j) { s = input.readLine(); splits = s.split(" "); for (int i = 0; i < W1.length; ++i) W1[i][j] = Double.parseDouble(splits[i]); } double[] b1 = new double[hSize]; s = 
input.readLine(); splits = s.split(" "); for (int i = 0; i < b1.length; ++i) b1[i] = Double.parseDouble(splits[i]); double[][] W2 = new double[classes][hSize]; for (int j = 0; j < W2[0].length; ++j) { s = input.readLine(); splits = s.split(" "); for (int i = 0; i < W2.length; ++i) W2[i][j] = Double.parseDouble(splits[i]); } preComputed = new ArrayList<Integer>(); while (preComputed.size() < nPreComputed) { s = input.readLine(); splits = s.split(" "); for (String split : splits) { preComputed.add(Integer.parseInt(split)); } } input.close(); classifier = new Classifier(config, E, W1, b1, W2, preComputed); } catch (IOException e) { throw new RuntimeIOException(e); } // initialize the loaded parser // Pre-compute matrix multiplications if (config.numPreComputed > 0) { classifier.preCompute(); } t.done("Initializing ccg parser"); }