List of usage examples for edu.stanford.nlp.process.WordTokenFactory
From source file: DependencyParse.java
License: Apache License
public static void main(String[] args) throws Exception { Properties props = StringUtils.argsToProperties(args); if (!props.containsKey("tokpath") || !props.containsKey("parentpath") || !props.containsKey("relpath")) { System.err.println(/*from w w w . j a va 2 s. com*/ "usage: java DependencyParse -tokenize - -tokpath <tokpath> -parentpath <parentpath> -relpath <relpath>"); System.exit(1); } boolean tokenize = false; if (props.containsKey("tokenize")) { tokenize = true; } String tokPath = props.getProperty("tokpath"); String parentPath = props.getProperty("parentpath"); String relPath = props.getProperty("relpath"); BufferedWriter tokWriter = new BufferedWriter(new FileWriter(tokPath)); BufferedWriter parentWriter = new BufferedWriter(new FileWriter(parentPath)); BufferedWriter relWriter = new BufferedWriter(new FileWriter(relPath)); MaxentTagger tagger = new MaxentTagger(TAGGER_MODEL); DependencyParser parser = DependencyParser.loadFromModelFile(PARSER_MODEL); Scanner stdin = new Scanner(System.in); int count = 0; long start = System.currentTimeMillis(); while (stdin.hasNextLine()) { String line = stdin.nextLine(); List<HasWord> tokens = new ArrayList<>(); if (tokenize) { PTBTokenizer<Word> tokenizer = new PTBTokenizer(new StringReader(line), new WordTokenFactory(), ""); for (Word label; tokenizer.hasNext();) { tokens.add(tokenizer.next()); } } else { for (String word : line.split(" ")) { tokens.add(new Word(word)); } } List<TaggedWord> tagged = tagger.tagSentence(tokens); int len = tagged.size(); Collection<TypedDependency> tdl = parser.predict(tagged).typedDependencies(); int[] parents = new int[len]; for (int i = 0; i < len; i++) { // if a node has a parent of -1 at the end of parsing, then the node // has no parent. 
parents[i] = -1; } String[] relns = new String[len]; for (TypedDependency td : tdl) { // let root have index 0 int child = td.dep().index(); int parent = td.gov().index(); relns[child - 1] = td.reln().toString(); parents[child - 1] = parent; } // print tokens StringBuilder sb = new StringBuilder(); for (int i = 0; i < len - 1; i++) { if (tokenize) { sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word())); } else { sb.append(tokens.get(i).word()); } sb.append(' '); } if (tokenize) { sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word())); } else { sb.append(tokens.get(len - 1).word()); } sb.append('\n'); tokWriter.write(sb.toString()); // print parent pointers sb = new StringBuilder(); for (int i = 0; i < len - 1; i++) { sb.append(parents[i]); sb.append(' '); } sb.append(parents[len - 1]); sb.append('\n'); parentWriter.write(sb.toString()); // print relations sb = new StringBuilder(); for (int i = 0; i < len - 1; i++) { sb.append(relns[i]); sb.append(' '); } sb.append(relns[len - 1]); sb.append('\n'); relWriter.write(sb.toString()); count++; if (count % 1000 == 0) { double elapsed = (System.currentTimeMillis() - start) / 1000.0; System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed); } } long totalTimeMillis = System.currentTimeMillis() - start; System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n", count, totalTimeMillis / 1000.0, totalTimeMillis / (double) count); tokWriter.close(); parentWriter.close(); relWriter.close(); }
From source file: ConstituencyParse.java
License: Apache License
public List<HasWord> sentenceToTokens(String line) { List<HasWord> tokens = new ArrayList<>(); if (tokenize) { PTBTokenizer<Word> tokenizer = new PTBTokenizer(new StringReader(line), new WordTokenFactory(), ""); for (Word label; tokenizer.hasNext();) { tokens.add(tokenizer.next()); }// w ww. ja va2 s .c o m } else { for (String word : line.split(" ")) { tokens.add(new Word(word)); } } return tokens; }
From source file: qmul.util.treekernel.LexicalSimilarityCalc.java
License: Open Source License
private static float lexSimilarity(String s1, String s2) { String[] split1 = s1.split("[.?!]"); String[] split2 = s2.split("[.?!]"); Set<String> stemsInFirst = new HashSet<String>(); Set<String> stemsInSecond = new HashSet<String>(); for (int i = 0; i < split1.length; i++) { PTBTokenizer<Word> tokenizer1 = new PTBTokenizer<Word>(new StringReader(split1[i]), new WordTokenFactory(), "tokenizeNLs=false"); while (tokenizer1.hasNext()) { Word w = tokenizer1.next();/*from w w w . j a va 2 s.c om*/ String stem = m.stem(w).word(); stemsInFirst.add(stem); } } for (int j = 0; j < split2.length; j++) { PTBTokenizer<Word> tokenizer2 = new PTBTokenizer<Word>(new StringReader(split2[j]), new WordTokenFactory(), "tokenizeNLs=false"); while (tokenizer2.hasNext()) { Word w = tokenizer2.next(); String stem = m.stem(w).word(); stemsInSecond.add(stem); } } Iterator<String> i = stemsInSecond.iterator(); float commonStems = 0; while (i.hasNext()) { String curStem = i.next(); // System.out.println(curStem); if (stemsInFirst.contains(curStem)) commonStems++; } int secondSize = stemsInSecond.size(); if (secondSize > 0) return commonStems / (float) (secondSize); else return 0; }
From source file: similarity_measures.StringManipulation.java
License: Open Source License
public static ArrayList<String> getTokensListAndReplaceNumbers(String s1) { ArrayList<String> tokens = new ArrayList<String>(); //String[] s1Tokens = s1.split(" "); String options = "tokenizeNLs=false,americanize=false,ptb3Escaping=false,normalizeCurrency=false,normalizeFractions=false,normalizeParentheses=false,normalizeOtherBrackets=false,asciiQuotes=false,latexQuotes=false,unicodeQuotes=false,ptb3Ellipsis=false,unicodeEllipsis=false,ptb3Dashes=false,escapeForwardSlashAsterisk=false,untokenizable=noneDelete"; PTBTokenizer tokenizer = new PTBTokenizer(new StringReader(s1), new WordTokenFactory(), options); boolean previousTokenIsNumber = false; for (Word label; tokenizer.hasNext();) { label = (Word) tokenizer.next(); //for (String s1Token : s1Tokens) { if (!label.value().trim().isEmpty()) { if (label.value().replaceAll("\\p{Punct}", "").replaceAll("\\p{Space}", "").trim() .matches("(?=[^A-Za-z]+$).*[0-9].*")) { if (!previousTokenIsNumber) { tokens.add("$num$"); }/*from ww w . j a v a2 s .c o m*/ previousTokenIsNumber = true; } else { previousTokenIsNumber = false; tokens.add(label.value().trim()); } } } return tokens; }