Example usage for edu.stanford.nlp.process WordTokenFactory WordTokenFactory

Introduction

On this page you can find example usage for the edu.stanford.nlp.process WordTokenFactory constructor, WordTokenFactory().
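
As a quick orientation, here is a minimal, self-contained sketch of the pattern all of the examples below share: a WordTokenFactory is passed to a PTBTokenizer so that each token comes back as a plain Word. The class name and sample sentence are invented for illustration.

import java.io.StringReader;

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WordTokenFactory;

public class WordTokenFactoryDemo {
    public static void main(String[] args) {
        // WordTokenFactory makes the tokenizer emit plain Word objects.
        PTBTokenizer<Word> tokenizer = new PTBTokenizer<>(
                new StringReader("Stanford NLP tokenizes text."), new WordTokenFactory(), "");
        while (tokenizer.hasNext()) {
            System.out.println(tokenizer.next().word());
        }
    }
}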

Prototype

WordTokenFactory()

Usage

From source file: DependencyParse.java

License: Apache License

public static void main(String[] args) throws Exception {
    Properties props = StringUtils.argsToProperties(args);
    if (!props.containsKey("tokpath") || !props.containsKey("parentpath") || !props.containsKey("relpath")) {
        System.err.println(
                "usage: java DependencyParse [-tokenize] -tokpath <tokpath> -parentpath <parentpath> -relpath <relpath>");
        System.exit(1);
    }

    boolean tokenize = props.containsKey("tokenize");

    String tokPath = props.getProperty("tokpath");
    String parentPath = props.getProperty("parentpath");
    String relPath = props.getProperty("relpath");

    BufferedWriter tokWriter = new BufferedWriter(new FileWriter(tokPath));
    BufferedWriter parentWriter = new BufferedWriter(new FileWriter(parentPath));
    BufferedWriter relWriter = new BufferedWriter(new FileWriter(relPath));

    // TAGGER_MODEL and PARSER_MODEL are class constants holding the model file paths.
    MaxentTagger tagger = new MaxentTagger(TAGGER_MODEL);
    DependencyParser parser = DependencyParser.loadFromModelFile(PARSER_MODEL);
    Scanner stdin = new Scanner(System.in);
    int count = 0;
    long start = System.currentTimeMillis();
    while (stdin.hasNextLine()) {
        String line = stdin.nextLine();
        List<HasWord> tokens = new ArrayList<>();
        if (tokenize) {
            PTBTokenizer<Word> tokenizer = new PTBTokenizer<>(new StringReader(line), new WordTokenFactory(), "");
            while (tokenizer.hasNext()) {
                tokens.add(tokenizer.next());
            }
        } else {
            for (String word : line.split(" ")) {
                tokens.add(new Word(word));
            }
        }

        List<TaggedWord> tagged = tagger.tagSentence(tokens);

        int len = tagged.size();
        Collection<TypedDependency> tdl = parser.predict(tagged).typedDependencies();
        int[] parents = new int[len];
        for (int i = 0; i < len; i++) {
            // if a node has a parent of -1 at the end of parsing, then the node
            // has no parent.
            parents[i] = -1;
        }

        String[] relns = new String[len];
        for (TypedDependency td : tdl) {
            // let root have index 0
            int child = td.dep().index();
            int parent = td.gov().index();
            relns[child - 1] = td.reln().toString();
            parents[child - 1] = parent;
        }

        // print tokens
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < len - 1; i++) {
            if (tokenize) {
                sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word()));
            } else {
                sb.append(tokens.get(i).word());
            }
            sb.append(' ');
        }
        if (tokenize) {
            sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word()));
        } else {
            sb.append(tokens.get(len - 1).word());
        }
        sb.append('\n');
        tokWriter.write(sb.toString());

        // print parent pointers
        sb = new StringBuilder();
        for (int i = 0; i < len - 1; i++) {
            sb.append(parents[i]);
            sb.append(' ');
        }
        sb.append(parents[len - 1]);
        sb.append('\n');
        parentWriter.write(sb.toString());

        // print relations
        sb = new StringBuilder();
        for (int i = 0; i < len - 1; i++) {
            sb.append(relns[i]);
            sb.append(' ');
        }
        sb.append(relns[len - 1]);
        sb.append('\n');
        relWriter.write(sb.toString());

        count++;
        if (count % 1000 == 0) {
            double elapsed = (System.currentTimeMillis() - start) / 1000.0;
            System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed);
        }
    }

    long totalTimeMillis = System.currentTimeMillis() - start;
    System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n", count, totalTimeMillis / 1000.0,
            totalTimeMillis / (double) count);
    tokWriter.close();
    parentWriter.close();
    relWriter.close();
}
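
Note that this writes three line-aligned output files per input sentence: the (possibly re-tokenized) tokens, the 1-based index of each token's head (0 denotes the root, -1 a token left without a head), and the corresponding dependency relation labels.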

From source file: ConstituencyParse.java

License: Apache License

public List<HasWord> sentenceToTokens(String line) {
    List<HasWord> tokens = new ArrayList<>();
    if (tokenize) {
        PTBTokenizer<Word> tokenizer = new PTBTokenizer<>(new StringReader(line), new WordTokenFactory(), "");
        while (tokenizer.hasNext()) {
            tokens.add(tokenizer.next());
        }
    } else {
        for (String word : line.split(" ")) {
            tokens.add(new Word(word));
        }
    }

    return tokens;
}

From source file: qmul.util.treekernel.LexicalSimilarityCalc.java

License: Open Source License

// Returns the fraction of word stems in s2 that also occur in s1.
private static float lexSimilarity(String s1, String s2) {
    String[] split1 = s1.split("[.?!]");
    String[] split2 = s2.split("[.?!]");
    Set<String> stemsInFirst = new HashSet<String>();
    Set<String> stemsInSecond = new HashSet<String>();
    for (int i = 0; i < split1.length; i++) {
        PTBTokenizer<Word> tokenizer1 = new PTBTokenizer<Word>(new StringReader(split1[i]),
                new WordTokenFactory(), "tokenizeNLs=false");
        while (tokenizer1.hasNext()) {
            Word w = tokenizer1.next();
            // m is assumed to be a class-level edu.stanford.nlp.process.Morphology used for stemming.
            String stem = m.stem(w).word();

            stemsInFirst.add(stem);
        }
    }

    for (int j = 0; j < split2.length; j++) {
        PTBTokenizer<Word> tokenizer2 = new PTBTokenizer<Word>(new StringReader(split2[j]),
                new WordTokenFactory(), "tokenizeNLs=false");
        while (tokenizer2.hasNext()) {
            Word w = tokenizer2.next();
            String stem = m.stem(w).word();

            stemsInSecond.add(stem);
        }
    }

    float commonStems = 0;
    for (String curStem : stemsInSecond) {
        if (stemsInFirst.contains(curStem))
            commonStems++;
    }
    int secondSize = stemsInSecond.size();
    return secondSize > 0 ? commonStems / secondSize : 0;
}

From source file: similarity_measures.StringManipulation.java

License: Open Source License

public static ArrayList<String> getTokensListAndReplaceNumbers(String s1) {
    ArrayList<String> tokens = new ArrayList<String>();

    String options = "tokenizeNLs=false,americanize=false,ptb3Escaping=false,normalizeCurrency=false,normalizeFractions=false,normalizeParentheses=false,normalizeOtherBrackets=false,asciiQuotes=false,latexQuotes=false,unicodeQuotes=false,ptb3Ellipsis=false,unicodeEllipsis=false,ptb3Dashes=false,escapeForwardSlashAsterisk=false,untokenizable=noneDelete";
    PTBTokenizer<Word> tokenizer = new PTBTokenizer<>(new StringReader(s1), new WordTokenFactory(), options);

    boolean previousTokenIsNumber = false;
    while (tokenizer.hasNext()) {
        Word label = tokenizer.next();
        if (!label.value().trim().isEmpty()) {
            // A token counts as a number if, after stripping punctuation and
            // whitespace, it contains at least one digit and no letters.
            if (label.value().replaceAll("\\p{Punct}", "").replaceAll("\\p{Space}", "").trim()
                    .matches("(?=[^A-Za-z]+$).*[0-9].*")) {
                // Collapse runs of consecutive numeric tokens into a single placeholder.
                if (!previousTokenIsNumber) {
                    tokens.add("$num$");
                }
                previousTokenIsNumber = true;
            } else {
                previousTokenIsNumber = false;
                tokens.add(label.value().trim());
            }
        }
    }
    return tokens;
}
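
A hypothetical call illustrating the behavior (the input string is invented, and exact token boundaries depend on PTBTokenizer's rules):

// Purely numeric tokens such as "42" and "19.99" are each replaced by "$num$";
// runs of adjacent numeric tokens collapse into a single placeholder.
ArrayList<String> tokens = StringManipulation.getTokensListAndReplaceNumbers("Invoice 42 totals 19.99 dollars");
// Expected shape: [Invoice, $num$, totals, $num$, dollars]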