Example usage for edu.stanford.nlp.process PTBTokenizer PTBTokenizer


Introduction

On this page you can find example usages of the edu.stanford.nlp.process PTBTokenizer constructor.

Prototype

public PTBTokenizer(final Reader r, final LexedTokenFactory<T> tokenFactory, final String options) 

Document

Constructs a new PTBTokenizer with a custom LexedTokenFactory.
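Before the project examples below, here is a minimal sketch of the constructor used on its own. The class name PTBTokenizerDemo, the sample sentence, and the expected output are illustrative only. The options argument is a comma-separated string (the examples on this page use "invertible=true" and "untokenizable=allKeep"); an empty string selects the defaults.

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;

public class PTBTokenizerDemo {
    public static void main(String[] args) {
        // Build a tokenizer over a Reader; CoreLabelTokenFactory produces
        // CoreLabel tokens, and "invertible=true" additionally records the
        // original text and character offsets of each token.
        PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<>(
                new StringReader("Dr. Smith isn't here."),
                new CoreLabelTokenFactory(), "invertible=true");

        List<String> words = new ArrayList<>();
        while (tokenizer.hasNext()) {
            words.add(tokenizer.next().word());
        }
        System.out.println(words); // expected: [Dr., Smith, is, n't, here, .]
    }
}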

Usage

From source file:DependencyParse.java

License:Apache License

public static void main(String[] args) throws Exception {
    Properties props = StringUtils.argsToProperties(args);
    if (!props.containsKey("tokpath") || !props.containsKey("parentpath") || !props.containsKey("relpath")) {
        System.err.println(
                "usage: java DependencyParse -tokenize - -tokpath <tokpath> -parentpath <parentpath> -relpath <relpath>");
        System.exit(1);
    }

    boolean tokenize = props.containsKey("tokenize");

    String tokPath = props.getProperty("tokpath");
    String parentPath = props.getProperty("parentpath");
    String relPath = props.getProperty("relpath");

    BufferedWriter tokWriter = new BufferedWriter(new FileWriter(tokPath));
    BufferedWriter parentWriter = new BufferedWriter(new FileWriter(parentPath));
    BufferedWriter relWriter = new BufferedWriter(new FileWriter(relPath));

    MaxentTagger tagger = new MaxentTagger(TAGGER_MODEL);
    DependencyParser parser = DependencyParser.loadFromModelFile(PARSER_MODEL);
    Scanner stdin = new Scanner(System.in);
    int count = 0;
    long start = System.currentTimeMillis();
    while (stdin.hasNextLine()) {
        String line = stdin.nextLine();
        List<HasWord> tokens = new ArrayList<>();
        if (tokenize) {
            PTBTokenizer<Word> tokenizer = new PTBTokenizer<>(new StringReader(line), new WordTokenFactory(), "");
            while (tokenizer.hasNext()) {
                tokens.add(tokenizer.next());
            }
        } else {
            for (String word : line.split(" ")) {
                tokens.add(new Word(word));
            }
        }

        List<TaggedWord> tagged = tagger.tagSentence(tokens);

        int len = tagged.size();
        Collection<TypedDependency> tdl = parser.predict(tagged).typedDependencies();
        int[] parents = new int[len];
        for (int i = 0; i < len; i++) {
            // if a node has a parent of -1 at the end of parsing, then the node
            // has no parent.
            parents[i] = -1;
        }

        String[] relns = new String[len];
        for (TypedDependency td : tdl) {
            // let root have index 0
            int child = td.dep().index();
            int parent = td.gov().index();
            relns[child - 1] = td.reln().toString();
            parents[child - 1] = parent;
        }

        // print tokens
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < len - 1; i++) {
            if (tokenize) {
                sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word()));
            } else {
                sb.append(tokens.get(i).word());
            }
            sb.append(' ');
        }
        if (tokenize) {
            sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word()));
        } else {
            sb.append(tokens.get(len - 1).word());
        }
        sb.append('\n');
        tokWriter.write(sb.toString());

        // print parent pointers
        sb = new StringBuilder();
        for (int i = 0; i < len - 1; i++) {
            sb.append(parents[i]);
            sb.append(' ');
        }
        sb.append(parents[len - 1]);
        sb.append('\n');
        parentWriter.write(sb.toString());

        // print relations
        sb = new StringBuilder();
        for (int i = 0; i < len - 1; i++) {
            sb.append(relns[i]);
            sb.append(' ');
        }
        sb.append(relns[len - 1]);
        sb.append('\n');
        relWriter.write(sb.toString());

        count++;
        if (count % 1000 == 0) {
            double elapsed = (System.currentTimeMillis() - start) / 1000.0;
            System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed);
        }
    }

    long totalTimeMillis = System.currentTimeMillis() - start;
    System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n", count, totalTimeMillis / 1000.0,
            totalTimeMillis / (double) count);
    tokWriter.close();
    parentWriter.close();
    relWriter.close();
}
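Note the PTBTokenizer.ptbToken2Text calls above: when tokenizing, PTBTokenizer escapes some characters following Penn Treebank conventions (round brackets, for instance, become -LRB- and -RRB-), and ptbToken2Text maps such escaped tokens back to their original text before the token file is written.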

From source file:ConstituencyParse.java

License:Apache License

public List<HasWord> sentenceToTokens(String line) {
    List<HasWord> tokens = new ArrayList<>();
    if (tokenize) {
        PTBTokenizer<Word> tokenizer = new PTBTokenizer<>(new StringReader(line), new WordTokenFactory(), "");
        while (tokenizer.hasNext()) {
            tokens.add(tokenizer.next());
        }
    } else {
        for (String word : line.split(" ")) {
            tokens.add(new Word(word));
        }
    }

    return tokens;
}

From source file:de.andreasschoknecht.LS3.PNMLReader.java

License:Open Source License

/**
 * Creates the term lists for a process model (LS3Document) in a model collection. Adds the terms to the document itself as a bag of words and to
 * the HashSet of terms of the document collection. This method is used when parsing a document collection.
 *
 * @param labels The labels contained in the PNML file
 * @param ls3Document The LS3Document representation of the PNML file for updating the term list of the document
 * @param documentCollection The DocumentCollection for updating the term list of the whole collection
 * @throws IOException if the stop word file could not be read
 */
private void createTermLists(List<Object> labels, LS3Document ls3Document,
        DocumentCollection documentCollection) throws IOException {
    initializeWordList();

    ArrayList<String> tokens = new ArrayList<String>();
    String label = "";
    for (Object temp : labels) {
        Element value = (Element) temp;
        label = label + value.getText() + " ";
    }

    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(label), new CoreLabelTokenFactory(),
            "untokenizable=allKeep");
    while (ptbt.hasNext()) {
        tokens.add(ptbt.next().value());
    }

    for (int i = 0, j = tokens.size(); i < j; i++) {
        String bereinigt = tokens.get(i).toLowerCase();

        // Filter out empty tokens, stop words, and automatic tool labels
        if (!bereinigt.matches("(p|t)*([0-9]+)") && !stopwords.contains(bereinigt) && !bereinigt.equals("")) {
            String term = bereinigt.replaceAll("[0-9]+", "");
            ls3Document.addTerm(stemString(term));
            documentCollection.addTerm(stemString(term));
        }
    }
}

From source file:de.andreasschoknecht.LS3.PNMLReader.java

License:Open Source License

/**
 * Creates the term list for a process model (LS3Document). It adds the terms only to the document itself, as a bag of words.
 * This method is used when parsing a query model.
 *
 * @param labels The labels contained in the PNML file
 * @param ls3Document The LS3Document representation of the PNML file for updating the term list of the document
 * @throws IOException if the stop word file could not be read
 */
private void createTermLists(List<Object> labels, LS3Document ls3Document) throws IOException {
    initializeWordList();

    ArrayList<String> tokens = new ArrayList<String>();
    String label = "";
    for (Object temp : labels) {
        Element value = (Element) temp;
        label = label + value.getText() + " ";
    }

    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(label), new CoreLabelTokenFactory(),
            "untokenizable=allKeep");
    while (ptbt.hasNext()) {
        tokens.add(ptbt.next().value());
    }

    for (int i = 0, j = tokens.size(); i < j; i++) {
        String bereinigt = tokens.get(i).toLowerCase();

        // Filter out empty tokens, stop words, and automatic tool labels
        if (!bereinigt.matches("(p|t)*([0-9]+)") && !stopwords.contains(bereinigt) && !bereinigt.equals("")) {
            String term = bereinigt.replaceAll("[0-9]+", "");
            ls3Document.addTerm(stemString(term));
        }
    }
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPtbTransformer.java

License:Open Source License

@Override
public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException {
    Tokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(new StringReader(aInput.getDocumentText()),
            new CoreLabelTokenFactory(), "invertible");

    for (CoreLabel label : tokenizer.tokenize()) {
        replace(label.beginPosition(), label.endPosition(), label.word());
    }
}
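The "invertible" option used here makes the tokenizer keep the original text and character offsets of each token, which is what lets the replace call above write each normalized token back over the exact span it came from.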

From source file:gate.stanford.Tokenizer.java

License:Open Source License

@Override
public void execute() throws ExecutionException {
    // check the parameters
    if (document == null)
        throw new ExecutionException("No document to process!");

    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet outputAS = document.getAnnotations(outputASName);

    long startTime = System.currentTimeMillis();
    fireStatusChanged("Tokenising " + document.getName());
    fireProgressChanged(0);

    // tokenising goes here
    String rawText = "";
    try {
        rawText = document.getContent().getContent(0L, document.getContent().size()).toString();
    } catch (Exception e) {
        System.out.println("Document content offsets wrong: " + e);
    }

    PTBTokenizer<CoreLabel> ptbt;
    try {
        ptbt = new PTBTokenizer<CoreLabel>(new StringReader(rawText), new CoreLabelTokenFactory(),
                "invertible=true");
    } catch (Exception e) {
        System.out.println("Failed when calling tokenizer: " + e);
        return;
    }

    Long tokenStart;
    Long tokenEnd;
    Long prevTokenEnd = 0L; // this default value lets us capture leading spaces

    while (ptbt.hasNext()) {
        CoreLabel label = ptbt.next();
        tokenStart = (long) label.beginPosition();
        tokenEnd = (long) label.endPosition();

        SimpleFeatureMapImpl tokenMap = new SimpleFeatureMapImpl();

        // add the token annotation
        try {
            tokenMap.put(TOKEN_STRING_FEATURE,
                    document.getContent().getContent(tokenStart, tokenEnd).toString());
            outputAS.add(tokenStart, tokenEnd, tokenLabel, tokenMap);
        } catch (InvalidOffsetException e) {
            System.out.println("Token alignment problem:" + e);
        }

        // do we need to add a space annotation?
        if (tokenStart > prevTokenEnd) {
            try {
                outputAS.add(prevTokenEnd, tokenStart, spaceLabel, new SimpleFeatureMapImpl());
            } catch (InvalidOffsetException e) {
                System.out.println("Space token alignment problem:" + e);
            }

        }

        prevTokenEnd = tokenEnd;

    }

    fireProcessFinished();
    fireStatusChanged(document.getName() + " tokenised in "
            + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000)
            + " seconds!");
}

From source file:gr.aueb.cs.nlp.bioasq.classifiers.Baseline.java

public static ArrayList<String> tokenize(String sentence) {
    ArrayList<String> tokens = new ArrayList<String>();
    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(sentence), new CoreLabelTokenFactory(), "");
    while (ptbt.hasNext()) {
        CoreLabel label = ptbt.next();
        tokens.add(label.toString());
    }
    return tokens;
}

From source file:gr.aueb.cs.nlp.similarity.string.utils.DocPreprocessing.java

public static ArrayList<String> tokenizer(String text) {
    ArrayList<String> tokens = new ArrayList<String>();
    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(text), new CoreLabelTokenFactory(), "");
    while (ptbt.hasNext()) {
        CoreLabel label = ptbt.next();
        tokens.add(label.word());
    }
    return tokens;
}

From source file:hmt.hocmay.utility.Ultilities.java

public static ArrayList<String> getWords(String filePath) {
    ArrayList<String> str = new ArrayList<String>();
    PTBTokenizer<CoreLabel> ptbt;
    try {
        ptbt = new PTBTokenizer<>(new FileReader(filePath), new CoreLabelTokenFactory(),
                "untokenizable=allDelete");
        while (ptbt.hasNext()) {
            CoreLabel label = ptbt.next();
            str.add(label.toString().toLowerCase());
        }
        // Sort the words alphabetically (natural String ordering)
        Collections.sort(str);
        // Remove duplicate words (the list is sorted, so duplicates are adjacent)
        for (int i = 0; i < str.size() - 1; i++) {
            if (str.get(i).equals(str.get(i + 1))) {
                str.remove(i);
                i--;
            }
        }
    } catch (FileNotFoundException ex) {
        // If the file cannot be opened, return the empty list collected so far.
    }
    return str;
}

From source file:org.lambda3.text.simplification.discourse.utils.words.WordsUtils.java

License:Open Source License

public static List<Word> splitIntoWords(String sentence) {
    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(sentence), new CoreLabelTokenFactory(),
            "");/*from ww w  .j ava2s.c  o m*/
    List<Word> words = new ArrayList<>();

    while (ptbt.hasNext()) {
        CoreLabel label = ptbt.next();
        words.add(new Word(label));
    }

    return words;
}