Example usage for edu.stanford.nlp.process PTBTokenizer PTBTokenizer


Introduction

On this page you can find example usages of the edu.stanford.nlp.process PTBTokenizer constructor.

Prototype

public PTBTokenizer(final Reader r, final LexedTokenFactory<T> tokenFactory, final String options) 

Document

Constructs a new PTBTokenizer with a custom LexedTokenFactory.
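Before the project examples below, here is a minimal sketch of the constructor used on its own. The class name PTBTokenizerDemo, the sample sentence, and the expected output are illustrative only. The options argument is a comma-separated string (the examples on this page use "invertible=true" and "untokenizable=allKeep"); an empty string selects the defaults.

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;

public class PTBTokenizerDemo {
    public static void main(String[] args) {
        // Build a tokenizer over a Reader; CoreLabelTokenFactory produces
        // CoreLabel tokens, and "invertible=true" additionally records the
        // original text and character offsets of each token.
        PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<>(
                new StringReader("Dr. Smith isn't here."),
                new CoreLabelTokenFactory(), "invertible=true");

        List<String> words = new ArrayList<>();
        while (tokenizer.hasNext()) {
            words.add(tokenizer.next().word());
        }
        System.out.println(words); // expected: [Dr., Smith, is, n't, here, .]
    }
}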

Usage

From source file:DependencyParse.java

License:Apache License

public static void main(String[] args) throws Exception {
    Properties props = StringUtils.argsToProperties(args);
    if (!props.containsKey("tokpath") || !props.containsKey("parentpath") || !props.containsKey("relpath")) {
        System.err.println(
                "usage: java DependencyParse -tokenize - -tokpath <tokpath> -parentpath <parentpath> -relpath <relpath>");
        System.exit(1);
    }

    boolean tokenize = props.containsKey("tokenize");

    String tokPath = props.getProperty("tokpath");
    String parentPath = props.getProperty("parentpath");
    String relPath = props.getProperty("relpath");

    BufferedWriter tokWriter = new BufferedWriter(new FileWriter(tokPath));
    BufferedWriter parentWriter = new BufferedWriter(new FileWriter(parentPath));
    BufferedWriter relWriter = new BufferedWriter(new FileWriter(relPath));

    MaxentTagger tagger = new MaxentTagger(TAGGER_MODEL);
    DependencyParser parser = DependencyParser.loadFromModelFile(PARSER_MODEL);
    Scanner stdin = new Scanner(System.in);
    int count = 0;
    long start = System.currentTimeMillis();
    while (stdin.hasNextLine()) {
        String line = stdin.nextLine();
        List<HasWord> tokens = new ArrayList<>();
        if (tokenize) {
            PTBTokenizer<Word> tokenizer = new PTBTokenizer<>(new StringReader(line), new WordTokenFactory(), "");
            while (tokenizer.hasNext()) {
                tokens.add(tokenizer.next());
            }
        } else {
            for (String word : line.split(" ")) {
                tokens.add(new Word(word));
            }
        }

        List<TaggedWord> tagged = tagger.tagSentence(tokens);

        int len = tagged.size();
        Collection<TypedDependency> tdl = parser.predict(tagged).typedDependencies();
        int[] parents = new int[len];
        for (int i = 0; i < len; i++) {
            // if a node has a parent of -1 at the end of parsing, then the node
            // has no parent.
            parents[i] = -1;
        }

        String[] relns = new String[len];
        for (TypedDependency td : tdl) {
            // let root have index 0
            int child = td.dep().index();
            int parent = td.gov().index();
            relns[child - 1] = td.reln().toString();
            parents[child - 1] = parent;
        }

        // print tokens
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < len - 1; i++) {
            if (tokenize) {
                sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word()));
            } else {
                sb.append(tokens.get(i).word());
            }
            sb.append(' ');
        }
        if (tokenize) {
            sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word()));
        } else {
            sb.append(tokens.get(len - 1).word());
        }
        sb.append('\n');
        tokWriter.write(sb.toString());

        // print parent pointers
        sb = new StringBuilder();
        for (int i = 0; i < len - 1; i++) {
            sb.append(parents[i]);
            sb.append(' ');
        }
        sb.append(parents[len - 1]);
        sb.append('\n');
        parentWriter.write(sb.toString());

        // print relations
        sb = new StringBuilder();
        for (int i = 0; i < len - 1; i++) {
            sb.append(relns[i]);
            sb.append(' ');
        }
        sb.append(relns[len - 1]);
        sb.append('\n');
        relWriter.write(sb.toString());

        count++;
        if (count % 1000 == 0) {
            double elapsed = (System.currentTimeMillis() - start) / 1000.0;
            System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed);
        }
    }

    long totalTimeMillis = System.currentTimeMillis() - start;
    System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n", count, totalTimeMillis / 1000.0,
            totalTimeMillis / (double) count);
    tokWriter.close();
    parentWriter.close();
    relWriter.close();
}
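Note the PTBTokenizer.ptbToken2Text calls above: when tokenizing, PTBTokenizer escapes some characters following Penn Treebank conventions (round brackets, for instance, become -LRB- and -RRB-), and ptbToken2Text maps such escaped tokens back to their original text before the token file is written.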

From source file:ConstituencyParse.java

License:Apache License

public List<HasWord> sentenceToTokens(String line) {
    List<HasWord> tokens = new ArrayList<>();
    if (tokenize) {
        PTBTokenizer<Word> tokenizer = new PTBTokenizer<>(new StringReader(line), new WordTokenFactory(), "");
        while (tokenizer.hasNext()) {
            tokens.add(tokenizer.next());
        }
    } else {
        for (String word : line.split(" ")) {
            tokens.add(new Word(word));
        }
    }

    return tokens;
}

From source file:de.andreasschoknecht.LS3.PNMLReader.java

License:Open Source License

/**
 * Creates the term lists for a process model (LS3Document) in a model collection. Adds the terms to the document itself as a bag of words and to
 * the HashSet of terms of the document collection. This method is used when parsing a document collection.
 *
 * @param labels The labels contained in the PNML file
 * @param ls3Document The LS3Document representation of the PNML file for updating the term list of the document
 * @param documentCollection The DocumentCollection for updating the term list of the whole collection
 * @throws IOException if the stop word file could not be read
 */
private void createTermLists(List<Object> labels, LS3Document ls3Document,
        DocumentCollection documentCollection) throws IOException {
    initializeWordList();

    ArrayList<String> tokens = new ArrayList<String>();
    String label = "";
    for (Object temp : labels) {
        Element value = (Element) temp;
        label = label + value.getText() + " ";
    }

    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(label), new CoreLabelTokenFactory(),
            "untokenizable=allKeep");
    while (ptbt.hasNext()) {
        tokens.add(ptbt.next().value());
    }

    for (int i = 0, j = tokens.size(); i < j; i++) {
        String bereinigt = tokens.get(i).toLowerCase();

        // Filter out empty tokens, stop words, and automatic tool labels
        if (!bereinigt.matches("(p|t)*([0-9]+)") && !stopwords.contains(bereinigt) && !bereinigt.equals("")) {
            String term = bereinigt.replaceAll("[0-9]+", "");
            ls3Document.addTerm(stemString(term));
            documentCollection.addTerm(stemString(term));
        }
    }
}

From source file:de.andreasschoknecht.LS3.PNMLReader.java

License:Open Source License

/**
 * Creates the term list for a process model (LS3Document). It adds the terms only to the document itself, as a bag of words.
 * This method is used when parsing a query model.
 *
 * @param labels The labels contained in the PNML file
 * @param ls3Document The LS3Document representation of the PNML file for updating the term list of the document
 * @throws IOException if the stop word file could not be read
 */
private void createTermLists(List<Object> labels, LS3Document ls3Document) throws IOException {
    initializeWordList();

    ArrayList<String> tokens = new ArrayList<String>();
    String label = "";
    for (Object temp : labels) {
        Element value = (Element) temp;
        label = label + value.getText() + " ";
    }

    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(label), new CoreLabelTokenFactory(),
            "untokenizable=allKeep");
    while (ptbt.hasNext()) {
        tokens.add(ptbt.next().value());
    }

    for (int i = 0, j = tokens.size(); i < j; i++) {
        String bereinigt = tokens.get(i).toLowerCase();

        // Filter out empty tokens, stop words, and automatic tool labels
        if (!bereinigt.matches("(p|t)*([0-9]+)") && !stopwords.contains(bereinigt) && !bereinigt.equals("")) {
            String term = bereinigt.replaceAll("[0-9]+", "");
            ls3Document.addTerm(stemString(term));
        }
    }
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPtbTransformer.java

License:Open Source License

@Override
public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException {
    Tokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(new StringReader(aInput.getDocumentText()),
            new CoreLabelTokenFactory(), "invertible");

    for (CoreLabel label : tokenizer.tokenize()) {
        replace(label.beginPosition(), label.endPosition(), label.word());
    }
}
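The "invertible" option used here makes the tokenizer keep the original text and character offsets of each token, which is what lets the replace call above write each normalized token back over the exact span it came from.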

From source file:gate.stanford.Tokenizer.java

License:Open Source License

@Override
public void execute() throws ExecutionException {
    // check the parameters
    if (document == null)
        throw new ExecutionException("No document to process!");

    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet outputAS = document.getAnnotations(outputASName);

    long startTime = System.currentTimeMillis();
    fireStatusChanged("Tokenising " + document.getName());
    fireProgressChanged(0);

    // tokenising goes here
    String rawText = "";
    try {
        rawText = document.getContent().getContent(0L, document.getContent().size()).toString();
    } catch (Exception e) {
        System.out.println("Document content offsets wrong: " + e);
    }

    PTBTokenizer<CoreLabel> ptbt;
    try {
        ptbt = new PTBTokenizer<CoreLabel>(new StringReader(rawText), new CoreLabelTokenFactory(),
                "invertible=true");
    } catch (Exception e) {
        System.out.println("Failed when calling tokenizer: " + e);
        return;
    }

    Long tokenStart;
    Long tokenEnd;
    Long prevTokenEnd = 0L; // this default value lets us capture leading spaces

    while (ptbt.hasNext()) {
        CoreLabel label = ptbt.next();
        tokenStart = (long) label.beginPosition();
        tokenEnd = (long) label.endPosition();

        SimpleFeatureMapImpl tokenMap = new SimpleFeatureMapImpl();

        // add the token annotation
        try {
            tokenMap.put(TOKEN_STRING_FEATURE,
                    document.getContent().getContent(tokenStart, tokenEnd).toString());
            outputAS.add(tokenStart, tokenEnd, tokenLabel, tokenMap);
        } catch (InvalidOffsetException e) {
            System.out.println("Token alignment problem:" + e);
        }

        // do we need to add a space annotation?
        if (tokenStart > prevTokenEnd) {
            try {
                outputAS.add(prevTokenEnd, tokenStart, spaceLabel, new SimpleFeatureMapImpl());
            } catch (InvalidOffsetException e) {
                System.out.println("Space token alignment problem:" + e);
            }

        }

        prevTokenEnd = tokenEnd;

    }

    fireProcessFinished();
    fireStatusChanged(document.getName() + " tokenised in "
            + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000)
            + " seconds!");
}

From source file:gr.aueb.cs.nlp.bioasq.classifiers.Baseline.java

public static ArrayList<String> tokenize(String sentence) {
    ArrayList<String> tokens = new ArrayList<String>();
    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(sentence), new CoreLabelTokenFactory(), "");
    while (ptbt.hasNext()) {
        CoreLabel label = ptbt.next();
        tokens.add(label.toString());
    }
    return tokens;
}

From source file:gr.aueb.cs.nlp.similarity.string.utils.DocPreprocessing.java

public static ArrayList<String> tokenizer(String text) {
    ArrayList<String> tokens = new ArrayList<String>();
    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(text), new CoreLabelTokenFactory(), "");
    while (ptbt.hasNext()) {
        CoreLabel label = ptbt.next();
        tokens.add(label.word());
    }
    return tokens;
}

From source file:hmt.hocmay.utility.Ultilities.java

public static ArrayList<String> getWords(String filePath) {
    ArrayList<String> str = new ArrayList<String>();
    PTBTokenizer<CoreLabel> ptbt;
    try {
        ptbt = new PTBTokenizer<>(new FileReader(filePath), new CoreLabelTokenFactory(),
                "untokenizable=allDelete");
        while (ptbt.hasNext()) {
            CoreLabel label = ptbt.next();
            str.add(label.toString().toLowerCase());
        }
        // Sort the words alphabetically (natural String ordering)
        Collections.sort(str);
        // Remove duplicate words (the list is sorted, so duplicates are adjacent)
        for (int i = 0; i < str.size() - 1; i++) {
            if (str.get(i).equals(str.get(i + 1))) {
                str.remove(i);
                i--;
            }
        }
    } catch (FileNotFoundException ex) {
        // If the file cannot be opened, return the empty list collected so far.
    }
    return str;
}

From source file:org.lambda3.text.simplification.discourse.utils.words.WordsUtils.java

License:Open Source License

public static List<Word> splitIntoWords(String sentence) {
    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(sentence), new CoreLabelTokenFactory(),
            "");/*from ww w  .j ava2s.c  o m*/
    List<Word> words = new ArrayList<>();

    while (ptbt.hasNext()) {
        CoreLabel label = ptbt.next();
        words.add(new Word(label));
    }

    return words;
}