Example usage for edu.stanford.nlp.process CoreLabelTokenFactory CoreLabelTokenFactory

List of usage examples for edu.stanford.nlp.process CoreLabelTokenFactory CoreLabelTokenFactory

Introduction

In this page you can find the example usage for edu.stanford.nlp.process CoreLabelTokenFactory CoreLabelTokenFactory.

Prototype

public CoreLabelTokenFactory() 

Source Link

Document

Constructor for a new token factory which will add in the word, the "current" annotation, and the begin/end position annotations.

Usage

From source file:csav2.pkg0.ParserTagging.java

/**
 * Annotates the given text with the configured pipeline and returns the
 * CC-processed typed dependencies of each sentence, one sentence per line.
 *
 * @param text raw input text to annotate and parse
 * @return concatenated typed-dependency listings, one "\n"-terminated line per sentence
 */
public String dependency(String text) {
    StringBuilder output = new StringBuilder();
    Annotation document = new Annotation(text);
    try {
        pipeline.annotate(document);
    } catch (Exception e) {
        System.out.println("Exception while calling method annotate(Method dependency):" + e);
    }
    // The factory is stateless; create it once instead of once per sentence.
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    // If annotate() failed above, the sentence annotation may be absent — guard against null.
    if (sentences != null) {
        for (CoreMap sentence : sentences) {
            List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence.toString()))
                    .tokenize();
            Tree tree = lp.apply(wordList);
            GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
            Collection<?> cc = gs.typedDependenciesCCprocessed(true);
            output.append(cc).append('\n');
        }
    }
    return output.toString();
}

From source file:de.andreasschoknecht.LS3.PNMLReader.java

License:Open Source License

/**
 * Creates the term lists for a process model (LS3Document) in a model collection. Adds the terms to the document itself as Bag-of-Words and adds the terms to
 * the HashSet of terms of the document collection. This method is used when parsing a document collection.
 *
 * @param labels The labels contained in the PNML file
 * @param ls3Document The LS3Document representation of the PNML file for updating the term list of the document
 * @param documentCollection The DocumentCollection for updating the term list of the whole collection
 * @throws IOException if stop word file could not be read
 */
private void createTermLists(List<Object> labels, LS3Document ls3Document,
        DocumentCollection documentCollection) throws IOException {
    initializeWordList();

    // Concatenate all label texts into a single space-separated string for tokenization.
    StringBuilder labelText = new StringBuilder();
    for (Object temp : labels) {
        labelText.append(((Element) temp).getText()).append(' ');
    }

    ArrayList<String> tokens = new ArrayList<String>();
    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(labelText.toString()),
            new CoreLabelTokenFactory(), "untokenizable=allKeep");
    while (ptbt.hasNext()) {
        tokens.add(ptbt.next().value());
    }

    for (String token : tokens) {
        String cleaned = token.toLowerCase();

        // Clear tokens of empty tokens, stop words, and automatic tool labels (e.g. "p1", "t42")
        if (!cleaned.matches("(p|t)*([0-9]+)") && !stopwords.contains(cleaned) && !cleaned.isEmpty()) {
            String term = cleaned.replaceAll("[0-9]+", "");
            // Stem once and reuse; the original stemmed the same term twice.
            String stemmed = stemString(term);
            ls3Document.addTerm(stemmed);
            documentCollection.addTerm(stemmed);
        }
    }
}

From source file:de.andreasschoknecht.LS3.PNMLReader.java

License:Open Source License

/**
 * Creates the term list for a process model (LS3Document). It only adds the terms to the document itself as Bag-of-Words.
 * This method is used when parsing a query model.
 *
 * @param labels The labels contained in the PNML file
 * @param ls3Document The LS3Document representation of the PNML file for updating the term list of the document
 * @throws IOException if stop word file could not be read
 */
private void createTermLists(List<Object> labels, LS3Document ls3Document) throws IOException {
    initializeWordList();

    // Concatenate all label texts into a single space-separated string for tokenization.
    StringBuilder labelText = new StringBuilder();
    for (Object temp : labels) {
        labelText.append(((Element) temp).getText()).append(' ');
    }

    ArrayList<String> tokens = new ArrayList<String>();
    PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new StringReader(labelText.toString()),
            new CoreLabelTokenFactory(), "untokenizable=allKeep");
    while (ptbt.hasNext()) {
        tokens.add(ptbt.next().value());
    }

    for (String token : tokens) {
        String cleaned = token.toLowerCase();

        // Clear tokens of empty tokens, stop words, and automatic tool labels (e.g. "p1", "t42")
        if (!cleaned.matches("(p|t)*([0-9]+)") && !stopwords.contains(cleaned) && !cleaned.isEmpty()) {
            String term = cleaned.replaceAll("[0-9]+", "");
            ls3Document.addTerm(stemString(term));
        }
    }
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPtbTransformer.java

License:Open Source License

@Override
public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException {
    // Tokenize the input document with an invertible PTB tokenizer (offsets preserved),
    // then write each token's surface form back over its original character span.
    Tokenizer<CoreLabel> ptb = new PTBTokenizer<CoreLabel>(new StringReader(aInput.getDocumentText()),
            new CoreLabelTokenFactory(), "invertible");
    List<CoreLabel> tokens = ptb.tokenize();
    for (CoreLabel token : tokens) {
        replace(token.beginPosition(), token.endPosition(), token.word());
    }
}

From source file:DependencyParser.Parser.java

/**
 * Parses the given text with the English PCFG model, prints its CC-processed
 * typed dependencies, writes the tree (Penn + collapsed dependencies) to a
 * fixed output file, and renders the dependency graph as an image.
 *
 * @param text raw sentence text to tokenize and parse
 */
public void CallParser(String text) // start of the main method

{
    try {
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
        LexicalizedParser lp = LexicalizedParser
                .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
        lp.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" });
        TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize();
        Tree tree = lp.apply(wordList);

        GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
        Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed(true);
        System.out.println(tdl);

        // try-with-resources guarantees the writer is closed even if printTree throws.
        try (PrintWriter pw = new PrintWriter(
                "H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\Text-Parsed.txt")) {
            TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
            tp.printTree(tree, pw);
        }
        Main.writeImage(tree, tdl, "H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\image.png", 3);
        assert (new File("image.png").exists());
    } catch (FileNotFoundException f) {
        // Previously swallowed silently; record the failure so a missing output
        // directory does not fail without a trace.
        Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, f);
    } catch (Exception ex) {
        Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex);
    }

}

From source file:edu.illinois.cs.cogcomp.pipeline.handlers.StanfordParseHandler.java

License:Open Source License

/**
 * Converts the TOKENS and SENTENCE views of a TextAnnotation into Stanford
 * CoreMap sentences whose tokens are CoreLabels carrying character offsets
 * taken from the raw document text.
 *
 * NOTE(review): assumes the SENTENCE view has at least one constituent and
 * that token constituents appear in document order — confirm with callers.
 */
static List<CoreMap> buildStanfordSentences(TextAnnotation ta) {
    View tokens = ta.getView(ViewNames.TOKENS);
    View sentences = ta.getView(ViewNames.SENTENCE);
    String rawText = ta.getText();

    List<CoreMap> stanfordSentences = new LinkedList<>();
    List<CoreLabel> stanfordTokens = new LinkedList<>();
    int tokIndex = 0;
    int sentIndex = 0;
    Constituent currentSentence = sentences.getConstituents().get(0);
    String sentText = rawText.substring(currentSentence.getStartCharOffset(),
            currentSentence.getEndCharOffset());

    CoreLabelTokenFactory tf = new CoreLabelTokenFactory();

    for (Constituent tok : tokens.getConstituents()) {
        // A token starting at or past the current sentence's end span closes that
        // sentence: flush its accumulated tokens and advance to the next sentence.
        if (tok.getStartSpan() >= currentSentence.getEndSpan()) {
            CoreMap stanfordSentence = buildStanfordSentence(currentSentence, sentText, sentIndex++,
                    stanfordTokens);
            stanfordSentences.add(stanfordSentence);
            stanfordTokens = new LinkedList<>();
            currentSentence = sentences.getConstituents().get(sentIndex);
            sentText = rawText.substring(currentSentence.getStartCharOffset(),
                    currentSentence.getEndCharOffset());
        }
        int tokStart = tok.getStartCharOffset();
        int tokLength = tok.getEndCharOffset() - tokStart;

        // Surface form is sliced from the raw text so offsets stay exact.
        String form = rawText.substring(tokStart, tok.getEndCharOffset());

        // Token index increases across the whole document, not per sentence.
        CoreLabel stanfordTok = tf.makeToken(form, tokStart, tokLength);
        stanfordTok.setIndex(tokIndex++);
        stanfordTokens.add(stanfordTok);

    }
    // should be one last sentence
    CoreMap stanfordSentence = buildStanfordSentence(currentSentence, sentText, sentIndex, stanfordTokens);
    stanfordSentences.add(stanfordSentence);
    return stanfordSentences;
}

From source file:Engines.Test.StanfordParser.TreeHandling.java

License:Open Source License

/**
 * Parses the given text with the English PCFG model and derives its
 * CC-processed typed dependencies. The results are currently unused; this
 * method only exercises the parsing pipeline.
 */
public static void test(String text) {
    LexicalizedParser parser = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    parser.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" });
    TokenizerFactory<CoreLabel> factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> words = factory.getTokenizer(new StringReader(text)).tokenize();
    Tree parseTree = parser.apply(words);
    TreebankLanguagePack langPack = new PennTreebankLanguagePack();
    GrammaticalStructureFactory structureFactory = langPack.grammaticalStructureFactory();
    GrammaticalStructure structure = structureFactory.newGrammaticalStructure(parseTree);
    Collection<TypedDependency> dependencies = structure.typedDependenciesCCprocessed(true);

}

From source file:englishparser.EnglishParser.java

/**
 * demoAPI demonstrates other ways of calling the parser with already
 * tokenized text, or in some cases, raw text that needs to be tokenized as
 * a single sentence. Output is handled with a TreePrint object. Note that
 * the options used when creating the TreePrint can determine what results
 * to print out. Once again, one can capture the output by passing a
 * PrintWriter to TreePrint.printTree./*from   w w w.  j  a v  a  2s  .c o  m*/
 */
public static void demoAPI(LexicalizedParser lp) {
    // This option shows parsing a list of correctly tokenized words
    String[] sent = { "This", "is", "an", "easy", "sentence", "." };
    List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
    Tree parse = lp.apply(rawWords);
    parse.pennPrint();
    System.out.println();

    // This option shows loading and using an explicit tokenizer
    String sent2 = "This is another sentence.";
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2));
    List<CoreLabel> rawWords2 = tok.tokenize();
    parse = lp.apply(rawWords2);

    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    System.out.println();

    // You can also use a TreePrint object to print trees and dependencies
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    tp.printTree(parse);
}

From source file:flight_ranker.TaggerDemo2.java

/**
 * Tags the sentences of a file with a Maxent POS tagger and prints them to
 * stdout, then prints the adjectives of one extra hard-coded sentence.
 *
 * @param args args[0] = tagger model file, args[1] = UTF-8 text file to tag
 * @throws Exception if the model or input file cannot be read
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
            "untokenizable=noneKeep");
    // try-with-resources closes the input file and flushes/closes the writer even
    // if tagging throws; the original leaked both streams on any exception.
    try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
            PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
        DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
        documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
        for (List<HasWord> sentence : documentPreprocessor) {
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            pw.println(Sentence.listToString(tSentence, false));
        }

        // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
        List<HasWord> sent = Sentence.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",",
                "green", "grass", ".");
        List<TaggedWord> taggedSent = tagger.tagSentence(sent);
        for (TaggedWord tw : taggedSent) {
            if (tw.tag().startsWith("JJ")) {
                pw.println(tw.word());
            }
        }
    }
}

From source file:gate.stanford.Tokenizer.java

License:Open Source License

/**
 * Tokenises the document with an invertible PTB tokenizer and adds a Token
 * annotation (with its string feature) per token plus a Space annotation for
 * each gap between consecutive tokens, reporting progress and timing.
 *
 * @throws ExecutionException if no document is set on this PR
 */
@Override
public void execute() throws ExecutionException {
    // check the parameters
    if (document == null)
        throw new ExecutionException("No document to process!");

    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet outputAS = document.getAnnotations(outputASName);

    long startTime = System.currentTimeMillis();
    fireStatusChanged("Tokenising " + document.getName());
    fireProgressChanged(0);

    // tokenising goes here
    String rawText = "";
    try {
        // Long.valueOf replaces the deprecated (and since-removed) new Long(...) constructor.
        rawText = document.getContent().getContent(Long.valueOf(0), document.getContent().size()).toString();
    } catch (Exception e) {
        System.out.println("Document content offsets wrong: " + e);
    }

    PTBTokenizer<CoreLabel> ptbt;
    try {
        // "invertible=true" keeps begin/end character positions on each CoreLabel.
        ptbt = new PTBTokenizer<CoreLabel>(new StringReader(rawText), new CoreLabelTokenFactory(),
                "invertible=true");
    } catch (Exception e) {
        System.out.println("Failed when calling tokenizer: " + e);
        return;
    }

    Long tokenStart;
    Long tokenEnd;
    Long prevTokenEnd = Long.valueOf(0); // this default value lets us capture leading spaces

    while (ptbt.hasNext()) {
        CoreLabel label = ptbt.next();
        tokenStart = Long.valueOf(label.beginPosition());
        tokenEnd = Long.valueOf(label.endPosition());

        SimpleFeatureMapImpl tokenMap = new SimpleFeatureMapImpl();

        // add the token annotation
        try {
            tokenMap.put(TOKEN_STRING_FEATURE,
                    document.getContent().getContent(tokenStart, tokenEnd).toString());
            outputAS.add(tokenStart, tokenEnd, tokenLabel, tokenMap);
        } catch (InvalidOffsetException e) {
            System.out.println("Token alignment problem:" + e);
        }

        // do we need to add a space annotation?
        if (tokenStart > prevTokenEnd) {
            try {
                outputAS.add(prevTokenEnd, tokenStart, spaceLabel, new SimpleFeatureMapImpl());
            } catch (InvalidOffsetException e) {
                System.out.println("Space token alignment problem:" + e);
            }

        }

        prevTokenEnd = tokenEnd;

    }

    fireProcessFinished();
    fireStatusChanged(document.getName() + " tokenised in "
            + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000)
            + " seconds!");
}