Example usage for edu.stanford.nlp.process TokenizerFactory getTokenizer

Introduction

On this page you can find example usage for edu.stanford.nlp.process TokenizerFactory getTokenizer.

Prototype

Tokenizer<T> getTokenizer(Reader r);

Document

Get a tokenizer for this reader.
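
A minimal, self-contained sketch of the call pattern (the class name and sample text here are illustrative, not taken from the examples below):

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

import java.io.StringReader;
import java.util.List;

public class GetTokenizerSketch {
    public static void main(String[] args) {
        // Obtain a factory, then ask it for a Tokenizer over any Reader.
        TokenizerFactory<Word> factory = PTBTokenizer.factory();
        Tokenizer<Word> tokenizer = factory.getTokenizer(new StringReader("This is a simple sentence."));
        List<Word> tokens = tokenizer.tokenize();
        for (Word token : tokens) {
            System.out.println(token.word());
        }
    }
}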

Usage

From source file:com.left8.evs.preprocessingmodule.nlp.Tokenizer.java

License:Open Source License

/**
 * Constructor with minimum parameters. It only tokenizes a given String
 * without removing stopwords, name handles etc.
 * @param config A Config object.
 * @param text The text to be tokenized.
 */
public Tokenizer(Config config, String text) {
    this.config = config;
    TokenizerFactory<Word> tf = PTBTokenizer.factory();
    List<Word> tokens = tf.getTokenizer(new StringReader(text)).tokenize();
    for (Word token : tokens) {
        cleanTokens.add(token.toString());
    }
    //        String[] tokens = text.split(" ");
    //        cleanTokens.addAll(Arrays.asList(tokens));
}

From source file:com.left8.evs.preprocessingmodule.nlp.Tokenizer.java

License:Open Source License

/**
 * Public constructor. It tokenizes a given String and separates hashtags,
 * name handles, URLs and stopwords and stores them into different lists.
 * @param config A Config object.
 * @param text The text to be tokenized.
 * @param sw A StopWords handle.
 */
public Tokenizer(Config config, String text, StopWords sw) {
    TokenizerFactory<Word> tf = PTBTokenizer.factory();

    List<Word> tokens = tf.getTokenizer(new StringReader(text)).tokenize();
    this.config = config;
    numberOfTokens = tokens.size();
    tokens.stream().map((word) -> word.toString()).forEach((token) -> {

        if (isHashtag(token)) {
            hashtags.add(token);
            cleanTokensAndHashtags.add(token.replace("#", "")); //Remove '#'
        } else if (isNameHandle(token)) {
            nameHandles.add(token.replace("@", "")); //Remove '@' character
        } else if (isURL(token)) {
            urls.add(token);
        } else if (sw.isStopWord(token)) { //Common stopwords
            stopWords.add(token);
        } else if (isCommonSymbol(token)) { //Common symbols not caught before
            symbolsAndNonPrintableChars.add(token);
        } else if (sw.isNonPrintableCharacter(
                "\\u" + Integer.toHexString(token.toCharArray()[0]).substring(1))) { //Non-printable characters
            symbolsAndNonPrintableChars.add(token);
        } else {
            cleanTokens.add(token);
            cleanTokensAndHashtags.add(token);
        }
    });
}

From source file:csav2.pkg0.ParserTagging.java

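/**
 * Parses the given text and returns its CC-processed typed dependencies,
 * one sentence per line.
 */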
public String dependency(String text) {
    String output = "";
    Annotation document = new Annotation(text);
    try {
        pipeline.annotate(document);
    } catch (Exception e) {
        System.out.println("Exception while calling method annotate(Method dependency):" + e);
    }
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence.toString())).tokenize();
        //System.out.println(wordList.toString());
        Tree tree = lp.apply(wordList);
        GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
        Collection<TypedDependency> cc = gs.typedDependenciesCCprocessed(true);
        output += cc.toString() + "\n";
    }
    //System.out.println(output);
    return output;
}

From source file:DependencyParser.Parser.java

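/**
 * Parses the given text with the English PCFG model, prints its typed
 * dependencies, and writes the parse tree and a dependency image to disk.
 */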
public void CallParser(String text) {
    try {

        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
        LexicalizedParser lp = LexicalizedParser
                .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
        lp.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" });
        TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize();
        Tree tree = lp.apply(wordList);

        GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
        Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed(true);
        System.out.println(tdl);

        PrintWriter pw = new PrintWriter("H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\Text-Parsed.txt");
        TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
        tp.printTree(tree, pw);

        pw.close();
        Main.writeImage(tree, tdl, "H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\image.png", 3);
        assert (new File("image.png").exists());
    } catch (FileNotFoundException f) {
        System.err.println("Could not open output file: " + f.getMessage());
    } catch (Exception ex) {
        Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex);
    }

}

From source file:Engines.Test.StanfordParser.TreeHandling.java

License:Open Source License

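/**
 * Tokenizes and parses the given text and builds its CC-processed typed
 * dependencies; the results are discarded, so this only exercises the pipeline.
 */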
public static void test(String text) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    lp.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" });
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize();
    Tree tree = lp.apply(wordList);
    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed(true);

}

From source file:englishparser.EnglishParser.java

/**
 * demoAPI demonstrates other ways of calling the parser with already
 * tokenized text, or in some cases, raw text that needs to be tokenized as
 * a single sentence. Output is handled with a TreePrint object. Note that
 * the options used when creating the TreePrint can determine what results
 * to print out. Once again, one can capture the output by passing a
 * PrintWriter to TreePrint.printTree.
 */
public static void demoAPI(LexicalizedParser lp) {
    // This option shows parsing a list of correctly tokenized words
    String[] sent = { "This", "is", "an", "easy", "sentence", "." };
    List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
    Tree parse = lp.apply(rawWords);
    parse.pennPrint();
    System.out.println();

    // This option shows loading and using an explicit tokenizer
    String sent2 = "This is another sentence.";
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2));
    List<CoreLabel> rawWords2 = tok.tokenize();
    parse = lp.apply(rawWords2);

    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    System.out.println();

    // You can also use a TreePrint object to print trees and dependencies
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    tp.printTree(parse);
}

From source file:ie.pars.aclrdtec.fileutils.GetStatRawTextFile.java

License:Open Source License

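/**
 * Walks a corpus of XML files, tokenizes the text content of every <S>
 * element, and prints the total sentence and token counts.
 */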
public static void main(String[] ss) throws SAXException, ParserConfigurationException, IOException {

    String input = ss[0]; //path to the input folder

    GetFiles gf = new GetFiles();
    gf.getCorpusFiles(input);
    List<String> annotationFiles = gf.getFiles();
    System.out.println("There are " + annotationFiles.size() + " files to check!");
    TokenizerFactory<Word> newTokenizerFactory = PTBTokenizer.PTBTokenizerFactory.newTokenizerFactory();
    int sentenceNumber = 0;
    int wordSize = 0;
    for (String file : annotationFiles) {
        File f = new File(file);
        Document makeDOM = XMLMethod.makeDOM(file);
        NodeList elementsByTagName = makeDOM.getElementsByTagName("S");
        sentenceNumber += elementsByTagName.getLength();
        for (int i = 0; i < elementsByTagName.getLength(); i++) {
            String sentence = elementsByTagName.item(i).getTextContent();
            StringReader sr = new StringReader(sentence);
            Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr);
            List<Word> tokenize = tokenizer.tokenize();
            wordSize += tokenize.size();
        }

    }
    System.out.println(sentenceNumber);
    System.out.println(wordSize);
}

From source file:ie.pars.bnc.preprocess.ProcessNLP.java

License:Open Source License

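/**
 * Tokenizes, tags, and parses a single sentence, then renders it as an
 * XML <s> element, one token per line with its head index and dependency relation.
 */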
private static StringBuilder parseTheSentence(String sentence, Morphology morphology, MaxentTagger posTagger,
        ParserGrammar parser, String sid) {
    TokenizerFactory<Word> newTokenizerFactory = PTBTokenizerFactory.newTokenizerFactory();
    //        TokenizerFactory<WordLemmaTag> tokenizerFactory;
    //        TokenizerFactory<CoreLabel> factory = PTBTokenizer.factory(new CoreLabelTokenFactory() , "");
    //        TokenizerFactory<Word> factory1 = PTBTokenizer.factory();

    StringBuilder results = new StringBuilder();
    results.append("<s id='" + sid + "'>\n");

    StringReader sr = new StringReader(sentence);
    Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr);
    List<Word> tokenize = tokenizer.tokenize();

    List<TaggedWord> tagSentence = posTagger.tagSentence(tokenize);

    Tree parseTree = parser.parse(tagSentence);

    GrammaticalStructure gs = parser.getTLPParams().getGrammaticalStructure(parseTree,
            parser.treebankLanguagePack().punctuationWordRejectFilter(),
            parser.getTLPParams().typedDependencyHeadFinder());

    Collection<TypedDependency> deps = gs.typedDependenciesCollapsedTree();
    SemanticGraph depTree = new SemanticGraph(deps);

    for (int i = 0; i < tagSentence.size(); ++i) {

        int head = -1;
        String deprel = null;
        //                    if (depTree != null) {
        Set<Integer> rootSet = depTree.getRoots().stream().map(IndexedWord::index).collect(Collectors.toSet());
        IndexedWord node = depTree.getNodeByIndexSafe(i + 1);
        if (node != null) {
            List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node);
            if (!edgeList.isEmpty()) {
                assert edgeList.size() == 1;
                head = edgeList.get(0).getGovernor().index();
                deprel = edgeList.get(0).getRelation().toString();
            } else if (rootSet.contains(i + 1)) {
                head = 0;
                deprel = "ROOT";
            }
        }
        //     }

        // Write the token
        TaggedWord lexHead = null;
        if (head > 0) {
            lexHead = tagSentence.get(head - 1);
        }
        results.append(line(i + 1, tagSentence.get(i), morphology, head, deprel, lexHead)).append("\n");
    }
    results.append("</s>\n");
    return results;
}

From source file:info.atmykitchen.basic_annotation_convert.ConvertToBIO.java

License:Open Source License

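/**
 * Converts a single annotation file to an XML/vertical format: each sentence
 * is tokenized, tagged, and lemmatized, annotated spans are wrapped in
 * <term> elements, and <g/> marks tokens not separated by whitespace.
 */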
private static void convertFile(File file, String annotator, PrintWriter printer)
        throws ParserConfigurationException, IOException, Exception {
    System.out.println(file.getAbsolutePath());
    AnnotationFile annotationFile = IOMethods.loadAnnotationFile(file);
    Map<Integer, List<Annotation>> annotationLstMap = annotationFile.getAnnotationMapSentence();

    TokenizerFactory<Word> newTokenizerFactory = PTBTokenizer.PTBTokenizerFactory.newTokenizerFactory();
    String currentLabel = "";
    int previousEnd = 0;
    printer.println("<doc id=\"" + annotationFile.getAclid() + "\" title=\"" + annotationFile.getTitle()
            + "\" annotatorid=\"" + annotator + "\">");
    for (int i = 0; i < annotationFile.getSentences().size(); i++) {

        String sid = (i + 1) + "-" + annotationFile.getAclid();
        printer.println("<s id=\"" + sid + "\" annotatorid=\"" + annotator + "\">");
        String sentence = annotationFile.getSentences().get(i);
        System.out.println(sentence);
        StringReader sr = new StringReader(sentence);
        Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr);
        List<Word> tokenize = tokenizer.tokenize();
        List<TaggedWord> tagSentence = tagger.tagSentence(tokenize);
        List<Annotation> sentenceAnnotationList = new ArrayList<>();
        if (annotationLstMap.containsKey(i)) {
            sentenceAnnotationList = annotationLstMap.get(i);
        }
        System.out.println(sentenceAnnotationList.size());
        Collections.sort(sentenceAnnotationList, Annotation.sentnceOrderComp());
        List<Integer> toEnd = new ArrayList<>();
        for (int j = 0; j < tagSentence.size(); j++) {

            //to add <g/> gap tags
            if (j == 0) {
                previousEnd = tagSentence.get(j).endPosition();
            } else {
                if (previousEnd == tagSentence.get(j).beginPosition()) {
                    printer.println("<g/>");
                }
                previousEnd = tagSentence.get(j).endPosition();
            }
            int startoffset = tagSentence.get(j).beginPosition();

            if (!toEnd.isEmpty()) {
                Collections.sort(toEnd);
                while (!toEnd.isEmpty() && startoffset >= toEnd.get(0)) {
                    currentLabel = "";
                    //System.out.println("** "+toEnd.get(0));
                    printer.println("</term>");
                    toEnd.remove(0);
                }
            }
            // This is based on the fact that we currently do not have nested annotations.
            // Making inner annotations work (assigning labels to them for the ske engine) is going to be
            // a bit problematic; the best solution is to use ske's multivalue feature, but that is
            // something to be dealt with in the future.
            if (!sentenceAnnotationList.isEmpty()) {

                while (!sentenceAnnotationList.isEmpty()
                        && sentenceAnnotationList.get(0).getStartOffsetSentence() <= startoffset) {
                    Annotation remove = sentenceAnnotationList.remove(0);
                    toEnd.add(remove.getStartOffsetSentence() + remove.getContent().length());
                    printer.println("<term class=\"" + remove.getType() + "\" id=\"" + j + "-" + sid
                            + "\" annotatorid=\"" + annotator + "\">");
                    currentLabel = remove.getType();

                }
            }

            printer.println(
                    sentence.substring(tagSentence.get(j).beginPosition(), tagSentence.get(j).endPosition())
                            + "\t" + m.lemma(tagSentence.get(j).word(), tagSentence.get(j).tag()) + "\t"
                            + tagSentence.get(j).tag());

        }
        printer.println("</s>");
    }
    printer.println("</doc>");
}

From source file:io.anserini.qa.RetrieveSentences.java

License:Apache License

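/**
 * Retrieves candidate documents for the query, splits them into sentences,
 * tokenizes each sentence and the query, and prints the top-scoring passages.
 */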
public void getRankedPassages(Args args) throws Exception {
    Map<String, Float> scoredDocs = retrieveDocuments(args);
    Map<String, Float> sentencesMap = new LinkedHashMap<>();

    IndexUtils util = new IndexUtils(args.index);

    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

    for (Map.Entry<String, Float> doc : scoredDocs.entrySet()) {
        List<Sentence> sentences = util.getSentDocument(doc.getKey());

        for (Sentence sent : sentences) {
            List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(sent.text())).tokenize();
            String answerTokens = tokens.stream().map(CoreLabel::toString).collect(Collectors.joining(" "));
            sentencesMap.put(answerTokens, doc.getValue());
        }
    }

    String queryTokens = tokenizerFactory.getTokenizer(new StringReader(args.query)).tokenize().stream()
            .map(CoreLabel::toString).collect(Collectors.joining(" "));
    scorer.score(queryTokens, sentencesMap);

    List<ScoredPassage> topPassages = scorer.extractTopPassages();
    for (ScoredPassage s : topPassages) {
        System.out.println(s.getSentence() + " " + s.getScore());
    }
}