Example usage for edu.stanford.nlp.process Tokenizer tokenize

List of usage examples for edu.stanford.nlp.process Tokenizer tokenize

Introduction

On this page you can find an example usage for edu.stanford.nlp.process Tokenizer tokenize.

Prototype

List<T> tokenize();

Source Link

Document

Returns all tokens of this Tokenizer as a List for convenience.

Usage

From source file:ErrorCorrection.java

private static List<CoreLabel> tokenize(String str) {
    // Wrap the input in a reader, obtain a tokenizer from the shared factory,
    // and drain every token into a list in a single pass.
    return tokenizerFactory.getTokenizer(new StringReader(str)).tokenize();
}

From source file:artinex.TypDep.java

private List<CoreLabel> tokenize(String str) {
    // Build a fresh tokenizer for this input and collect all of its tokens.
    StringReader reader = new StringReader(str);
    Tokenizer<CoreLabel> labelTokenizer = tokenizerFactory.getTokenizer(reader);
    return labelTokenizer.tokenize();
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPtbTransformer.java

License:Open Source License

@Override
public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException {
    // "invertible" tokenization preserves character offsets, so each token can
    // be mapped back onto the original document text.
    Tokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(
            new StringReader(aInput.getDocumentText()), new CoreLabelTokenFactory(), "invertible");

    List<CoreLabel> labels = tokenizer.tokenize();
    for (CoreLabel label : labels) {
        // Replace each token span in the output with its (normalized) word form.
        replace(label.beginPosition(), label.endPosition(), label.word());
    }
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.java

License:Open Source License

@Override
protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException {
    List<Token> casTokens = null;

    // Use value from language parameter, document language or fallback language - whatever
    // is available
    String language = getLanguage(aJCas);

    if (isWriteToken()) {
        casTokens = new ArrayList<Token>();
        final String text = aText;
        final Tokenizer<?> tokenizer = getTokenizer(language, aText);
        int offsetInSentence = 0;

        List<?> tokens = tokenizer.tokenize();
        outer: for (int i = 0; i < tokens.size(); i++) {
            final Object token = tokens.get(i);
            String t = null;
            if (token instanceof String) {
                t = (String) token;
            }
            if (token instanceof CoreLabel) {
                // CoreLabels carry their own character offsets, so no matching
                // against the CAS text is required for them.
                CoreLabel l = (CoreLabel) token;
                t = l.word();
                int begin = l.get(CharacterOffsetBeginAnnotation.class);
                int end = l.get(CharacterOffsetEndAnnotation.class);

                casTokens.add(createToken(aJCas, aZoneBegin + begin, aZoneBegin + end, i));
                offsetInSentence = end;
                continue;
            }
            if (token instanceof Word) {
                Word w = (Word) token;
                t = w.word();
            }

            if (t == null) {
                throw new AnalysisEngineProcessException(
                        new IllegalStateException("Unknown token type: " + token.getClass()));
            }

            // Skip whitespace.
            // BUG FIX: the bounds check must precede charAt(); the original
            // called text.charAt(offsetInSentence) first, which throws
            // StringIndexOutOfBoundsException once the offset reaches
            // text.length() (possible for String/Word tokens that consume the
            // text exactly to its end).
            while (offsetInSentence < text.length() && isWhitespace(text.charAt(offsetInSentence))) {
                offsetInSentence++;
            }
            if (offsetInSentence >= text.length()) {
                // Text exhausted — nothing left to match remaining tokens against.
                break outer;
            }

            // Match the tokenizer output against the CAS text to derive offsets.
            if (text.startsWith(t, offsetInSentence)) {
                casTokens.add(createToken(aJCas, aZoneBegin + offsetInSentence,
                        aZoneBegin + offsetInSentence + t.length(), i));
                offsetInSentence = offsetInSentence + t.length();
            } else {
                // BUG FIX: the original message lacked the closing "]".
                throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. Tokenizer: ["
                        + t + "] CAS: ["
                        + text.substring(offsetInSentence, min(offsetInSentence + t.length(), text.length()))
                        + "]"));
            }
        }
    }

    if (isWriteSentence()) {
        if (casTokens == null) {
            // Tokens were not created in this call — re-use those already in the CAS.
            casTokens = selectCovered(aJCas, Token.class, aZoneBegin, aZoneBegin + aText.length());
        }

        // Prepare the tokens for processing by WordToSentenceProcessor
        List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>();
        for (Token token : casTokens) {
            CoreLabel l = new CoreLabel();
            l.set(CharacterOffsetBeginAnnotation.class, token.getBegin());
            l.set(CharacterOffsetEndAnnotation.class, token.getEnd());
            l.setWord(token.getCoveredText());
            tokensInDocument.add(l);
        }

        // The sentence splitter (probably) requires the escaped text, so we prepare it here
        PTBEscapingProcessor escaper = new PTBEscapingProcessor();
        escaper.apply(tokensInDocument);

        // Apply the WordToSentenceProcessor to find the sentence boundaries
        WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex,
                boundaryFollowers, boundariesToDiscard, xmlBreakElementsToDiscard, regionElementRegex,
                newlineIsSentenceBreak, null, tokenRegexesToDiscard, isOneSentence, allowEmptySentences);

        List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument);
        for (List<CoreLabel> sentence : sentencesInDocument) {
            // Sentence span = begin of its first token to end of its last token.
            int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class);
            int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class);

            createSentence(aJCas, begin, end);
        }
    }
}

From source file:edu.cmu.cs.in.hoop.hoops.transform.HoopSentence2Tokens.java

License:Open Source License

/**
 * Tokenizes each input sentence using the configured tokenizer strategy
 * (SplitOnCharacter, RegEx, or Stanford PTB) and emits the resulting tokens
 * as key/value pairs, then reports a sentence count on the statistics panel.
 *
 * @param inHoop the upstream hoop providing the input key/value data
 * @return true on success, false when the upstream hoop carries no data
 */
public Boolean runHoop(HoopBase inHoop) {
    String result = "";
    debug("runHoop ()");

    TokenizerFactory<Word> factory = PTBTokenizerFactory.newTokenizerFactory();

    ArrayList<HoopKV> inData = inHoop.getData();

    if (inData != null) {
        HoopSimpleFeatureMaker featureMaker = new HoopSimpleFeatureMaker();

        result = "Number of sentences in input :: " + inData.size();
        for (int i = 0; i < inData.size(); i++) {
            HoopKV aKV = inData.get(i);
            HoopKV newKV = createKV(aKV);

            // The SplitOnCharacter and RegEx branches differ only in how tokens
            // are produced; the emission logic is shared (see emitTokens below).
            if (targetTokenizer.getValue().equalsIgnoreCase("SplitOnCharacter") == true) {
                List<String> tokens = featureMaker.unigramTokenizeOnCharacter(aKV.getValueAsString(),
                        splitCharacter.getPropValue());
                emitTokens(tokens, newKV, i);
            }

            if (targetTokenizer.getValue().equalsIgnoreCase("RegEx") == true) {
                List<String> tokens = featureMaker.unigramTokenizeBasic(aKV.getValueAsString());
                emitTokens(tokens, newKV, i);
            }

            if (targetTokenizer.getValue().equalsIgnoreCase("Stanford") == true) {
                // Stanford PTB tokenization; one KV emitted per token.
                Tokenizer<Word> tokenizer = factory.getTokenizer(new StringReader(aKV.getValueAsString()));
                List<Word> sTokens = tokenizer.tokenize();

                for (int t = 0; t < sTokens.size(); t++) {
                    Word aTerm = sTokens.get(t);
                    // Key by token position, or by sentence index when re-keying.
                    Integer keyFormatter = (this.reKey.getPropValue() == false) ? t : i;
                    newKV.setKeyString(keyFormatter.toString());
                    newKV.setValue(aTerm.toString());
                    addKV(newKV);
                }
            }

            updateProgressStatus(i, inData.size());
        }
    } else
        return (false);

    HoopStatisticsPanel statsPanel;
    if (HoopLink.getWindow("Statistics") != null) {
        statsPanel = (HoopStatisticsPanel) HoopLink.getWindow("Statistics");
    } else {
        statsPanel = new HoopStatisticsPanel();
    }
    HoopLink.addView("Statistics", statsPanel, HoopLink.bottom);
    statsPanel.appendString("\n" + result);
    return (true);
}

/**
 * Emits tokenized strings as key/value pairs. Extracted from runHoop, where
 * this code was duplicated verbatim for the SplitOnCharacter and RegEx
 * tokenizer branches.
 *
 * @param tokens the token strings produced by the tokenizer
 * @param newKV the key/value object to populate
 * @param sentenceIndex index of the sentence the tokens were taken from
 */
private void emitTokens(List<String> tokens, HoopKV newKV, int sentenceIndex) {
    if (generateMode.getPropValue().equalsIgnoreCase("Add") == true) {
        // "Add" mode: accumulate every token into a single KV keyed by the
        // sentence index, then add it once.
        for (int j = 0; j < tokens.size(); j++) {
            String aToken = tokens.get(j);
            String strippedInput = aToken;

            if (removePunctuation.getPropValue() == true)
                strippedInput = aToken.replaceAll(splitRegEx.getValue(), "");

            Integer keyFormatter = sentenceIndex;
            newKV.setKeyString(keyFormatter.toString());
            newKV.setValue(strippedInput, j);
        }
        addKV(newKV);
    } else {
        // "New" mode: one addKV call per token.
        // NOTE(review): the same newKV instance is re-keyed, re-valued and
        // re-added for every token, matching the original behavior — confirm
        // that addKV copies its argument or that this aliasing is intended.
        for (int j = 0; j < tokens.size(); j++) {
            String aToken = tokens.get(j);
            String strippedInput = aToken;

            if (removePunctuation.getPropValue() == true)
                strippedInput = aToken.replaceAll(splitRegEx.getValue(), "");

            // Key by token position, or by sentence index when re-keying.
            Integer keyFormatter = (this.reKey.getPropValue() == false) ? j : sentenceIndex;
            newKV.setKeyString(keyFormatter.toString());
            newKV.setValue(strippedInput);
            addKV(newKV);
        }
    }
}

From source file:edu.uoa.cs.master.cloudmanufacturingnlp.business.nlp.StanfordDependencies.java

License:Apache License

/**
 * Parse the input raw sentence, output the Stanford Dependencies.
 * <p>/*  w  ww.  j a  va  2 s .c  om*/
 * An output example is:shares-3={companyC-2=nsubj}
 * </p>
 * 
 * @param triples
 * @param naturalLanguageRule
 * @return
 */
public String parseNaturalLanguage(Map<String, Map<String, String>> triples, String naturalLanguageRule) {
    String action = null;

    Tokenizer<? extends HasWord> toke = tokenizerFactory.getTokenizer(new StringReader(naturalLanguageRule));
    List<? extends HasWord> sentence = toke.tokenize();

    Tree parse = lp.parse(sentence);
    EnglishGrammaticalStructure gs = (EnglishGrammaticalStructure) gsf.newGrammaticalStructure(parse);

    Collection<TypedDependency> tdl = gs.typedDependencies();

    for (TypedDependency dependency : tdl) {
        String gov = dependency.gov().toString();
        String reln = dependency.reln().toString();
        String dep = dependency.dep().toString();

        if (triples.containsKey(gov)) {
            triples.get(gov).put(dep, reln);
        } else {
            Map<String, String> triple = new HashMap<String, String>();

            triple.put(dep, reln);
            triples.put(gov, triple);
        }

        if (reln.equalsIgnoreCase(Constants.Nlp.NSUBJ)) {
            action = gov;
        }
    }

    return action;
}

From source file:englishparser.EnglishParser.java

/**
 * demoAPI demonstrates other ways of calling the parser with already
 * tokenized text, or in some cases, raw text that needs to be tokenized as
 * a single sentence. Output is handled with a TreePrint object. Note that
 * the options used when creating the TreePrint can determine what results
 * to print out. Once again, one can capture the output by passing a
 * PrintWriter to TreePrint.printTree./*from   w ww  . j  a  v  a2s  . c  om*/
 */
public static void demoAPI(LexicalizedParser lp) {
    // This option shows parsing a list of correctly tokenized words
    String[] sent = { "This", "is", "an", "easy", "sentence", "." };
    List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
    Tree parse = lp.apply(rawWords);
    parse.pennPrint();
    System.out.println();

    // This option shows loading and using an explicit tokenizer
    String sent2 = "This is another sentence.";
    TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2));
    List<CoreLabel> rawWords2 = tok.tokenize();
    parse = lp.apply(rawWords2);

    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    System.out.println();

    // You can also use a TreePrint object to print trees and dependencies
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    tp.printTree(parse);
}

From source file:ie.pars.aclrdtec.fileutils.GetStatRawTextFile.java

License:Open Source License

/**
 * Counts sentences ("S" elements) and PTB tokens over a corpus of XML
 * annotation files, printing both totals to stdout.
 *
 * @param ss command-line arguments; ss[0] is the path to the input folder
 */
public static void main(String[] ss) throws SAXException, ParserConfigurationException, IOException {

    String input = ss[0]; // path to the input folder

    GetFiles gf = new GetFiles();
    gf.getCorpusFiles(input);
    List<String> annotationFiles = gf.getFiles();
    System.out.println("There are " + annotationFiles.size() + " files to check!");
    TokenizerFactory<Word> newTokenizerFactory = PTBTokenizer.PTBTokenizerFactory.newTokenizerFactory();
    int sentenceNumber = 0;
    int wordSize = 0;
    for (String file : annotationFiles) {
        // FIX: removed the unused local "File f = new File(file)" (dead code).
        Document makeDOM = XMLMethod.makeDOM(file);
        NodeList elementsByTagName = makeDOM.getElementsByTagName("S");
        sentenceNumber += elementsByTagName.getLength();
        for (int i = 0; i < elementsByTagName.getLength(); i++) {
            // Tokenize each sentence's text content and accumulate the token count.
            String sentence = elementsByTagName.item(i).getTextContent();
            StringReader sr = new StringReader(sentence);
            Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr);
            List<Word> tokenize = tokenizer.tokenize();
            wordSize += tokenize.size();
        }

    }
    System.out.println(sentenceNumber);
    System.out.println(wordSize);
}

From source file:ie.pars.bnc.preprocess.ProcessNLP.java

License:Open Source License

/**
 * Tokenizes, POS-tags and dependency-parses a single sentence, rendering it as
 * an {@code <s id='...'>} block with one output line per token (produced by the
 * {@code line(...)} helper from index, tagged word, morphology, head index,
 * dependency relation, and lexical head).
 *
 * @param sentence the raw sentence text
 * @param morphology morphological analyzer passed through to line()
 * @param posTagger tagger used to POS-tag the token sequence
 * @param parser grammar used for parsing and grammatical-structure extraction
 * @param sid sentence id written into the opening tag
 * @return the formatted sentence block, terminated by "&lt;/s&gt;\n"
 */
private static StringBuilder parseTheSentence(String sentence, Morphology morphology, MaxentTagger posTagger,
        ParserGrammar parser, String sid) {
    TokenizerFactory<Word> newTokenizerFactory = PTBTokenizerFactory.newTokenizerFactory();

    StringBuilder results = new StringBuilder();
    results.append("<s id='" + sid + "'>\n");

    StringReader sr = new StringReader(sentence);
    Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr);
    List<Word> tokenize = tokenizer.tokenize();

    List<TaggedWord> tagSentence = posTagger.tagSentence(tokenize);

    Tree parseTree = parser.parse(tagSentence);

    GrammaticalStructure gs = parser.getTLPParams().getGrammaticalStructure(parseTree,
            parser.treebankLanguagePack().punctuationWordRejectFilter(),
            parser.getTLPParams().typedDependencyHeadFinder());

    Collection<TypedDependency> deps = gs.typedDependenciesCollapsedTree();
    SemanticGraph depTree = new SemanticGraph(deps);

    // PERF FIX: the root set is loop-invariant; compute it once instead of
    // re-streaming the graph's roots on every token iteration as before.
    Set<Integer> rootSet = depTree.getRoots().stream().map(IndexedWord::index).collect(Collectors.toSet());

    for (int i = 0; i < tagSentence.size(); ++i) {

        int head = -1;
        String deprel = null;
        // Graph nodes are 1-based; token i maps to node index i + 1.
        IndexedWord node = depTree.getNodeByIndexSafe(i + 1);
        if (node != null) {
            List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node);
            if (!edgeList.isEmpty()) {
                // A well-formed dependency tree has exactly one incoming edge per node.
                assert edgeList.size() == 1;
                head = edgeList.get(0).getGovernor().index();
                deprel = edgeList.get(0).getRelation().toString();
            } else if (rootSet.contains(i + 1)) {
                head = 0;
                deprel = "ROOT";
            }
        }

        // Resolve the lexical head token (head is 1-based; 0 means ROOT).
        TaggedWord lexHead = null;
        if (head > 0) {
            lexHead = tagSentence.get(head - 1);
        }
        results.append(line(i + 1, tagSentence.get(i), morphology, head, deprel, lexHead)).append("\n");
    }
    results.append("</s>\n");
    return results;
}

From source file:info.atmykitchen.basic_annotation_convert.ConvertToBIO.java

License:Open Source License

/**
 * Converts one annotation file to an inline-XML BIO-style representation:
 * each sentence becomes an {@code <s>} element, tokens are printed one per
 * line as "token&lt;TAB&gt;lemma&lt;TAB&gt;tag", annotation spans are wrapped
 * in {@code <term>} elements, and token adjacency is marked with {@code <g/>}.
 *
 * @param file the annotation file to convert
 * @param annotator annotator id written into the doc/s/term attributes
 * @param printer destination for the generated markup
 */
private static void convertFile(File file, String annotator, PrintWriter printer)
        throws ParserConfigurationException, IOException, Exception {
    System.out.println(file.getAbsolutePath());
    AnnotationFile annotationFile = IOMethods.loadAnnotationFile(file);
    Map<Integer, List<Annotation>> annotationLstMap = annotationFile.getAnnotationMapSentence();

    TokenizerFactory<Word> newTokenizerFactory = PTBTokenizer.PTBTokenizerFactory.newTokenizerFactory();
    // FIX: removed the local "currentLabel", which was assigned but never read.
    int previousEnd = 0;
    printer.println("<doc id=\"" + annotationFile.getAclid() + "\" title=\"" + annotationFile.getTitle()
            + "\" annotatorid=\"" + annotator + "\">");
    for (int i = 0; i < annotationFile.getSentences().size(); i++) {

        String sid = (i + 1) + "-" + annotationFile.getAclid();
        printer.println("<s id=\"" + sid + "\" annotatorid=\"" + annotator + "\">");
        String sentence = annotationFile.getSentences().get(i);
        System.out.println(sentence);
        StringReader sr = new StringReader(sentence);
        Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr);
        List<Word> tokenize = tokenizer.tokenize();
        List<TaggedWord> tagSentence = tagger.tagSentence(tokenize);
        List<Annotation> sentenceAnnotationList = new ArrayList<>();
        if (annotationLstMap.containsKey(i)) {
            sentenceAnnotationList = annotationLstMap.get(i);
        }
        System.out.println(sentenceAnnotationList.size());
        Collections.sort(sentenceAnnotationList, Annotation.sentnceOrderComp());
        // Pending end offsets of currently open <term> elements.
        // FIX: was a raw "new ArrayList()" — now properly parameterized.
        List<Integer> toEnd = new ArrayList<>();
        for (int j = 0; j < tagSentence.size(); j++) {

            // NOTE(review): <g/> is emitted when the previous token's end equals
            // this token's begin, i.e. when the tokens are adjacent with no
            // intervening whitespace — confirm that is the intended "gap" marker.
            if (j == 0) {
                previousEnd = tagSentence.get(j).endPosition();
            } else {
                if (previousEnd == tagSentence.get(j).beginPosition()) {
                    printer.println("<g/>");
                }
                previousEnd = tagSentence.get(j).endPosition();
            }
            int startoffset = tagSentence.get(j).beginPosition();

            // Close every open <term> whose span ends at or before this token.
            if (!toEnd.isEmpty()) {
                Collections.sort(toEnd);
                while (!toEnd.isEmpty() && startoffset >= toEnd.get(0)) {
                    printer.println("</term>");
                    toEnd.remove(0);
                }
            }
            // This is based on the fact that currently we do not have nested annotations.
            // Assigning labels to inner annotations for the SkE engine would be
            // problematic; the best solution would be SkE's multi-value feature,
            // but that is something to be dealt with in the future.
            if (!sentenceAnnotationList.isEmpty()) {

                // Open a <term> for every annotation starting at or before this token.
                while (!sentenceAnnotationList.isEmpty()
                        && sentenceAnnotationList.get(0).getStartOffsetSentence() <= startoffset) {
                    Annotation remove = sentenceAnnotationList.remove(0);
                    toEnd.add(remove.getStartOffsetSentence() + remove.getContent().length());
                    printer.println("<term class=\"" + remove.getType() + "\" id=\"" + j + "-" + sid
                            + "\" annotatorid=\"" + annotator + "\">");
                }
            }

            // Emit the token line: surface form, lemma, POS tag.
            printer.println(
                    sentence.substring(tagSentence.get(j).beginPosition(), tagSentence.get(j).endPosition())
                            + "\t" + m.lemma(tagSentence.get(j).word(), tagSentence.get(j).tag()) + "\t"
                            + tagSentence.get(j).tag());

        }
        printer.println("</s>");
    }
    printer.println("</doc>");
}