Example usage for edu.stanford.nlp.process DocumentPreprocessor iterator

List of usage examples for edu.stanford.nlp.process DocumentPreprocessor iterator

Introduction

In this page you can find the example usage for edu.stanford.nlp.process DocumentPreprocessor iterator.

Prototype

@Override
public Iterator<List<HasWord>> iterator() 

Source Link

Document

Returns sentences until the document is exhausted.

Usage

From source file:BuildBinarizedDataset.java

/**
 * Turns a text file into trees for use in a RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>
 * The expected input file is one sentence per line, with sentences
 * separated by blank lines. The first line has the main label of the sentence together with the full sentence.
 * Lines after the first sentence line but before
 * the blank line will be treated as labeled sub-phrases.  The
 * labels should start with the label and then contain a list of
 * tokens the label applies to. All phrases that do not have their own label will take on the main sentence label!
 *  For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day <br>
 * 3 a good day <br>
 * <br>
 * (next block starts here) <br>
 * </code>
 * By default the englishPCFG parser is used.  This can be changed
 * with the <code>-parserModel</code> flag.  Specify an input file
 * with <code>-input</code>.
 * <br>
 * If a sentiment model is provided with -sentimentModel, that model
 * will be used to prelabel the sentences.  Any spans with given
 * labels will then be used to adjust those labels.
 */
public static void main(String[] arg) throws IOException {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    // Paths and models are hard-coded here; the original command-line
    // handling for -input / -parserModel / -sentimentModel was disabled.
    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String inputPath = "D:\\dataset\\good.txt";
    String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz";
    SentimentModel sentimentModel = null;

    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
            parser.treebankLanguagePack());

    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }

    String text = IOUtils.slurpFileNoExceptions(inputPath);
    String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.

        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();
        // The first token is the numeric sentiment label of the whole
        // sentence.  Integer.valueOf replaces the deprecated
        // new Integer(String) constructor.
        Integer mainLabel = Integer.valueOf(tokens.get(0).word());
        tokens = tokens.subList(1, tokens.size());

        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing

        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);

        // if there is a sentiment model for use in prelabeling, we
        // label here and then use the user given labels to adjust
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            // No model: every unlabeled node falls back to the sentence label.
            setUnknownLabels(collapsedUnary, mainLabel);
        }

        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();

        // Apply the user-specified sub-phrase labels on top.
        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }
        // Emit the tree with square brackets instead of parentheses.
        String x = collapsedUnary.toString();
        x = x.replace("(", "[");
        x = x.replace(")", "]");
        System.out.println(x);
    }
}

From source file:com.diskoverorta.osdep.StanfordNLP.java

License:Apache License

/**
 * Splits a document into sentences using Stanford's DocumentPreprocessor.
 *
 * @param sDoc raw document text
 * @return one string per sentence, tokens separated by single spaces
 */
public List<String> splitSentencesINDocument(String sDoc) {
    Reader reader = new StringReader(sDoc);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();
    Iterator<List<HasWord>> it = dp.iterator();

    while (it.hasNext()) {
        StringBuilder sentenceSb = new StringBuilder();
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            // > 0 (was > 1): the old test skipped the separator after a
            // one-character first token, gluing it to the next token.
            if (sentenceSb.length() > 0) {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token);
        }
        sentenceList.add(sentenceSb.toString().trim());
    }
    return sentenceList;
}

From source file:com.summarizer.Utilities.java

License:Apache License

/**
 * Splits the given document into sentences using Stanford's
 * DocumentPreprocessor.
 *
 * @param entireDoc raw document text
 * @return one string per sentence, tokens separated by single spaces
 */
public static String[] sentenceTokonizer(String entireDoc) {
    Reader reader = new StringReader(entireDoc);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new LinkedList<String>();
    Iterator<List<HasWord>> it = dp.iterator();

    while (it.hasNext()) {
        StringBuilder sentenceSb = new StringBuilder();
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            // > 0 (was > 1): the old test skipped the separator after a
            // one-character first token, gluing it to the next token.
            if (sentenceSb.length() > 0) {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token.word());
        }
        sentenceList.add(sentenceSb.toString());
    }
    // toArray(T[]) is already typed; the old (String[]) cast was redundant.
    return sentenceList.toArray(new String[0]);
}

From source file:delic.Document.java

/**
 * Stanford NLP library based sentence segmentation.
 * <p>
 * Splits {@code this.docText} into sentences with DocumentPreprocessor,
 * rejoins each sentence's tokens with single spaces, and slices any
 * sentence longer than 20 characters into consecutive 20-character pieces,
 * each wrapped in its own Sentence.
 *
 * @return iterator over the resulting sentences
 */
public Iterator<Sentence> getSentencePOSBased() {
    Reader reader = new StringReader(this.docText);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    ArrayList<Sentence> sentenceList = new ArrayList<Sentence>();
    Iterator<List<HasWord>> it = dp.iterator();
    while (it.hasNext()) {
        StringBuilder sentenceSb = new StringBuilder();
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            // Insert a space before every token except the first.
            if (sentenceSb.length() >= 1) {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token);
        }
        String sentenceStr = sentenceSb.toString();
        if (sentenceStr.length() > 20) {
            // Slice into full 20-character pieces.
            // NOTE(review): integer division drops the remainder, so up to 19
            // trailing characters of each long sentence are discarded here —
            // confirm this truncation is intended.
            int parts = sentenceStr.length() / 20;
            String pieces[] = new String[parts];
            int i = 0;
            while (i < parts) {
                pieces[i] = sentenceStr.substring(i * 20, i * 20 + 20);
                i++;
            }
            i = 0;
            while (i < parts) {
                sentenceList.add(new Sentence(pieces[i++]));
            }
        } else {
            sentenceList.add(new Sentence(sentenceStr));
        }
    }
    return sentenceList.iterator();
}

From source file:File.SplitintoSentences.java

/**
 * Splits the given text into sentences using Stanford's
 * DocumentPreprocessor.
 *
 * @param filetext raw file text
 * @return one string per sentence, tokens separated by single spaces
 */
public List<String> getSentences(String filetext)

{
    List<String> sentences = new ArrayList<String>();

    Reader reader = new StringReader(filetext);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);

    Iterator<List<HasWord>> it = dp.iterator();
    while (it.hasNext()) {
        StringBuilder sentenceSb = new StringBuilder();
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            // > 0 (was > 1): the old test skipped the separator after a
            // one-character first token, gluing it to the next token.
            if (sentenceSb.length() > 0) {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token);
        }
        // Append directly; the original copied through a redundant
        // intermediate LinkedList.
        sentences.add(sentenceSb.toString());
    }

    return sentences;

}

From source file:fr.lipn.yasemir.ontology.annotation.SentenceBasedAnnotator.java

License:Open Source License

/**
 * Method that uses DocumentPreprocessor from Stanford Parser to split text into sentences
 * @param text/*  w  w  w .  j  av a  2 s  . co  m*/
 * @return
 */
private Vector<String> getSentences(String text) {
    Vector<String> sentenceList = new Vector<String>();
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));

    Iterator<List<HasWord>> it = dp.iterator();
    while (it.hasNext()) {
        StringBuilder sentenceSb = new StringBuilder();
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            if (sentenceSb.length() > 1) {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token);
        }
        sentenceList.add(sentenceSb.toString());
    }
    /*
    for(String sentence:sentenceList) {
       System.err.println(sentence);
    }
    */
    return sentenceList;

}

From source file:org.ets.research.nlp.stanford_thrift.parser.StanfordParserThrift.java

License:Open Source License

/**
 * Parses raw, untokenized text: splits it into sentences, runs the parser
 * on each one, and renders every parse tree in the requested output format.
 *
 * @param text raw text; sentence splitting and tokenization use Stanford's
 *        default DocumentPreprocessor settings
 * @param outputFormat tree-print options passed to ParserUtil.setOptions
 * @return one ParseTree (printed form plus score) per detected sentence
 * @throws TApplicationException if anything fails during parsing
 */
public List<ParseTree> parse_text(String text, List<String> outputFormat) throws TApplicationException {
    List<ParseTree> parseTrees = new ArrayList<ParseTree>();

    try {
        treePrinter = ParserUtil.setOptions(outputFormat, tlp);

        // Assume no prior tokenization; DocumentPreprocessor is Iterable,
        // so iterate over its sentences directly.
        DocumentPreprocessor splitter = new DocumentPreprocessor(new StringReader(text));
        for (List<HasWord> sentenceTokens : splitter) {
            Tree parsed = parser.apply(sentenceTokens);
            String printed = ParserUtil.TreeObjectToString(parsed, treePrinter);
            parseTrees.add(new ParseTree(printed, parsed.score()));
        }
    } catch (Exception e) {
        // FIXME
        throw new TApplicationException(TApplicationException.INTERNAL_ERROR, e.getMessage());
    }

    return parseTrees;
}

From source file:org.ets.research.nlp.stanford_thrift.tagger.StanfordTaggerThrift.java

License:Open Source License

/**
 * Tokenizes and tags raw text, producing one list of tagged tokens per
 * detected sentence.
 *
 * @param untokenizedText raw text; sentence splitting and tokenization use
 *        Stanford's default DocumentPreprocessor settings
 * @return one list of TaggedToken per sentence
 */
public List<List<TaggedToken>> tag_text(String untokenizedText) {
    List<List<TaggedToken>> taggedSentences = new ArrayList<List<TaggedToken>>();

    // Assume no prior tokenization; DocumentPreprocessor is Iterable,
    // so iterate over its sentences directly.
    DocumentPreprocessor splitter = new DocumentPreprocessor(new StringReader(untokenizedText));
    for (List<HasWord> sentenceTokens : splitter) {
        taggedSentences.add(tagSingleSentence(sentenceTokens));
    }

    return taggedSentences;
}

From source file:org.ets.research.nlp.stanford_thrift.tokenizer.StanfordTokenizerThrift.java

License:Open Source License

/**
 * Splits arbitrary text into sentences and returns each sentence as a list
 * of token strings.
 *
 * @param arbitraryText raw text; sentence splitting and tokenization use
 *        Stanford's default DocumentPreprocessor settings
 * @return one list of token strings per detected sentence
 */
public List<List<String>> tokenizeText(String arbitraryText) {
    List<List<String>> allSentences = new ArrayList<List<String>>();

    // DocumentPreprocessor is Iterable, so iterate over its sentences directly.
    DocumentPreprocessor splitter = new DocumentPreprocessor(new StringReader(arbitraryText));
    for (List<HasWord> sentenceTokens : splitter) {
        List<String> words = new ArrayList<String>();
        for (HasWord token : sentenceTokens) {
            words.add(token.word());
        }
        allSentences.add(words);
    }

    return allSentences;
}

From source file:parser.StanfordTokenizer.java

/**
 * Splits the given text into sentences using Stanford's
 * DocumentPreprocessor.
 *
 * @param sent raw input text (may contain several sentences)
 * @return one string per sentence, tokens separated by single spaces
 */
public static ArrayList<String> tokenizeSents(String sent) {
    Reader reader = new StringReader(sent);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);

    ArrayList<String> sentenceList = new ArrayList<String>();
    Iterator<List<HasWord>> it = dp.iterator();
    while (it.hasNext()) {
        StringBuilder sentenceSb = new StringBuilder();
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            // > 0 (was > 1): the old test skipped the separator after a
            // one-character first token, gluing it to the next token.
            if (sentenceSb.length() > 0) {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token);
        }
        sentenceList.add(sentenceSb.toString());
    }
    return sentenceList;
}