Example usage for edu.stanford.nlp.trees Tree indexSpans

List of usage examples for edu.stanford.nlp.trees Tree indexSpans

Introduction

On this page you can find example usages of edu.stanford.nlp.trees Tree indexSpans.

Prototype

public void indexSpans() 

Document

Index all spans (constituents) in the tree.
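
In current CoreNLP releases, indexSpans() walks the tree left to right and records each constituent's token span on its label. A minimal sketch of calling it directly, under the assumption that the indices are stored as CoreAnnotations.BeginIndexAnnotation and CoreAnnotations.EndIndexAnnotation on CoreLabel node labels (which is why the examples below call Trees.convertToCoreLabels first):

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;

public class IndexSpansDemo {
    public static void main(String[] args) {
        Tree tree = Tree.valueOf("(ROOT (S (NP (DT The) (NN dog)) (VP (VBZ barks))))");
        Trees.convertToCoreLabels(tree); // indexSpans writes onto CoreLabel labels
        tree.indexSpans();

        // Print every node with its [begin, end) token span.
        for (Tree node : tree) {
            CoreLabel label = (CoreLabel) node.label();
            System.out.println(label.value() + " ["
                    + label.get(CoreAnnotations.BeginIndexAnnotation.class) + ", "
                    + label.get(CoreAnnotations.EndIndexAnnotation.class) + ")");
        }
    }
}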

Usage

From source file:BuildBinarizedDataset.java

/**
 * Turns a text file into trees for use in an RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>
 * The expected input file is one sentence per line, with sentences
 * separated by blank lines. The first line has the main label of the sentence together with the full sentence.
 * Lines after the first sentence line but before
 * the blank line will be treated as labeled sub-phrases.  Each
 * such line should start with the label and then contain the list of
 * tokens the label applies to.  Phrases that do not have their own label take on the main sentence label.
 * For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day <br>
 * 3 a good day <br>
 * <br>
 * (next block starts here) <br>
 * </code>
 * By default the englishPCFG parser is used.  This can be changed
 * with the <code>-parserModel</code> flag.  Specify an input file
 * with <code>-input</code>.
 * <br>
 * If a sentiment model is provided with -sentimentModel, that model
 * will be used to prelabel the sentences.  Any spans with given
 * labels will then be used to adjust those labels.
 */
public static void main(String[] arg) throws IOException {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    // FileWriter writer = new FileWriter("D:\\dataset\\train.txt", true);
    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    // Hard-coded arguments; the command-line parsing below is commented out.
    String[] args = { "-input", "D:\\parse.txt", "-sentimentModel",
            "edu/stanford/nlp/models/sentiment/sentiment.ser.gz" };
    String inputPath = "D:\\dataset\\good.txt";

    String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz";
    SentimentModel sentimentModel = null;

    /* for (int argIndex = 0; argIndex < args.length; ) {
       if (args[argIndex].equalsIgnoreCase("-input")) {
         inputPath = args[argIndex + 1];
         argIndex += 2;
       } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
         parserModel = args[argIndex + 1];
         argIndex += 2;
       } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
         sentimentModelPath = args[argIndex + 1];
         argIndex += 2;
       } else {
         System.err.println("Unknown argument " + args[argIndex]);
         System.exit(2);
       }
     }*/

    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
            parser.treebankLanguagePack());

    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }

    String text = IOUtils.slurpFileNoExceptions(inputPath);
    String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.

        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();
        Integer mainLabel = Integer.valueOf(tokens.get(0).word());
        //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
        tokens = tokens.subList(1, tokens.size());
        //System.err.println(tokens);

        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing

        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);

        // If there is a sentiment model for use in prelabeling, we
        // label here and then adjust using the user-given labels.
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
            //collapsedUnary.label().setValue(mainLabel.toString());
            //System.out.println("Root"+collapsedUnary.getNodeNumber(1));
        }

        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();

        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }
        String x = collapsedUnary.toString();
        //x.replaceAll("\\s","");
        x = x.replace("(", "[");
        x = x.replace(")", "]");
        //writer.write(x);
        //writer.write("\r\n"); 
        System.out.println(x);
        //System.out.println();
    }
    //writer.close();
}
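
The helpers extractLabels, setUnknownLabels, setPredictedLabels, and setSpanLabel are defined elsewhere in BuildBinarizedDataset and are not shown here. setSpanLabel is where the indices written by indexSpans() are consumed: it matches each user-given span against the indexed constituents. A plausible sketch of such a helper (hypothetical, not the original implementation; assumes imports of edu.stanford.nlp.ling.CoreAnnotations, edu.stanford.nlp.ling.CoreLabel, and edu.stanford.nlp.util.Pair):

static void setSpanLabel(Tree tree, Pair<Integer, Integer> span, String value) {
    // Labels are CoreLabels because convertToCoreLabels ran first, and
    // indexSpans has already filled in the begin/end indices.
    CoreLabel label = (CoreLabel) tree.label();
    if (span.first().equals(label.get(CoreAnnotations.BeginIndexAnnotation.class))
            && span.second().equals(label.get(CoreAnnotations.EndIndexAnnotation.class))) {
        label.setValue(value);
    }
    for (Tree child : tree.children()) {
        setSpanLabel(child, span, value);
    }
}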

From source file:ConstituencyParse.java

License:Apache License

public int[] constTreeParents(Tree tree) {
    Tree binarized = binarizer.transformTree(tree);
    Tree collapsedUnary = transformer.transformTree(binarized);
    Trees.convertToCoreLabels(collapsedUnary);
    collapsedUnary.indexSpans();
    List<Tree> leaves = collapsedUnary.getLeaves();
    int size = collapsedUnary.size() - leaves.size();
    int[] parents = new int[size];
    HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();

    int idx = leaves.size();
    int leafIdx = 0;
    for (Tree leaf : leaves) {
        Tree cur = leaf.parent(collapsedUnary); // go to preterminal
        int curIdx = leafIdx++;
        boolean done = false;
        while (!done) {
            Tree parent = cur.parent(collapsedUnary);
            if (parent == null) {
                parents[curIdx] = 0;
                break;
            }

            int parentIdx;
            int parentNumber = parent.nodeNumber(collapsedUnary);
            if (!index.containsKey(parentNumber)) {
                parentIdx = idx++;
                index.put(parentNumber, parentIdx);
            } else {
                parentIdx = index.get(parentNumber);
                done = true;
            }

            parents[curIdx] = parentIdx + 1;
            cur = parent;
            curIdx = parentIdx;
        }
    }

    return parents;
}
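
The result is a parent-pointer encoding of the binarized tree: each leaf is fused with its preterminal and numbered 0..n-1 in surface order, higher nodes are numbered from n upward in order of first visit, and parents[i] holds the 1-based index of node i's parent, with 0 marking the root. The binarizer and transformer fields are assumed to be initialized elsewhere in the class; a plausible wiring, mirroring the BuildBinarizedDataset example above:

LexicalizedParser parser = LexicalizedParser.loadModel(
        "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(
        parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();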

From source file:de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp.java

License:Open Source License

public Annotation convert(JCas aSource, Annotation aTarget) {
    // Document annotation
    aTarget.set(CoreAnnotations.TextAnnotation.class, aSource.getDocumentText());

    // Sentences
    List<CoreMap> sentences = new ArrayList<>();
    for (Sentence s : select(aSource, Sentence.class)) {
        if (StringUtils.isBlank(s.getCoveredText())) {
            continue;
        }

        String sentenceText = s.getCoveredText();
        if (encoding != null && !"UTF-8".equals(encoding.name())) {
            sentenceText = new String(sentenceText.getBytes(StandardCharsets.UTF_8), encoding);
        }

        Annotation sentence = new Annotation(sentenceText);
        sentence.set(CharacterOffsetBeginAnnotation.class, s.getBegin());
        sentence.set(CharacterOffsetEndAnnotation.class, s.getEnd());
        sentence.set(SentenceIndexAnnotation.class, sentences.size());

        // Tokens
        Map<Token, IndexedWord> idxTokens = new HashMap<>();
        List<CoreLabel> tokens = new ArrayList<>();
        for (Token t : selectCovered(Token.class, s)) {
            String tokenText = t.getCoveredText();
            if (encoding != null && !"UTF-8".equals(encoding.name())) {
                tokenText = new String(tokenText.getBytes(StandardCharsets.UTF_8), encoding);
            }

            CoreLabel token = tokenFactory.makeToken(tokenText, t.getBegin(), t.getEnd() - t.getBegin());
            // First add the token so that tokens.size() yields the 1-based index
            // required by IndexAnnotation
            tokens.add(token);
            token.set(SentenceIndexAnnotation.class, sentences.size());
            token.set(IndexAnnotation.class, tokens.size());
            token.set(TokenKey.class, t);
            idxTokens.put(t, new IndexedWord(token));

            // POS tags
            if (readPos && t.getPos() != null) {
                token.set(PartOfSpeechAnnotation.class, t.getPos().getPosValue());
            }

            // Lemma
            if (t.getLemma() != null) {
                token.set(LemmaAnnotation.class, t.getLemma().getValue());
            }

            // Stem
            if (t.getStem() != null) {
                token.set(StemAnnotation.class, t.getStem().getValue());
            }

            // NamedEntity
            // TODO: only token-based NEs are supported, but not multi-token NEs
            // Supporting multi-token NEs via selectCovering would be very slow. To support
            // them, another approach would need to be implemented, e.g. via indexCovering.
            List<NamedEntity> nes = selectCovered(NamedEntity.class, t);
            if (nes.size() > 0) {
                token.set(NamedEntityTagAnnotation.class, nes.get(0).getValue());
            } else {
                token.set(NamedEntityTagAnnotation.class, "O");
            }
        }

        // Constituents
        for (ROOT r : selectCovered(ROOT.class, s)) {
            Tree tree = createStanfordTree(r, idxTokens);
            tree.indexSpans();
            sentence.set(TreeAnnotation.class, tree);
        }

        // Dependencies
        List<TypedDependency> dependencies = new ArrayList<>();
        for (Dependency d : selectCovered(Dependency.class, s)) {
            TypedDependency dep = new TypedDependency(GrammaticalRelation.valueOf(d.getDependencyType()),
                    idxTokens.get(d.getGovernor()), idxTokens.get(d.getDependent()));
            if (DependencyFlavor.ENHANCED.equals(d.getFlavor())) {
                dep.setExtra();
            }
            dependencies.add(dep);
        }
        sentence.set(EnhancedDependenciesAnnotation.class, new SemanticGraph(dependencies));

        if (ptb3Escaping) {
            tokens = applyPtbEscaping(tokens, quoteBegin, quoteEnd);
        }

        sentence.set(TokensAnnotation.class, tokens);
        sentences.add(sentence);
    }
    aTarget.set(SentencesAnnotation.class, sentences);

    return aTarget;
}
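
Note the ordering for the constituents: the Stanford tree is built from the UIMA ROOT annotation and indexSpans() is called before the tree is stored under TreeAnnotation, so each constituent already carries its token span when downstream CoreNLP components read the sentence.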

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordCoreferenceResolver.java

License:Open Source License

@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    modelProvider.configure(aJCas.getCas());

    List<Tree> trees = new ArrayList<Tree>();
    List<CoreMap> sentences = new ArrayList<CoreMap>();
    List<List<CoreLabel>> sentenceTokens = new ArrayList<List<CoreLabel>>();
    for (ROOT root : select(aJCas, ROOT.class)) {
        // Copy all relevant information from the tokens
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        for (Token token : selectCovered(Token.class, root)) {
            tokens.add(tokenToWord(token));
        }
        sentenceTokens.add(tokens);

        // SemanticHeadFinder (nonTerminalInfo) does not know about PRN0, so we have to replace
        // it with PRN to avoid NPEs.
        TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory()) {
            @Override
            public Tree newTreeNode(String aParent, List<Tree> aChildren) {
                String parent = aParent;
                if ("PRN0".equals(parent)) {
                    parent = "PRN";
                }
                Tree node = super.newTreeNode(parent, aChildren);
                return node;
            }
        };

        // Deep copy of the tree, because the trees are modified inside coref!
        Tree treeCopy = TreeUtils.createStanfordTree(root, tFact).treeSkeletonCopy();
        treeCopy.indexSpans();
        trees.add(treeCopy);

        // Build the sentence
        CoreMap sentence = new CoreLabel();
        sentence.set(TreeAnnotation.class, treeCopy);
        sentence.set(TokensAnnotation.class, tokens);
        sentence.set(RootKey.class, root);
        sentences.add(sentence);

        // https://code.google.com/p/dkpro-core-asl/issues/detail?id=590
        // We currently do not copy over dependencies from the CAS. This is supposed to fill
        // in the dependencies so we do not get NPEs.
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(tlp.punctuationWordRejectFilter(),
                tlp.typedDependencyHeadFinder());
        ParserAnnotatorUtils.fillInParseAnnotations(false, true, gsf, sentence, treeCopy,
                GrammaticalStructure.Extras.NONE);

        // https://code.google.com/p/dkpro-core-asl/issues/detail?id=582
        SemanticGraph deps = sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        for (IndexedWord vertex : deps.vertexSet()) {
            vertex.setWord(vertex.value());
        }

        // merge the new CoreLabels with the tree leaves
        MentionExtractor.mergeLabels(treeCopy, tokens);
        MentionExtractor.initializeUtterance(tokens);
    }

    Annotation document = new Annotation(aJCas.getDocumentText());
    document.set(SentencesAnnotation.class, sentences);

    Coreferencer coref = modelProvider.getResource();

    // extract all possible mentions
    // Reparsing only works when the full CoreNLP pipeline system is set up! Passing false here
    // disables reparsing.
    RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder(false);
    List<List<Mention>> allUnprocessedMentions = finder.extractPredictedMentions(document, 0,
            coref.corefSystem.dictionaries());

    // add the relevant info to mentions and order them for coref
    Map<Integer, CorefChain> result;
    try {
        Document doc = coref.mentionExtractor.arrange(document, sentenceTokens, trees, allUnprocessedMentions);
        result = coref.corefSystem.coref(doc);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    for (CorefChain chain : result.values()) {
        CoreferenceLink last = null;
        for (CorefMention mention : chain.getMentionsInTextualOrder()) {
            CoreLabel beginLabel = sentences.get(mention.sentNum - 1).get(TokensAnnotation.class)
                    .get(mention.startIndex - 1);
            CoreLabel endLabel = sentences.get(mention.sentNum - 1).get(TokensAnnotation.class)
                    .get(mention.endIndex - 2);
            CoreferenceLink link = new CoreferenceLink(aJCas, beginLabel.get(TokenKey.class).getBegin(),
                    endLabel.get(TokenKey.class).getEnd());

            if (mention.mentionType != null) {
                link.setReferenceType(mention.mentionType.toString());
            }

            if (last == null) {
                // This is the first mention. Here we'll initialize the chain
                CoreferenceChain corefChain = new CoreferenceChain(aJCas);
                corefChain.setFirst(link);
                corefChain.addToIndexes();
            } else {
                // For the other mentions, we'll add them to the chain.
                last.setNext(link);
            }
            last = link;

            link.addToIndexes();
        }
    }
}
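
Here indexSpans() runs on a skeleton copy of the tree: as the comment in the code notes, the trees are modified inside coref, so the copy is indexed and handed to the mention extractor while the annotations in the CAS stay untouched.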

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordDependencyConverter.java

License:Open Source License

@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    String lang = language != null ? language : aJCas.getDocumentLanguage();

    if (!languagePacks.containsKey(lang)) {
        throw new AnalysisEngineProcessException(
                new IllegalStateException("Unsupported language [" + lang + "]"));
    }

    TreebankLanguagePack lp;
    try {
        lp = languagePacks.get(lang).newInstance();
    } catch (InstantiationException | IllegalAccessException e) {
        throw new AnalysisEngineProcessException(e);
    }

    List<CoreMap> sentences = new ArrayList<CoreMap>();
    for (ROOT root : select(aJCas, ROOT.class)) {
        // Copy all relevant information from the tokens
        List<Token> tokens = selectCovered(Token.class, root);
        List<CoreLabel> coreTokens = new ArrayList<CoreLabel>();
        for (Token token : tokens) {
            coreTokens.add(tokenToWord(token));
        }

        // SemanticHeadFinder (nonTerminalInfo) does not know about PRN0, so we have to replace
        // it with PRN to avoid NPEs.
        TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory()) {
            @Override
            public Tree newTreeNode(String aParent, List<Tree> aChildren) {
                String parent = aParent;
                if ("PRN0".equals(parent)) {
                    parent = "PRN";
                }
                Tree node = super.newTreeNode(parent, aChildren);
                return node;
            }
        };

        Tree tree = TreeUtils.createStanfordTree(root, tFact);
        Trees.convertToCoreLabels(tree);
        tree.indexSpans();

        // Build the sentence
        CoreMap sentence = new CoreLabel();
        sentence.set(TreeAnnotation.class, tree);
        sentence.set(TokensAnnotation.class, coreTokens);
        sentence.set(RootKey.class, root);
        sentences.add(sentence);

        doCreateDependencyTags(aJCas, lp, tree, tokens);
    }
}
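
As in the earlier examples, Trees.convertToCoreLabels(tree) runs before tree.indexSpans(): indexSpans() records its begin/end indices on the node labels, which therefore need to be CoreLabels rather than plain string labels.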