List of usage examples for edu.stanford.nlp.trees Tree indexSpans
public void indexSpans()
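Tree.indexSpans() walks the tree and records on each node's label the half-open span of tokens the node covers, stored under CoreAnnotations.BeginIndexAnnotation and CoreAnnotations.EndIndexAnnotation. The label has to be a CoreMap such as CoreLabel, which is why the examples below call Trees.convertToCoreLabels(tree) before indexing. A minimal sketch of the pattern (the class name and hand-written parse string are made up for illustration):

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;

public class IndexSpansSketch {
    public static void main(String[] args) {
        // Any phrase-structure parse works here; this one is hand-written.
        Tree tree = Tree.valueOf("(ROOT (S (NP (DT a) (JJ good) (NN day))))");

        // indexSpans() stores the indices on CoreMap labels, so convert first.
        Trees.convertToCoreLabels(tree);
        tree.indexSpans();

        // Every node (including leaves) now knows which tokens it covers.
        for (Tree node : tree) {
            CoreLabel label = (CoreLabel) node.label();
            System.out.println(label.value() + " covers tokens ["
                    + label.get(CoreAnnotations.BeginIndexAnnotation.class) + ", "
                    + label.get(CoreAnnotations.EndIndexAnnotation.class) + ")");
        }
    }
}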
From source file:BuildBinarizedDataset.java
/**
 * Turns a text file into trees for use in an RNTN classifier such as
 * the treebank used in the Sentiment project.
 *
 * The expected input file is one sentence per line, with blocks of
 * lines separated by blank lines. The first line of a block holds the
 * main label of the sentence together with the full sentence. Lines
 * after the first but before the blank line are treated as labeled
 * sub-phrases: each starts with a label, followed by the tokens the
 * label applies to. Every phrase that does not get its own label takes
 * on the main sentence label. For example:
 *
 * <pre>
 * 1 Today is not a good day.
 * 3 good
 * 3 good day
 * 3 a good day
 *
 * (next block starts here)
 * </pre>
 *
 * By default the englishPCFG parser is used. This can be changed with
 * the <code>-parserModel</code> flag. Specify an input file with
 * <code>-input</code>.
 *
 * If a sentiment model is provided with <code>-sentimentModel</code>,
 * that model is used to prelabel the sentences; any spans with given
 * labels are then used to adjust those labels.
 */
public static void main(String[] args) throws IOException {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();

    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String inputPath = null;
    String sentimentModelPath = null;
    SentimentModel sentimentModel = null;

    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-input")) {
            inputPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
            parserModel = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
            sentimentModelPath = args[argIndex + 1];
            argIndex += 2;
        } else {
            System.err.println("Unknown argument " + args[argIndex]);
            System.exit(2);
        }
    }

    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
            parser.treebankLanguagePack());

    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }

    String text = IOUtils.slurpFileNoExceptions(inputPath);
    String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }

        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.

        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();

        // The first token is the main label for the whole sentence.
        Integer mainLabel = Integer.valueOf(tokens.get(0).word());
        tokens = tokens.subList(1, tokens.size());

        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing
        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);

        // If there is a sentiment model for use in prelabeling, we
        // label here and then use the user-given labels to adjust.
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
        }

        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();

        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }

        // Print the tree with square instead of round brackets.
        String x = collapsedUnary.toString();
        x = x.replace("(", "[");
        x = x.replace(")", "]");
        System.out.println(x);
    }
}
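Note the order of the two calls near the end: Trees.convertToCoreLabels(collapsedUnary) gives every node a CoreLabel, and only then does collapsedUnary.indexSpans() attach the token-span indices that the setSpanLabel helper (defined elsewhere in BuildBinarizedDataset.java) uses to match the (begin, end) pairs collected in spanToLabels against tree nodes.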
From source file:ConstituencyParse.java
License:Apache License
public int[] constTreeParents(Tree tree) {
    Tree binarized = binarizer.transformTree(tree);
    Tree collapsedUnary = transformer.transformTree(binarized);
    Trees.convertToCoreLabels(collapsedUnary);
    collapsedUnary.indexSpans();

    List<Tree> leaves = collapsedUnary.getLeaves();
    int size = collapsedUnary.size() - leaves.size();
    int[] parents = new int[size];
    HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();

    int idx = leaves.size();
    int leafIdx = 0;
    for (Tree leaf : leaves) {
        Tree cur = leaf.parent(collapsedUnary); // go to preterminal
        int curIdx = leafIdx++;
        boolean done = false;
        while (!done) {
            Tree parent = cur.parent(collapsedUnary);
            if (parent == null) {
                parents[curIdx] = 0;
                break;
            }

            int parentIdx;
            int parentNumber = parent.nodeNumber(collapsedUnary);
            if (!index.containsKey(parentNumber)) {
                parentIdx = idx++;
                index.put(parentNumber, parentIdx);
            } else {
                parentIdx = index.get(parentNumber);
                done = true;
            }

            parents[curIdx] = parentIdx + 1;
            cur = parent;
            curIdx = parentIdx;
        }
    }

    return parents;
}
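The returned array is a parent-pointer encoding of the binarized tree: slots 0 to n-1 stand for the n preterminals (one per word), the remaining slots for higher internal nodes in the order they are first reached, and each entry holds the 1-based index of that node's parent, with 0 marking the root.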
From source file:de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp.java
License:Open Source License
public Annotation convert(JCas aSource, Annotation aTarget) {
    // Document annotation
    aTarget.set(CoreAnnotations.TextAnnotation.class, aSource.getDocumentText());

    // Sentences
    List<CoreMap> sentences = new ArrayList<>();
    for (Sentence s : select(aSource, Sentence.class)) {
        if (StringUtils.isBlank(s.getCoveredText())) {
            continue;
        }

        String sentenceText = s.getCoveredText();
        if (encoding != null && !"UTF-8".equals(encoding.name())) {
            sentenceText = new String(sentenceText.getBytes(StandardCharsets.UTF_8), encoding);
        }

        Annotation sentence = new Annotation(sentenceText);
        sentence.set(CharacterOffsetBeginAnnotation.class, s.getBegin());
        sentence.set(CharacterOffsetEndAnnotation.class, s.getEnd());
        sentence.set(SentenceIndexAnnotation.class, sentences.size());

        // Tokens
        Map<Token, IndexedWord> idxTokens = new HashMap<>();
        List<CoreLabel> tokens = new ArrayList<>();
        for (Token t : selectCovered(Token.class, s)) {
            String tokenText = t.getCoveredText();
            if (encoding != null && !"UTF-8".equals(encoding.name())) {
                tokenText = new String(tokenText.getBytes(StandardCharsets.UTF_8), encoding);
            }

            CoreLabel token = tokenFactory.makeToken(tokenText, t.getBegin(), t.getEnd() - t.getBegin());
            // First add the token so that tokens.size() returns a 1-based count,
            // as required by IndexAnnotation
            tokens.add(token);
            token.set(SentenceIndexAnnotation.class, sentences.size());
            token.set(IndexAnnotation.class, tokens.size());
            token.set(TokenKey.class, t);
            idxTokens.put(t, new IndexedWord(token));

            // POS tags
            if (readPos && t.getPos() != null) {
                token.set(PartOfSpeechAnnotation.class, t.getPos().getPosValue());
            }

            // Lemma
            if (t.getLemma() != null) {
                token.set(LemmaAnnotation.class, t.getLemma().getValue());
            }

            // Stem
            if (t.getStem() != null) {
                token.set(StemAnnotation.class, t.getStem().getValue());
            }

            // NamedEntity
            // TODO: only token-based NEs are supported, but not multi-token NEs
            // Supporting multi-token NEs via selectCovering would be very slow. To support
            // them, another approach would need to be implemented, e.g. via indexCovering.
            List<NamedEntity> nes = selectCovered(NamedEntity.class, t);
            if (nes.size() > 0) {
                token.set(NamedEntityTagAnnotation.class, nes.get(0).getValue());
            } else {
                token.set(NamedEntityTagAnnotation.class, "O");
            }
        }

        // Constituents
        for (ROOT r : selectCovered(ROOT.class, s)) {
            Tree tree = createStanfordTree(r, idxTokens);
            tree.indexSpans();
            sentence.set(TreeAnnotation.class, tree);
        }

        // Dependencies
        List<TypedDependency> dependencies = new ArrayList<>();
        for (Dependency d : selectCovered(Dependency.class, s)) {
            TypedDependency dep = new TypedDependency(GrammaticalRelation.valueOf(d.getDependencyType()),
                    idxTokens.get(d.getGovernor()), idxTokens.get(d.getDependent()));
            if (DependencyFlavor.ENHANCED.equals(d.getFlavor())) {
                dep.setExtra();
            }
            dependencies.add(dep);
        }
        sentence.set(EnhancedDependenciesAnnotation.class, new SemanticGraph(dependencies));

        if (ptb3Escaping) {
            tokens = applyPtbEscaping(tokens, quoteBegin, quoteEnd);
        }

        sentence.set(TokensAnnotation.class, tokens);
        sentences.add(sentence);
    }
    aTarget.set(SentencesAnnotation.class, sentences);

    return aTarget;
}
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordCoreferenceResolver.java
License:Open Source License
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    modelProvider.configure(aJCas.getCas());

    List<Tree> trees = new ArrayList<Tree>();
    List<CoreMap> sentences = new ArrayList<CoreMap>();
    List<List<CoreLabel>> sentenceTokens = new ArrayList<List<CoreLabel>>();
    for (ROOT root : select(aJCas, ROOT.class)) {
        // Copy all relevant information from the tokens
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        for (Token token : selectCovered(Token.class, root)) {
            tokens.add(tokenToWord(token));
        }
        sentenceTokens.add(tokens);

        // SemanticHeadFinder (nonTerminalInfo) does not know about PRN0, so we have to replace
        // it with PRN to avoid NPEs.
        TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory()) {
            @Override
            public Tree newTreeNode(String aParent, List<Tree> aChildren) {
                String parent = aParent;
                if ("PRN0".equals(parent)) {
                    parent = "PRN";
                }
                Tree node = super.newTreeNode(parent, aChildren);
                return node;
            }
        };

        // Deep copy of the tree. These are modified inside coref!
        Tree treeCopy = TreeUtils.createStanfordTree(root, tFact).treeSkeletonCopy();
        treeCopy.indexSpans();
        trees.add(treeCopy);

        // Build the sentence
        CoreMap sentence = new CoreLabel();
        sentence.set(TreeAnnotation.class, treeCopy);
        sentence.set(TokensAnnotation.class, tokens);
        sentence.set(RootKey.class, root);
        sentences.add(sentence);

        // https://code.google.com/p/dkpro-core-asl/issues/detail?id=590
        // We currently do not copy over dependencies from the CAS. This is supposed to fill
        // in the dependencies so we do not get NPEs.
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(tlp.punctuationWordRejectFilter(),
                tlp.typedDependencyHeadFinder());
        ParserAnnotatorUtils.fillInParseAnnotations(false, true, gsf, sentence, treeCopy,
                GrammaticalStructure.Extras.NONE);

        // https://code.google.com/p/dkpro-core-asl/issues/detail?id=582
        SemanticGraph deps = sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        for (IndexedWord vertex : deps.vertexSet()) {
            vertex.setWord(vertex.value());
        }

        // Merge the new CoreLabels with the tree leaves
        MentionExtractor.mergeLabels(treeCopy, tokens);
        MentionExtractor.initializeUtterance(tokens);
    }

    Annotation document = new Annotation(aJCas.getDocumentText());
    document.set(SentencesAnnotation.class, sentences);

    Coreferencer coref = modelProvider.getResource();

    // Extract all possible mentions
    // Reparsing only works when the full CoreNLP pipeline system is set up! Passing false here
    // disables reparsing.
    RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder(false);
    List<List<Mention>> allUnprocessedMentions = finder.extractPredictedMentions(document, 0,
            coref.corefSystem.dictionaries());

    // Add the relevant info to mentions and order them for coref
    Map<Integer, CorefChain> result;
    try {
        Document doc = coref.mentionExtractor.arrange(document, sentenceTokens, trees, allUnprocessedMentions);
        result = coref.corefSystem.coref(doc);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    for (CorefChain chain : result.values()) {
        CoreferenceLink last = null;
        for (CorefMention mention : chain.getMentionsInTextualOrder()) {
            CoreLabel beginLabel = sentences.get(mention.sentNum - 1).get(TokensAnnotation.class)
                    .get(mention.startIndex - 1);
            CoreLabel endLabel = sentences.get(mention.sentNum - 1).get(TokensAnnotation.class)
                    .get(mention.endIndex - 2);
            CoreferenceLink link = new CoreferenceLink(aJCas, beginLabel.get(TokenKey.class).getBegin(),
                    endLabel.get(TokenKey.class).getEnd());

            if (mention.mentionType != null) {
                link.setReferenceType(mention.mentionType.toString());
            }

            if (last == null) {
                // This is the first mention. Here we'll initialize the chain
                CoreferenceChain corefChain = new CoreferenceChain(aJCas);
                corefChain.setFirst(link);
                corefChain.addToIndexes();
            } else {
                // For the other mentions, we'll add them to the chain.
                last.setNext(link);
            }
            last = link;

            link.addToIndexes();
        }
    }
}
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordDependencyConverter.java
License:Open Source License
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    String lang = language != null ? language : aJCas.getDocumentLanguage();

    if (!languagePacks.containsKey(lang)) {
        throw new AnalysisEngineProcessException(
                new IllegalStateException("Unsupported language [" + lang + "]"));
    }

    TreebankLanguagePack lp;
    try {
        lp = languagePacks.get(lang).newInstance();
    } catch (InstantiationException | IllegalAccessException e) {
        throw new AnalysisEngineProcessException(e);
    }

    List<CoreMap> sentences = new ArrayList<CoreMap>();
    for (ROOT root : select(aJCas, ROOT.class)) {
        // Copy all relevant information from the tokens
        List<Token> tokens = selectCovered(Token.class, root);
        List<CoreLabel> coreTokens = new ArrayList<CoreLabel>();
        for (Token token : tokens) {
            coreTokens.add(tokenToWord(token));
        }

        // SemanticHeadFinder (nonTerminalInfo) does not know about PRN0, so we have to replace
        // it with PRN to avoid NPEs.
        TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory()) {
            @Override
            public Tree newTreeNode(String aParent, List<Tree> aChildren) {
                String parent = aParent;
                if ("PRN0".equals(parent)) {
                    parent = "PRN";
                }
                Tree node = super.newTreeNode(parent, aChildren);
                return node;
            }
        };

        Tree tree = TreeUtils.createStanfordTree(root, tFact);
        Trees.convertToCoreLabels(tree);
        tree.indexSpans();

        // Build the sentence
        CoreMap sentence = new CoreLabel();
        sentence.set(TreeAnnotation.class, tree);
        sentence.set(TokensAnnotation.class, coreTokens);
        sentence.set(RootKey.class, root);
        sentences.add(sentence);

        doCreateDependencyTags(aJCas, lp, tree, tokens);
    }
}