List of usage examples for edu.stanford.nlp.util.Triple#first
T1 first
To view the full source code for an example of edu.stanford.nlp.util.Triple#first, click its "Source" link below.
From source file:edu.stanford.muse.index.NEROld.java
License:Apache License
/** triple is a set of <entity, start char offset (inclusive), end char offset (not inclusive). * see http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/AbstractSequenceClassifier.html#classifyToCharacterOffsets(java.lang.String) *///from www .j av a 2 s .c o m private synchronized static Pair<MyTokenizer, List<Triple<String, Integer, Integer>>> parseAndGetOffsets( String documentText, boolean locationsOnly, boolean orgsOnly, Map<String, Integer> locationCounts) { if (documentText.indexOf("\u00A0") > 0) documentText = documentText.replaceAll("\\xA0", " "); // 0xA0 is seen often and generates a lot of annoying messages. // replace i18n chars with space, causes annoying NER messages + perhaps slows down NER? if (REMOVE_I18N_CHARS) documentText = cleanI18NChars(documentText); final List<Pair<String, String>> tokensList = new ArrayList<Pair<String, String>>(); /* this does NER word by word, we prefer phrases, so use characterOffsets instead List<List<CoreLabel>> out = classifier.classify(documentText); for (List<CoreLabel> sentence : out) { for (CoreLabel word : sentence) { String x = word.get(AnswerAnnotation.class); allTypes.add(x); if (x.equals("PERSON") || x.equals("ORGANIZATION") || x.equals("LOCATION")) { tokensList.add(word.word()); System.out.toString(word.word() + '/' + word.get(AnswerAnnotation.class) + ' '); } } System.out.println(); } */ try { NER.initialize(); } catch (Exception e) { Util.print_exception(e, log); } documentText = getSafeText(documentText); List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(documentText); for (Triple<String, Integer, Integer> t : triples) { String type = t.first(); if (type == null) type = "UNKNOWN"; // we see type = null sometimes #!@#$ allTypes.add(type); if (type.equals("PERSON") || type.equals("ORGANIZATION") || type.equals("LOCATION")) { String token = documentText.substring(t.second(), t.third()); // we tend to see a lot of annoying [Hi Sam] or [Dear Caroline] phrases. 
surprising NER can't handle it already. if (token.toLowerCase().startsWith("hi ")) token = token.substring("hi ".length()).trim(); if (token.toLowerCase().startsWith("dear ")) token = token.substring("dear ".length()).trim(); if (token.length() > MAX_NAME_LENGTH) // drop it continue; if (locationsOnly) { if (type.equals("LOCATION")) { if (locations.containsKey(token.toLowerCase())) tokensList.add(new Pair<String, String>(token, type)); } } else if (orgsOnly) { if (type.equals("ORGANIZATION")) tokensList.add(new Pair<String, String>(token, type)); } else { tokensList.add(new Pair<String, String>(token, type)); if (locationCounts != null && type.equals("LOCATION")) { Integer I = locationCounts.get(token.toLowerCase()); locationCounts.put(token.toLowerCase(), (I == null) ? 1 : I + 1); } } } // System.out.println (t.first() + " : [" + t.second() + ":" + t.third() + "] " + documentText.substring(t.second(), t.third())); } return new Pair<MyTokenizer, List<Triple<String, Integer, Integer>>>(new NERTokenizer(tokensList), triples); }
From source file:edu.usc.polar.CoreNLP.java
public static void StanfordCoreNLP(String doc, String args[]) { try {/*from w w w. ja va2 s . c o m*/ String text; AutoDetectParser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); if (args.length > 0) { serializedClassifier = args[0]; } if (args.length > 1) { String fileContents = IOUtils.slurpFile(args[1]); List<List<CoreLabel>> out = classifier.classify(fileContents); for (List<CoreLabel> sentence : out) { for (CoreLabel word : sentence) { System.out .print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' '); } System.out.println(); } out = classifier.classifyFile(args[1]); for (List<CoreLabel> sentence : out) { for (CoreLabel word : sentence) { System.out .print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' '); } System.out.println(); } } else { InputStream stream = new FileInputStream(doc); //ParsingExample.class.getResourceAsStream(doc) ; // System.out.println(stream.toString()); parser.parse(stream, handler, metadata); // return handler.toString(); text = handler.toString(); String metaValue = metadata.toString(); // System.out.println("Desc:: "+metadata.get("description")); String[] example = new String[1]; example[0] = text; String name = doc.replace("C:\\Users\\Snehal\\Documents\\TREC-Data\\Data", "polar.usc.edu") .replace("\\", "."); List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text); JSONObject jsonObj = new JSONObject(); jsonObj.put("DOI", name); jsonObj.put("metadata", metaValue.replaceAll("\\s\\s+|\n|\t", " ")); JSONArray tempArray = new JSONArray(); JSONObject tempObj = new JSONObject(); for (Triple<String, Integer, Integer> item : list) { // String jsonOut="{ DOI:"+name+" ," // + ""+item.first() + "\": \"" + text.substring(item.second(), item.third()).replaceAll("\\s\\s+|\n|\t"," ")+"\"" // + "\"metadata\":\""+metaValue+"\"" // + "}"; // System.out.println(jsonOut); 
tempObj.put(item.first(), text.substring(item.second(), item.third()).replaceAll("\\s\\s+|\n|\t", " ")); } tempArray.add(tempObj); jsonObj.put("NER", tempArray); jsonArray.add(jsonObj); } // System.out.println("---"); } catch (Exception e) { System.out.println("ERROR : CoreNLP" + "|File Name" + doc.replaceAll("C:\\Users\\Snehal\\Documents\\TREC-Data", "") + " direct" + e.toString()); } }
From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java
public List<String> getPersons(String text) { List<String> persons = new ArrayList<>(); List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text); for (Triple<String, Integer, Integer> item : list) { if (item.first().compareTo(PERSON) == 0) { persons.add(text.substring(item.second(), item.third())); }//from w w w. j a v a2 s . c o m } return persons; }
From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java
public List<String> getLocations(String text) { List<String> locations = new ArrayList<>(); List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text); for (Triple<String, Integer, Integer> item : list) { if (item.first().compareTo(LOCATION) == 0) { locations.add(text.substring(item.second(), item.third())); }//from www. jav a2 s .c om } return locations; }
From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java
public List<String> getOrganizations(String text) { List<String> organizations = new ArrayList<>(); List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text); for (Triple<String, Integer, Integer> item : list) { if (item.first().compareTo(ORGANIZATION) == 0) { organizations.add(text.substring(item.second(), item.third())); }/*www. j av a 2 s . com*/ } return organizations; }
From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java
public Map<String, List<String>> getAll(String text) { Map<String, List<String>> result = new HashMap<>(); result.put(PERSON, new LinkedList<String>()); result.put(ORGANIZATION, new LinkedList<String>()); result.put(LOCATION, new LinkedList<String>()); List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text); for (Triple<String, Integer, Integer> item : list) { if (item.first().compareTo(LOCATION) == 0) { result.get(LOCATION).add(text.substring(item.second(), item.third())); } else if (item.first().compareTo(ORGANIZATION) == 0) { result.get(ORGANIZATION).add(text.substring(item.second(), item.third())); } else if (item.first().compareTo(PERSON) == 0) { result.get(PERSON).add(text.substring(item.second(), item.third())); }/* w w w. j ava 2 s .c om*/ } return result; }
From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java
public List<Triple<String, Integer, Integer>> getPersonMarkers(String text) { List<Triple<String, Integer, Integer>> personsOnlyList = new ArrayList<>(); List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text); for (Triple<String, Integer, Integer> item : list) { if (item.first().compareTo(PERSON) == 0) { String name = text.substring(item.second(), item.third()); personsOnlyList.add(new Triple(name, item.second(), item.third())); }//ww w. j av a 2s . c o m } return personsOnlyList; }
From source file:fire.NERDemo.java
public static void main(String[] args) throws Exception { String serializedClassifier = "C:\\Users\\DIPANAKR\\Desktop\\Satanu\\fire\\stanford-ner-2015-04-20\\stanford-ner-2015-04-20\\classifiers\\english.all.3class.distsim.crf.ser.gz"; if (args.length > 0) { serializedClassifier = args[0];//from w ww .ja v a 2s . co m } AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(serializedClassifier); /* For either a file to annotate or for the hardcoded text example, this demo file shows several ways to process the input, for teaching purposes. */ if (args.length > 1) { /* For the file, it shows (1) how to run NER on a String, (2) how to get the entities in the String with character offsets, and (3) how to run NER on a whole file (without loading it into a String). */ String fileContents = IOUtils.slurpFile(args[1]); List<List<CoreLabel>> out = classifier.classify(fileContents); for (List<CoreLabel> sentence : out) { for (CoreLabel word : sentence) { System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' '); } System.out.println(); } System.out.println("---"); out = classifier.classifyFile(args[1]); for (List<CoreLabel> sentence : out) { for (CoreLabel word : sentence) { System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' '); } System.out.println(); } System.out.println("---"); List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(fileContents); for (Triple<String, Integer, Integer> item : list) { System.out.println(item.first() + ": " + fileContents.substring(item.second(), item.third())); } System.out.println("---"); System.out.println("Ten best entity labelings"); DocumentReaderAndWriter<CoreLabel> readerAndWriter = classifier.makePlainTextReaderAndWriter(); classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter); System.out.println("---"); System.out.println("Per-token marginalized probabilities"); 
classifier.printProbs(args[1], readerAndWriter); // -- This code prints out the first order (token pair) clique probabilities. // -- But that output is a bit overwhelming, so we leave it commented out by default. // System.out.println("---"); // System.out.println("First Order Clique Probabilities"); // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter); } else { /* For the hard-coded String, it shows how to run it on a single sentence, and how to do this and produce several formats, including slash tags and an inline XML output format. It also shows the full contents of the {@code CoreLabel}s that are constructed by the classifier. And it shows getting out the probabilities of different assignments and an n-best list of classifications with probabilities. */ String[] example = { "Good afternoon Rajat Raina, how are you today?", "I go to school at Stanford University, which is located in California." }; for (String str : example) { System.out.println(classifier.classifyToString(str)); } System.out.println("---"); for (String str : example) { // This one puts in spaces and newlines between tokens, so just print not println. System.out.print(classifier.classifyToString(str, "slashTags", false)); } System.out.println("---"); for (String str : example) { // This one is best for dealing with the output as a TSV (tab-separated column) file. 
// The first column gives entities, the second their classes, and the third the remaining text in a document System.out.print(classifier.classifyToString(str, "tabbedEntities", false)); } System.out.println("---"); for (String str : example) { System.out.println(classifier.classifyWithInlineXML(str)); } System.out.println("---"); for (String str : example) { System.out.println(classifier.classifyToString(str, "xml", true)); } System.out.println("---"); for (String str : example) { System.out.print(classifier.classifyToString(str, "tsv", false)); } System.out.println("---"); // This gets out entities with character offsets int j = 0; for (String str : example) { j++; List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(str); for (Triple<String, Integer, Integer> trip : triples) { System.out.printf("%s over character offsets [%d, %d) in sentence %d.%n", trip.first(), trip.second(), trip.third, j); } } System.out.println("---"); // This prints out all the details of what is stored for each token int i = 0; for (String str : example) { for (List<CoreLabel> lcl : classifier.classify(str)) { for (CoreLabel cl : lcl) { System.out.print(i++ + ": "); System.out.println(cl.toShorterString()); } } } System.out.println("---"); } }
From source file:org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnescorer.StanfordNlpNeScorerNodeModel.java
License:Open Source License
/** * {@inheritDoc}//from w ww.j ava 2 s . co m */ @Override protected PortObject[] execute(final PortObject[] inObjects, final ExecutionContext exec) throws Exception { m_inputModelPortObject = (StanfordNERModelPortObject) inObjects[1]; m_inputModel = m_inputModelPortObject.getNERModel(); m_usedDict = m_inputModelPortObject.getDictSet(); m_tag = m_inputModelPortObject.getTag(); m_tokenizerName = m_inputModelPortObject.getTokenizerName(); //create a BufferedDataContainer for the scoring values BufferedDataContainer accTable = exec.createDataContainer(new DataTableSpec(QUALITY_MEASURES_SPECS)); // build pattern set from dictionary DataTableSpec docTableSpec = (DataTableSpec) inObjects[0].getSpec(); BufferedDataTable docDataInput = (BufferedDataTable) inObjects[0]; Set<Pattern> knownEntitiesPatternSet = new LinkedHashSet<Pattern>(); for (String word : m_usedDict) { knownEntitiesPatternSet.add(Pattern.compile(word)); } // create dictionary tagger to tag the input documents with the dictionary used for building the model MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet, m_tag, true, m_tokenizerName); // create UUID to add them to the file path to avoid cases where two instances of the node model used the same file path at the same time String tempDir = KNIMEConstants.getKNIMETempDir() + "/"; String m_annotatedTestFilePath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv"; // create the annotated test file File m_annotatedTestFile = new File(m_annotatedTestFilePath); PrintWriter sentenceFileWriter = new PrintWriter(m_annotatedTestFile, "UTF-8"); int missingValueCounter = 0; // tag documents and transform sentences to strings while tagged terms get StanfordNLP annotation // iterate through columns for (int i = 0; i < docTableSpec.getNumColumns(); i++) { // iterate through rows if column with correct name has been found if (docTableSpec.getColumnSpec(i).getName().equals(m_docColumnModel.getStringValue())) { int 
counter = 0; Set<String> countMultiWordTerms = new HashSet<String>(); for (DataRow row : docDataInput) { //set progress bar counter++; double progress = (counter / (double) docDataInput.size()) / (3.0); exec.setProgress(progress, "Preparing documents for validation"); exec.checkCanceled(); if (!row.getCell(i).isMissing() && row.getCell(i).getType().isCompatible(DocumentValue.class)) { Document doc = ((DocumentValue) row.getCell(i)).getDocument(); Document taggedDoc = tagger.tag(doc); Iterator<Sentence> si = taggedDoc.sentenceIterator(); while (si.hasNext()) { Sentence s = si.next(); List<Term> termList = s.getTerms(); Iterator<Term> ti = termList.iterator(); while (ti.hasNext()) { Term t = ti.next(); String termText = t.getText(); String termTextWithWsSuffix = t.getTextWithWsSuffix(); if (m_usedDict.contains(termText) || m_usedDict.contains(termTextWithWsSuffix)) { if (t.getWords().size() > 1) { // multi-word terms should not be written in one line in the training file countMultiWordTerms.add(t.getText()); // so skip it by splitting the term and writing each word in one line for (Word w : t.getWords()) { sentenceFileWriter.println(w.getText() + "\t" + m_tag.getTagValue()); } } else { sentenceFileWriter.println(termText + "\t" + m_tag.getTagValue()); } } else if (!m_usedDict.contains(termText) || !m_usedDict.contains(termTextWithWsSuffix)) { sentenceFileWriter.println(termText + "\tO"); } } } } else { missingValueCounter++; } } } } if (missingValueCounter == 1) { setWarningMessage(missingValueCounter + " row has been ignored due to missing value."); } else if (missingValueCounter > 1) { setWarningMessage(missingValueCounter + " rows have been ignored due to missing values."); } sentenceFileWriter.close(); exec.setProgress(0.5, "Validate model"); // create logger configuration and catch the scores which will be printed to the log file File tmpLogFile = new File(KNIMEConstants.getKNIMETempDir() + "/scores.log"); RedwoodConfiguration conf = 
RedwoodConfiguration.empty(); conf.handlers(Handlers.chain(Handlers.hideDebug, Handlers.file(tmpLogFile))).apply(); // classify the documents with our model DocumentReaderAndWriter<CoreLabel> raw = m_inputModel.makeReaderAndWriter(); Triple<Double, Double, Double> prfScores = m_inputModel.classifyAndWriteAnswers(m_annotatedTestFilePath, new ByteArrayOutputStream(), raw, true); DataRow stats = new DefaultRow(new RowKey("Row0"), new DataCell[] { DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell() }); ReversedLinesFileReader logReader = new ReversedLinesFileReader(tmpLogFile, StandardCharsets.UTF_8); try { // get values from output stream String[] scores = logReader.readLine().split("\t"); if (scores.length >= 7) { Double precision = prfScores.first() / 100; Double recall = prfScores.second() / 100; Double f1 = prfScores.third() / 100; int tp = Integer.parseInt(scores[4].trim()); int fp = Integer.parseInt(scores[5].trim()); int fn = Integer.parseInt(scores[6].trim()); // create the scores row and add it to the BufferedDataContainer we created in the beginning stats = new DefaultRow(new RowKey("Row0"), new DataCell[] { new DoubleCell(precision), new DoubleCell(recall), new DoubleCell(f1), new IntCell(tp), new IntCell(fp), new IntCell(fn) }); if (tp == 0 && fp == 0 && fn == 0 && precision == 0 && recall == 1 && f1 == 0) { setWarningMessage("Could not parse quality measures of model validation."); } } } catch (NumberFormatException e) { setWarningMessage("Could not parse quality measures of model validation."); } finally { logReader.close(); tmpLogFile.delete(); m_annotatedTestFile.delete(); } accTable.addRowToTable(stats); accTable.close(); return new BufferedDataTable[] { accTable.getTable() }; }
From source file:org.wso2.toolbox.nlp.CountryFunctionExecutor.java
License:Open Source License
public String recognizeEntity(String locationStr) { String countryCode = null;/*w ww . ja va 2 s . c o m*/ if (locationStr != null) { String locationUpperStr = locationStr.toUpperCase(); List<Triple<String, Integer, Integer>> items = countryCodeTable.getClassifier() .classifyToCharacterOffsets(locationUpperStr); for (Triple<String, Integer, Integer> item : items) { if ("Location".equalsIgnoreCase(item.first())) { String countryName = locationUpperStr.substring(item.second, item.third); if (countryCodeTable.getCountryCodeList().contains(countryName)) { countryCode = countryName; } else { countryCode = (String) countryCodeTable.getCountryCode(countryName); } } if (countryCode != null) { break; } } } if (countryCode != null) { return countryCode; } else { return ""; } }