List of usage examples for edu.stanford.nlp.util Triple third
T3 third
To view the source code for edu.stanford.nlp.util Triple third, click the Source Link below.
From source file:com.bericotech.clavin.nerd.StanfordExtractor.java
License:Open Source License
/** * Get extracted locations from a plain-text body. * /*from w w w . java 2 s . co m*/ * @param text Text content to perform extraction on. * @return List of Location Occurrences. */ public List<LocationOccurrence> extractLocationNames(String text) { if (text == null) throw new IllegalArgumentException("text input to extractLocationNames should not be null"); List<LocationOccurrence> extractedLocations = new ArrayList<LocationOccurrence>(); // extract entities as <Entity Type, Start Index, Stop Index> List<Triple<String, Integer, Integer>> extractedEntities = namedEntityRecognizer .classifyToCharacterOffsets(text); if (extractedEntities != null) { // iterate over each entity Triple for (Triple<String, Integer, Integer> extractedEntity : extractedEntities) { // check if the entity is a "Location" if (extractedEntity.first.equalsIgnoreCase("LOCATION")) { // build a LocationOccurrence object LocationOccurrence location = new LocationOccurrence( text.substring(extractedEntity.second(), extractedEntity.third()), extractedEntity.second()); // filter out demonyms if (!demonyms.contains(location.text)) // add it to the list of extracted locations extractedLocations.add(location); } } } return extractedLocations; }
From source file:com.geocode.service.impl.CustomExtractor.java
License:Open Source License
/**
 * Get extracted locations from a plain-text body.
 *
 * @param text Text content to perform extraction on.
 * @return List of Location Occurrences.
 */
public List<LocationOccurrence> extractLocationNames(String text) {
    if (text == null)
        throw new IllegalArgumentException("text input to extractLocationNames should not be null");
    List<LocationOccurrence> extractedLocations = new ArrayList<LocationOccurrence>();
    // extract entities as <Entity Type, Start Index, Stop Index>
    List<Triple<String, Integer, Integer>> extractedEntities = namedEntityRecognizer
            .classifyToCharacterOffsets(text);
    if (extractedEntities != null) {
        // iterate over each entity Triple
        for (Triple<String, Integer, Integer> extractedEntity : extractedEntities) {
            // NOTE(review): unlike the StanfordExtractor variant of this method, there is
            // no check that extractedEntity.first equals "LOCATION", so an occurrence is
            // built for EVERY entity type (PERSON, ORGANIZATION, ...) — confirm intended.
            // build a LocationOccurrence object
            LocationOccurrence location = new LocationOccurrence(
                    text.substring(extractedEntity.second(), extractedEntity.third()),
                    extractedEntity.second());
            // filter out demonyms
            if (!demonyms.contains(location.text))
                // add it to the list of extracted locations
                extractedLocations.add(location);
        }
    }
    return extractedLocations;
}
From source file:edu.stanford.muse.index.NEROld.java
License:Apache License
/**
 * Runs the NER classifier over documentText and returns the recognized tokens
 * (wrapped in a tokenizer) together with the raw entity character offsets.
 *
 * Each triple is a set of <entity, start char offset (inclusive), end char offset (not inclusive)>.
 * see http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/AbstractSequenceClassifier.html#classifyToCharacterOffsets(java.lang.String)
 *
 * @param documentText   text to classify
 * @param locationsOnly  if true, keep only LOCATION entities found in the locations map
 * @param orgsOnly       if true (and locationsOnly is false), keep only ORGANIZATION entities
 * @param locationCounts if non-null, tallies of lower-cased location mentions are added to it
 */
private synchronized static Pair<MyTokenizer, List<Triple<String, Integer, Integer>>> parseAndGetOffsets(
        String documentText, boolean locationsOnly, boolean orgsOnly, Map<String, Integer> locationCounts) {
    // 0xA0 (non-breaking space) is seen often and generates a lot of annoying messages
    if (documentText.indexOf("\u00A0") > 0)
        documentText = documentText.replaceAll("\\xA0", " ");

    // replace i18n chars with space, causes annoying NER messages + perhaps slows down NER?
    if (REMOVE_I18N_CHARS)
        documentText = cleanI18NChars(documentText);

    final List<Pair<String, String>> tokensList = new ArrayList<Pair<String, String>>();

    /* this does NER word by word, we prefer phrases, so use characterOffsets instead
    List<List<CoreLabel>> out = classifier.classify(documentText);
    for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
            String x = word.get(AnswerAnnotation.class);
            allTypes.add(x);
            if (x.equals("PERSON") || x.equals("ORGANIZATION") || x.equals("LOCATION")) {
                tokensList.add(word.word());
                System.out.toString(word.word() + '/' + word.get(AnswerAnnotation.class) + ' ');
            }
        }
        System.out.println();
    }
    */

    try {
        NER.initialize();
    } catch (Exception e) {
        Util.print_exception(e, log);
    }

    documentText = getSafeText(documentText);
    List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(documentText);
    for (Triple<String, Integer, Integer> t : triples) {
        String type = t.first();
        if (type == null)
            type = "UNKNOWN"; // we see type = null sometimes #!@#$
        allTypes.add(type);
        if (type.equals("PERSON") || type.equals("ORGANIZATION") || type.equals("LOCATION")) {
            String token = documentText.substring(t.second(), t.third());
            // we tend to see a lot of annoying [Hi Sam] or [Dear Caroline] phrases.
            // surprising NER can't handle it already — strip the salutation prefix.
            if (token.toLowerCase().startsWith("hi "))
                token = token.substring("hi ".length()).trim();
            if (token.toLowerCase().startsWith("dear "))
                token = token.substring("dear ".length()).trim();
            if (token.length() > MAX_NAME_LENGTH) // too long to be a plausible name — drop it
                continue;
            if (locationsOnly) {
                // keep only locations already present in the known-locations map
                if (type.equals("LOCATION")) {
                    if (locations.containsKey(token.toLowerCase()))
                        tokensList.add(new Pair<String, String>(token, type));
                }
            } else if (orgsOnly) {
                if (type.equals("ORGANIZATION"))
                    tokensList.add(new Pair<String, String>(token, type));
            } else {
                tokensList.add(new Pair<String, String>(token, type));
                // optionally tally location mentions (lower-cased) for the caller
                if (locationCounts != null && type.equals("LOCATION")) {
                    Integer I = locationCounts.get(token.toLowerCase());
                    locationCounts.put(token.toLowerCase(), (I == null) ? 1 : I + 1);
                }
            }
        }
        // System.out.println (t.first() + " : [" + t.second() + ":" + t.third() + "] " + documentText.substring(t.second(), t.third()));
    }
    return new Pair<MyTokenizer, List<Triple<String, Integer, Integer>>>(new NERTokenizer(tokensList), triples);
}
From source file:edu.stanford.muse.index.NEROld.java
License:Apache License
public static String retainOnlyNames(String text, List<Triple<String, Integer, Integer>> offsets) { if (offsets == null) return retainOnlyNames(text); // be forgiving int len = text.length(); offsets.add(new Triple<String, Integer, Integer>(null, len, len)); // sentinel int prev_name_end_pos = 0; // pos of first char after previous name StringBuilder result = new StringBuilder(); for (Triple<String, Integer, Integer> t : offsets) { int begin_pos = t.second(); int end_pos = t.third(); if (begin_pos > len || end_pos > len) { // TODO: this is unclean. currently happens because we concat body & title together when we previously generated these offsets but now we only have body. begin_pos = end_pos = len;//from w w w .j a va2 s .c om } String filler = text.substring(prev_name_end_pos, begin_pos); //filler = filler.replaceAll("\\w", "."); // CRITICAL: \w only matches (redacts) english language filler = filler.replaceAll("[^\\p{Punct}\\s]", "."); result.append(filler); result.append(text.substring(begin_pos, end_pos)); prev_name_end_pos = end_pos; } return result.toString(); }
From source file:edu.usc.polar.CoreNLP.java
public static void StanfordCoreNLP(String doc, String args[]) { try {/*from ww w. ja v a2 s.c o m*/ String text; AutoDetectParser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); if (args.length > 0) { serializedClassifier = args[0]; } if (args.length > 1) { String fileContents = IOUtils.slurpFile(args[1]); List<List<CoreLabel>> out = classifier.classify(fileContents); for (List<CoreLabel> sentence : out) { for (CoreLabel word : sentence) { System.out .print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' '); } System.out.println(); } out = classifier.classifyFile(args[1]); for (List<CoreLabel> sentence : out) { for (CoreLabel word : sentence) { System.out .print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' '); } System.out.println(); } } else { InputStream stream = new FileInputStream(doc); //ParsingExample.class.getResourceAsStream(doc) ; // System.out.println(stream.toString()); parser.parse(stream, handler, metadata); // return handler.toString(); text = handler.toString(); String metaValue = metadata.toString(); // System.out.println("Desc:: "+metadata.get("description")); String[] example = new String[1]; example[0] = text; String name = doc.replace("C:\\Users\\Snehal\\Documents\\TREC-Data\\Data", "polar.usc.edu") .replace("\\", "."); List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text); JSONObject jsonObj = new JSONObject(); jsonObj.put("DOI", name); jsonObj.put("metadata", metaValue.replaceAll("\\s\\s+|\n|\t", " ")); JSONArray tempArray = new JSONArray(); JSONObject tempObj = new JSONObject(); for (Triple<String, Integer, Integer> item : list) { // String jsonOut="{ DOI:"+name+" ," // + ""+item.first() + "\": \"" + text.substring(item.second(), item.third()).replaceAll("\\s\\s+|\n|\t"," ")+"\"" // + "\"metadata\":\""+metaValue+"\"" // + "}"; // System.out.println(jsonOut); 
tempObj.put(item.first(), text.substring(item.second(), item.third()).replaceAll("\\s\\s+|\n|\t", " ")); } tempArray.add(tempObj); jsonObj.put("NER", tempArray); jsonArray.add(jsonObj); } // System.out.println("---"); } catch (Exception e) { System.out.println("ERROR : CoreNLP" + "|File Name" + doc.replaceAll("C:\\Users\\Snehal\\Documents\\TREC-Data", "") + " direct" + e.toString()); } }
From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java
/**
 * Collects every substring of {@code text} that the classifier tags as a person.
 *
 * @param text input text to run NER over
 * @return person mentions, in order of appearance
 */
public List<String> getPersons(String text) {
    List<String> persons = new ArrayList<>();
    for (Triple<String, Integer, Integer> entity : classifier.classifyToCharacterOffsets(text)) {
        if (entity.first().equals(PERSON)) {
            persons.add(text.substring(entity.second(), entity.third()));
        }
    }
    return persons;
}
From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java
public List<String> getLocations(String text) { List<String> locations = new ArrayList<>(); List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text); for (Triple<String, Integer, Integer> item : list) { if (item.first().compareTo(LOCATION) == 0) { locations.add(text.substring(item.second(), item.third())); }//from w w w . j a v a 2s .co m } return locations; }
From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java
/**
 * Collects every substring of {@code text} that the classifier tags as an organization.
 *
 * @param text input text to run NER over
 * @return organization mentions, in order of appearance
 */
public List<String> getOrganizations(String text) {
    List<String> organizations = new ArrayList<>();
    for (Triple<String, Integer, Integer> entity : classifier.classifyToCharacterOffsets(text)) {
        if (entity.first().equals(ORGANIZATION)) {
            organizations.add(text.substring(entity.second(), entity.third()));
        }
    }
    return organizations;
}
From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java
/**
 * Extracts person, organization and location mentions from {@code text} in one pass.
 *
 * @param text input text to run NER over
 * @return map keyed by PERSON, ORGANIZATION and LOCATION; each value lists the
 *         mentions of that type in order of appearance (empty list if none)
 */
public Map<String, List<String>> getAll(String text) {
    Map<String, List<String>> result = new HashMap<>();
    result.put(PERSON, new LinkedList<String>());
    result.put(ORGANIZATION, new LinkedList<String>());
    result.put(LOCATION, new LinkedList<String>());
    for (Triple<String, Integer, Integer> entity : classifier.classifyToCharacterOffsets(text)) {
        String type = entity.first();
        String mention = text.substring(entity.second(), entity.third());
        if (type.equals(LOCATION)) {
            result.get(LOCATION).add(mention);
        } else if (type.equals(ORGANIZATION)) {
            result.get(ORGANIZATION).add(mention);
        } else if (type.equals(PERSON)) {
            result.get(PERSON).add(mention);
        }
    }
    return result;
}
From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java
/**
 * Returns a <mention text, start offset, end offset> triple for every person
 * entity the classifier finds in {@code text}.
 *
 * Fix: the original constructed a raw {@code new Triple(...)}, producing an
 * unchecked-conversion warning; the type arguments are now explicit.
 *
 * @param text input text to run NER over
 * @return triples of (person name, start char offset, end char offset)
 */
public List<Triple<String, Integer, Integer>> getPersonMarkers(String text) {
    List<Triple<String, Integer, Integer>> personsOnlyList = new ArrayList<>();
    List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text);
    for (Triple<String, Integer, Integer> item : list) {
        if (item.first().compareTo(PERSON) == 0) {
            String name = text.substring(item.second(), item.third());
            personsOnlyList.add(new Triple<String, Integer, Integer>(name, item.second(), item.third()));
        }
    }
    return personsOnlyList;
}