Example usage for edu.stanford.nlp.util Triple second

List of usage examples for edu.stanford.nlp.util Triple second

Introduction

On this page you can find example usage for edu.stanford.nlp.util Triple second.

Prototype

T2 second

To view the source code for edu.stanford.nlp.util Triple second, click the Source Link below.

Click Source Link

Usage

From source file:com.bericotech.clavin.nerd.StanfordExtractor.java

License:Open Source License

/**
 * Extracts location occurrences from a plain-text body.
 *
 * @param text      Text content to perform extraction on.
 * @return          List of Location Occurrences.
 */
public List<LocationOccurrence> extractLocationNames(String text) {
    if (text == null)
        throw new IllegalArgumentException("text input to extractLocationNames should not be null");

    List<LocationOccurrence> occurrences = new ArrayList<LocationOccurrence>();

    // entities arrive as <Entity Type, Start Offset, End Offset> triples
    List<Triple<String, Integer, Integer>> entities = namedEntityRecognizer
            .classifyToCharacterOffsets(text);

    if (entities == null) {
        return occurrences;
    }

    for (Triple<String, Integer, Integer> entity : entities) {
        // only LOCATION-typed entities are of interest here
        if (entity.first.equalsIgnoreCase("LOCATION")) {
            LocationOccurrence candidate = new LocationOccurrence(
                    text.substring(entity.second(), entity.third()),
                    entity.second());
            // demonyms ("American", "French", ...) are not places — drop them
            if (!demonyms.contains(candidate.text)) {
                occurrences.add(candidate);
            }
        }
    }

    return occurrences;
}

From source file:com.geocode.service.impl.CustomExtractor.java

License:Open Source License

/**
 * Extracts location occurrences from a plain-text body.
 *
 * NOTE(review): no entity-type check is applied here — every entity the
 * recognizer returns (not just LOCATIONs) becomes a LocationOccurrence,
 * subject only to the demonym filter. Confirm this is intentional.
 *
 * @param text      Text content to perform extraction on.
 * @return          List of Location Occurrences.
 */
public List<LocationOccurrence> extractLocationNames(String text) {
    if (text == null)
        throw new IllegalArgumentException("text input to extractLocationNames should not be null");

    List<LocationOccurrence> occurrences = new ArrayList<LocationOccurrence>();

    // entities arrive as <Entity Type, Start Offset, End Offset> triples
    List<Triple<String, Integer, Integer>> entities = namedEntityRecognizer
            .classifyToCharacterOffsets(text);

    if (entities == null) {
        return occurrences;
    }

    for (Triple<String, Integer, Integer> entity : entities) {
        LocationOccurrence candidate = new LocationOccurrence(
                text.substring(entity.second(), entity.third()),
                entity.second());
        // demonyms ("American", "French", ...) are not places — drop them
        if (!demonyms.contains(candidate.text)) {
            occurrences.add(candidate);
        }
    }

    return occurrences;
}

From source file:edu.stanford.muse.index.NEROld.java

License:Apache License

/**
 * Runs the Stanford NER classifier over {@code documentText} and builds a tokenizer
 * over the recognized PERSON / ORGANIZATION / LOCATION phrases.
 *
 * Each returned triple is <entity type, start char offset (inclusive), end char offset (exclusive)>.
 * See http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/AbstractSequenceClassifier.html#classifyToCharacterOffsets(java.lang.String)
 *
 * Fix: the NBSP check used {@code indexOf("\u00A0") > 0}, which missed a
 * non-breaking space at position 0 and skipped the cleanup entirely; it is
 * now {@code >= 0}.
 *
 * @param documentText   text to classify; non-breaking spaces (and, optionally, other
 *                       i18n characters) are replaced first since they generate noisy
 *                       classifier messages
 * @param locationsOnly  keep only LOCATION entities that also appear in the static
 *                       {@code locations} map
 * @param orgsOnly       keep only ORGANIZATION entities (checked only when
 *                       {@code locationsOnly} is false)
 * @param locationCounts if non-null (and neither *Only flag is set), occurrence counts
 *                       of LOCATION tokens are accumulated here, keyed by lowercased token
 * @return pair of (tokenizer over the kept entity tokens, the full triple list from
 *         the classifier)
 */
private synchronized static Pair<MyTokenizer, List<Triple<String, Integer, Integer>>> parseAndGetOffsets(
        String documentText, boolean locationsOnly, boolean orgsOnly, Map<String, Integer> locationCounts) {
    // >= 0, not > 0: a NBSP at index 0 must also trigger the cleanup
    if (documentText.indexOf("\u00A0") >= 0)
        documentText = documentText.replaceAll("\\xA0", " "); // 0xA0 is seen often and generates a lot of annoying messages.

    // replace i18n chars with space, causes annoying NER messages + perhaps slows down NER?
    if (REMOVE_I18N_CHARS)
        documentText = cleanI18NChars(documentText);

    final List<Pair<String, String>> tokensList = new ArrayList<Pair<String, String>>();

    try {
        NER.initialize();
    } catch (Exception e) {
        Util.print_exception(e, log);
    }

    documentText = getSafeText(documentText);
    // classify by character offsets rather than word-by-word so we get whole phrases
    List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(documentText);
    for (Triple<String, Integer, Integer> t : triples) {
        String type = t.first();
        if (type == null)
            type = "UNKNOWN"; // the classifier occasionally returns a null type
        allTypes.add(type);
        if (type.equals("PERSON") || type.equals("ORGANIZATION") || type.equals("LOCATION")) {
            String token = documentText.substring(t.second(), t.third());
            // strip salutation prefixes ("Hi Sam", "Dear Caroline") that NER fails to drop itself
            if (token.toLowerCase().startsWith("hi "))
                token = token.substring("hi ".length()).trim();
            if (token.toLowerCase().startsWith("dear "))
                token = token.substring("dear ".length()).trim();
            if (token.length() > MAX_NAME_LENGTH) // overly long "names" are classifier noise — drop them
                continue;
            if (locationsOnly) {
                // keep only locations that are known to the static locations map
                if (type.equals("LOCATION")) {
                    if (locations.containsKey(token.toLowerCase()))
                        tokensList.add(new Pair<String, String>(token, type));
                }
            } else if (orgsOnly) {
                if (type.equals("ORGANIZATION"))
                    tokensList.add(new Pair<String, String>(token, type));
            } else {
                tokensList.add(new Pair<String, String>(token, type));
                // optional per-location frequency tally for the caller
                if (locationCounts != null && type.equals("LOCATION")) {
                    Integer I = locationCounts.get(token.toLowerCase());
                    locationCounts.put(token.toLowerCase(), (I == null) ? 1 : I + 1);
                }
            }
        }
    }

    return new Pair<MyTokenizer, List<Triple<String, Integer, Integer>>>(new NERTokenizer(tokensList), triples);
}

From source file:edu.stanford.muse.index.NEROld.java

License:Apache License

/**
 * Redacts everything in {@code text} except the name spans given by {@code offsets}:
 * each non-name character that is not punctuation or whitespace is replaced with '.'.
 *
 * Fix: the original appended a sentinel triple to the caller's {@code offsets} list,
 * mutating the argument; this version leaves the list untouched and handles the
 * trailing segment explicitly.
 *
 * @param text    original text
 * @param offsets entity spans as <type, begin (inclusive), end (exclusive)> character
 *                offsets; may be null
 * @return redacted text of the same length as the input
 */
public static String retainOnlyNames(String text, List<Triple<String, Integer, Integer>> offsets) {
    if (offsets == null)
        return retainOnlyNames(text); // be forgiving

    int len = text.length();
    int prev_name_end_pos = 0; // pos of first char after previous name
    StringBuilder result = new StringBuilder(len);
    for (Triple<String, Integer, Integer> t : offsets) {
        int begin_pos = t.second();
        int end_pos = t.third();
        if (begin_pos > len || end_pos > len) {
            // TODO: this is unclean. currently happens because we concat body & title together when we previously generated these offsets but now we only have body.
            begin_pos = end_pos = len;
        }
        result.append(redactNonNames(text.substring(prev_name_end_pos, begin_pos)));
        result.append(text.substring(begin_pos, end_pos)); // the name itself, kept verbatim
        prev_name_end_pos = end_pos;
    }
    // trailing segment after the last name (the original used a sentinel triple for this)
    result.append(redactNonNames(text.substring(prev_name_end_pos)));

    return result.toString();
}

/**
 * Replaces every non-punctuation, non-whitespace character with '.'.
 * ([^\p{Punct}\s] rather than \w so that non-English letters are redacted too.)
 */
private static String redactNonNames(String filler) {
    return filler.replaceAll("[^\\p{Punct}\\s]", ".");
}

From source file:edu.usc.polar.CoreNLP.java

/**
 * Runs Stanford NER over a document. With extra CLI args, classifies the given file
 * and prints word/tag pairs to stdout; otherwise parses {@code doc} with Tika,
 * classifies the extracted text, and appends a JSON record (DOI, metadata, NER
 * entities) to the static {@code jsonArray}.
 *
 * Fixes: the FileInputStream is now closed via try-with-resources (it leaked);
 * the catch block used {@code replaceAll} with a Windows path as the pattern —
 * "\U" is an illegal regex escape, so formatting the error message itself threw —
 * it now uses literal {@code replace}; the unused local {@code example} was removed.
 *
 * @param doc  path of the document to parse and classify
 * @param args optional: args[0] = serialized classifier path, args[1] = file to classify
 */
public static void StanfordCoreNLP(String doc, String args[]) {
    try {
        String text;
        AutoDetectParser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        if (args.length > 0) {
            serializedClassifier = args[0];
        }

        if (args.length > 1) {
            // classify the raw file contents, word by word, to stdout
            String fileContents = IOUtils.slurpFile(args[1]);
            List<List<CoreLabel>> out = classifier.classify(fileContents);
            for (List<CoreLabel> sentence : out) {
                for (CoreLabel word : sentence) {
                    System.out
                            .print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
                }
                System.out.println();
            }

            // and again via classifyFile (same output shape)
            out = classifier.classifyFile(args[1]);
            for (List<CoreLabel> sentence : out) {
                for (CoreLabel word : sentence) {
                    System.out
                            .print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
                }
                System.out.println();
            }

        } else {

            // parse the document with Tika; close the stream even if parsing fails
            try (InputStream stream = new FileInputStream(doc)) {
                parser.parse(stream, handler, metadata);
            }
            text = handler.toString();
            String metaValue = metadata.toString();

            // derive a DOI-ish name from the local path
            String name = doc.replace("C:\\Users\\Snehal\\Documents\\TREC-Data\\Data", "polar.usc.edu")
                    .replace("\\", ".");
            List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text);
            JSONObject jsonObj = new JSONObject();
            jsonObj.put("DOI", name);
            jsonObj.put("metadata", metaValue.replaceAll("\\s\\s+|\n|\t", " "));
            JSONArray tempArray = new JSONArray();
            JSONObject tempObj = new JSONObject();
            // NOTE(review): one JSON object is shared by all entities, so entities of the
            // same type overwrite each other — confirm whether one object per entity was intended.
            for (Triple<String, Integer, Integer> item : list) {
                tempObj.put(item.first(),
                        text.substring(item.second(), item.third()).replaceAll("\\s\\s+|\n|\t", " "));
            }
            tempArray.add(tempObj);
            jsonObj.put("NER", tempArray);
            jsonArray.add(jsonObj);
        }

    } catch (Exception e) {
        // replace(), not replaceAll(): the path is a literal, not a regex ("\U" is illegal in a pattern)
        System.out.println("ERROR : CoreNLP" + "|File Name"
                + doc.replace("C:\\Users\\Snehal\\Documents\\TREC-Data", "") + " direct" + e.toString());
    }
}

From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java

/**
 * Returns the surface strings of all PERSON entities found in the text.
 *
 * @param text text to run the classifier over
 * @return person mentions, in document order (may contain duplicates)
 */
public List<String> getPersons(String text) {
    List<String> persons = new ArrayList<>();
    // classifier returns <type, begin, end> character-offset triples
    for (Triple<String, Integer, Integer> entity : classifier.classifyToCharacterOffsets(text)) {
        if (entity.first().equals(PERSON)) {
            persons.add(text.substring(entity.second(), entity.third()));
        }
    }
    return persons;
}

From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java

/**
 * Returns the surface strings of all LOCATION entities found in the text.
 *
 * @param text text to run the classifier over
 * @return location mentions, in document order (may contain duplicates)
 */
public List<String> getLocations(String text) {
    List<String> locations = new ArrayList<>();
    // classifier returns <type, begin, end> character-offset triples
    for (Triple<String, Integer, Integer> entity : classifier.classifyToCharacterOffsets(text)) {
        if (entity.first().equals(LOCATION)) {
            locations.add(text.substring(entity.second(), entity.third()));
        }
    }
    return locations;
}

From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java

/**
 * Returns the surface strings of all ORGANIZATION entities found in the text.
 *
 * @param text text to run the classifier over
 * @return organization mentions, in document order (may contain duplicates)
 */
public List<String> getOrganizations(String text) {
    List<String> organizations = new ArrayList<>();
    // classifier returns <type, begin, end> character-offset triples
    for (Triple<String, Integer, Integer> entity : classifier.classifyToCharacterOffsets(text)) {
        if (entity.first().equals(ORGANIZATION)) {
            organizations.add(text.substring(entity.second(), entity.third()));
        }
    }
    return organizations;
}

From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java

/**
 * Extracts PERSON, ORGANIZATION and LOCATION entities from the text in one pass.
 *
 * @param text text to run the classifier over
 * @return map from entity type (PERSON/ORGANIZATION/LOCATION) to the mentions of
 *         that type, in document order; all three keys are always present
 */
public Map<String, List<String>> getAll(String text) {

    Map<String, List<String>> result = new HashMap<>();
    result.put(PERSON, new LinkedList<String>());
    result.put(ORGANIZATION, new LinkedList<String>());
    result.put(LOCATION, new LinkedList<String>());
    // classifier returns <type, begin, end> character-offset triples
    for (Triple<String, Integer, Integer> entity : classifier.classifyToCharacterOffsets(text)) {
        String type = entity.first();
        if (type.equals(LOCATION)) {
            result.get(LOCATION).add(text.substring(entity.second(), entity.third()));
        } else if (type.equals(ORGANIZATION)) {
            result.get(ORGANIZATION).add(text.substring(entity.second(), entity.third()));
        } else if (type.equals(PERSON)) {
            result.get(PERSON).add(text.substring(entity.second(), entity.third()));
        }
        // other entity types are ignored
    }
    return result;
}

From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java

/**
 * Returns <name, begin, end> triples for all PERSON entities in the text, where the
 * first element is the entity's surface string (rather than its type tag).
 *
 * Fix: the result Triple was constructed as a raw type (unchecked conversion);
 * the diamond form keeps it type-safe.
 *
 * @param text text to run the classifier over
 * @return person mentions with their character offsets (begin inclusive, end exclusive)
 */
public List<Triple<String, Integer, Integer>> getPersonMarkers(String text) {
    List<Triple<String, Integer, Integer>> personsOnlyList = new ArrayList<>();
    List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text);
    for (Triple<String, Integer, Integer> item : list) {
        if (item.first().compareTo(PERSON) == 0) {
            String name = text.substring(item.second(), item.third());
            personsOnlyList.add(new Triple<>(name, item.second(), item.third()));
        }
    }
    return personsOnlyList;
}