Example usage for edu.stanford.nlp.util Triple first


Introduction

This page collects example usages of first() from edu.stanford.nlp.util.Triple, drawn from open-source projects.

Prototype

T1 first
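
Before the full project excerpts below, here is a minimal, self-contained sketch of how first() is typically read off the Triples returned by classifyToCharacterOffsets. The model path and input text are placeholders, not taken from the examples on this page.

// Sketch: reading Triple.first() from NER character-offset output.
// The classifier path below is a placeholder; substitute any serialized CRF model.
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.Triple;

import java.util.List;

public class TripleFirstDemo {
    public static void main(String[] args) throws Exception {
        AbstractSequenceClassifier<CoreLabel> classifier =
                CRFClassifier.getClassifier("classifiers/english.all.3class.distsim.crf.ser.gz");
        String text = "Jane Smith works at Stanford University in California.";
        List<Triple<String, Integer, Integer>> spans = classifier.classifyToCharacterOffsets(text);
        // Each Triple is <entity type, start offset (inclusive), end offset (exclusive)>.
        for (Triple<String, Integer, Integer> t : spans) {
            String type = t.first();                              // e.g. PERSON, ORGANIZATION, LOCATION
            String mention = text.substring(t.second(), t.third());
            System.out.println(type + ": " + mention);
        }
    }
}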


Usage

From source file:edu.stanford.muse.index.NEROld.java

License:Apache License

/** Each triple is <entity type, start char offset (inclusive), end char offset (exclusive)>.
 * see http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/AbstractSequenceClassifier.html#classifyToCharacterOffsets(java.lang.String)
 */
private synchronized static Pair<MyTokenizer, List<Triple<String, Integer, Integer>>> parseAndGetOffsets(
        String documentText, boolean locationsOnly, boolean orgsOnly, Map<String, Integer> locationCounts) {
    if (documentText.indexOf("\u00A0") >= 0) // >= 0: a non-breaking space at index 0 must also be caught
        documentText = documentText.replaceAll("\\xA0", " "); // 0xA0 (non-breaking space) is seen often and generates a lot of annoying messages.

    // replace i18n chars with space, causes annoying NER messages + perhaps slows down NER?
    if (REMOVE_I18N_CHARS)
        documentText = cleanI18NChars(documentText);

    final List<Pair<String, String>> tokensList = new ArrayList<Pair<String, String>>();

    /* This does NER word by word; we prefer phrases, so we use character offsets instead.
    List<List<CoreLabel>> out = classifier.classify(documentText);
    for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
            String x = word.get(AnswerAnnotation.class);
            allTypes.add(x);
            if (x.equals("PERSON") || x.equals("ORGANIZATION") || x.equals("LOCATION")) {
                tokensList.add(word.word());
                System.out.print(word.word() + '/' + word.get(AnswerAnnotation.class) + ' ');
            }
        }
        System.out.println();
    }
    */

    try {
        NER.initialize();
    } catch (Exception e) {
        Util.print_exception(e, log);
    }

    documentText = getSafeText(documentText);
    List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(documentText);
    for (Triple<String, Integer, Integer> t : triples) {
        String type = t.first();
        if (type == null)
            type = "UNKNOWN"; // we see type = null sometimes #!@#$
        allTypes.add(type);
        if (type.equals("PERSON") || type.equals("ORGANIZATION") || type.equals("LOCATION")) {
            String token = documentText.substring(t.second(), t.third());
            // we tend to see a lot of annoying [Hi Sam] or [Dear Caroline] phrases; surprising that NER doesn't already handle these.
            if (token.toLowerCase().startsWith("hi "))
                token = token.substring("hi ".length()).trim();
            if (token.toLowerCase().startsWith("dear "))
                token = token.substring("dear ".length()).trim();
            if (token.length() > MAX_NAME_LENGTH) // drop it
                continue;
            if (locationsOnly) {
                if (type.equals("LOCATION")) {
                    if (locations.containsKey(token.toLowerCase()))
                        tokensList.add(new Pair<String, String>(token, type));
                }
            } else if (orgsOnly) {
                if (type.equals("ORGANIZATION"))
                    tokensList.add(new Pair<String, String>(token, type));
            } else {
                tokensList.add(new Pair<String, String>(token, type));
                if (locationCounts != null && type.equals("LOCATION")) {
                    Integer I = locationCounts.get(token.toLowerCase());
                    locationCounts.put(token.toLowerCase(), (I == null) ? 1 : I + 1);
                }
            }
        }

        //          System.out.println (t.first() + " : [" + t.second() + ":" + t.third() + "] " + documentText.substring(t.second(), t.third()));
    }

    return new Pair<MyTokenizer, List<Triple<String, Integer, Integer>>>(new NERTokenizer(tokensList), triples);
}

From source file:edu.usc.polar.CoreNLP.java

public static void StanfordCoreNLP(String doc, String args[]) {
    try {
        String text;
        AutoDetectParser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        if (args.length > 0) {
            serializedClassifier = args[0];
        }

        if (args.length > 1) {
            String fileContents = IOUtils.slurpFile(args[1]);
            List<List<CoreLabel>> out = classifier.classify(fileContents);
            for (List<CoreLabel> sentence : out) {
                for (CoreLabel word : sentence) {
                    System.out
                            .print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
                }
                System.out.println();
            }

            out = classifier.classifyFile(args[1]);
            for (List<CoreLabel> sentence : out) {
                for (CoreLabel word : sentence) {
                    System.out
                            .print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
                }
                System.out.println();
            }

        } else {

            InputStream stream = new FileInputStream(doc);
            //ParsingExample.class.getResourceAsStream(doc) ;
            //   System.out.println(stream.toString());
            parser.parse(stream, handler, metadata);
            // return handler.toString();
            text = handler.toString();
            String metaValue = metadata.toString();
            // System.out.println("Desc:: "+metadata.get("description"));

            String[] example = new String[1];
            example[0] = text;
            String name = doc.replace("C:\\Users\\Snehal\\Documents\\TREC-Data\\Data", "polar.usc.edu")
                    .replace("\\", ".");
            List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text);
            JSONObject jsonObj = new JSONObject();
            jsonObj.put("DOI", name);
            jsonObj.put("metadata", metaValue.replaceAll("\\s\\s+|\n|\t", " "));
            JSONArray tempArray = new JSONArray();
            JSONObject tempObj = new JSONObject();
            for (Triple<String, Integer, Integer> item : list) {
                //          String jsonOut="{ DOI:"+name+"  ,"
                //                + ""+item.first() + "\": \"" + text.substring(item.second(), item.third()).replaceAll("\\s\\s+|\n|\t"," ")+"\""
                //                + "\"metadata\":\""+metaValue+"\""
                //                + "}";
                // System.out.println(jsonOut);
                tempObj.put(item.first(),
                        text.substring(item.second(), item.third()).replaceAll("\\s\\s+|\n|\t", " "));
            }
            tempArray.add(tempObj);
            jsonObj.put("NER", tempArray);
            jsonArray.add(jsonObj);
        }
        // System.out.println("---");

    } catch (Exception e) {
        System.out.println("ERROR : CoreNLP" + "|File Name"
                + doc.replaceAll("C:\\Users\\Snehal\\Documents\\TREC-Data", "") + " direct" + e.toString());
    }
}

From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java

public List<String> getPersons(String text) {
    List<String> persons = new ArrayList<>();
    List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text);
    for (Triple<String, Integer, Integer> item : list) {
        if (item.first().compareTo(PERSON) == 0) {
            persons.add(text.substring(item.second(), item.third()));
        }
    }
    return persons;
}

From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java

public List<String> getLocations(String text) {
    List<String> locations = new ArrayList<>();
    List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text);
    for (Triple<String, Integer, Integer> item : list) {
        if (item.first().compareTo(LOCATION) == 0) {
            locations.add(text.substring(item.second(), item.third()));
        }
    }
    return locations;
}

From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java

public List<String> getOrganizations(String text) {
    List<String> organizations = new ArrayList<>();
    List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text);
    for (Triple<String, Integer, Integer> item : list) {
        if (item.first().compareTo(ORGANIZATION) == 0) {
            organizations.add(text.substring(item.second(), item.third()));
        }
    }
    return organizations;
}

From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java

public Map<String, List<String>> getAll(String text) {

    Map<String, List<String>> result = new HashMap<>();
    result.put(PERSON, new LinkedList<String>());
    result.put(ORGANIZATION, new LinkedList<String>());
    result.put(LOCATION, new LinkedList<String>());
    List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text);
    for (Triple<String, Integer, Integer> item : list) {
        if (item.first().compareTo(LOCATION) == 0) {
            result.get(LOCATION).add(text.substring(item.second(), item.third()));
        } else if (item.first().compareTo(ORGANIZATION) == 0) {
            result.get(ORGANIZATION).add(text.substring(item.second(), item.third()));
        } else if (item.first().compareTo(PERSON) == 0) {
            result.get(PERSON).add(text.substring(item.second(), item.third()));
        }
    }
    return result;
}

From source file:es.dmr.flink.nlp.StanfordNLPCoreExtractor.java

public List<Triple<String, Integer, Integer>> getPersonMarkers(String text) {
    List<Triple<String, Integer, Integer>> personsOnlyList = new ArrayList<>();
    List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(text);
    for (Triple<String, Integer, Integer> item : list) {
        if (item.first().compareTo(PERSON) == 0) {
            String name = text.substring(item.second(), item.third());
            personsOnlyList.add(new Triple<>(name, item.second(), item.third()));
        }
    }
    return personsOnlyList;
}

From source file:fire.NERDemo.java

public static void main(String[] args) throws Exception {

    String serializedClassifier = "C:\\Users\\DIPANAKR\\Desktop\\Satanu\\fire\\stanford-ner-2015-04-20\\stanford-ner-2015-04-20\\classifiers\\english.all.3class.distsim.crf.ser.gz";

    if (args.length > 0) {
        serializedClassifier = args[0];
    }

    AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(serializedClassifier);

    /* For either a file to annotate or for the hardcoded text example, this
       demo file shows several ways to process the input, for teaching purposes.
    */

    if (args.length > 1) {

        /* For the file, it shows (1) how to run NER on a String, (2) how
           to get the entities in the String with character offsets, and
           (3) how to run NER on a whole file (without loading it into a String).
        */

        String fileContents = IOUtils.slurpFile(args[1]);
        List<List<CoreLabel>> out = classifier.classify(fileContents);
        for (List<CoreLabel> sentence : out) {
            for (CoreLabel word : sentence) {
                System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
            }
            System.out.println();
        }

        System.out.println("---");
        out = classifier.classifyFile(args[1]);
        for (List<CoreLabel> sentence : out) {
            for (CoreLabel word : sentence) {
                System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
            }
            System.out.println();
        }

        System.out.println("---");
        List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(fileContents);
        for (Triple<String, Integer, Integer> item : list) {
            System.out.println(item.first() + ": " + fileContents.substring(item.second(), item.third()));
        }
        System.out.println("---");
        System.out.println("Ten best entity labelings");
        DocumentReaderAndWriter<CoreLabel> readerAndWriter = classifier.makePlainTextReaderAndWriter();
        classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);

        System.out.println("---");
        System.out.println("Per-token marginalized probabilities");
        classifier.printProbs(args[1], readerAndWriter);

        // -- This code prints out the first order (token pair) clique probabilities.
        // -- But that output is a bit overwhelming, so we leave it commented out by default.
        // System.out.println("---");
        // System.out.println("First Order Clique Probabilities");
        // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);

    } else {

        /* For the hard-coded String, it shows how to run it on a single
           sentence, and how to do this and produce several formats, including
           slash tags and an inline XML output format. It also shows the full
           contents of the {@code CoreLabel}s that are constructed by the
           classifier. And it shows getting out the probabilities of different
           assignments and an n-best list of classifications with probabilities.
        */

        String[] example = { "Good afternoon Rajat Raina, how are you today?",
                "I go to school at Stanford University, which is located in California." };
        for (String str : example) {
            System.out.println(classifier.classifyToString(str));
        }
        System.out.println("---");

        for (String str : example) {
            // This one puts in spaces and newlines between tokens, so just print not println.
            System.out.print(classifier.classifyToString(str, "slashTags", false));
        }
        System.out.println("---");

        for (String str : example) {
            // This one is best for dealing with the output as a TSV (tab-separated column) file.
            // The first column gives entities, the second their classes, and the third the remaining text in a document
            System.out.print(classifier.classifyToString(str, "tabbedEntities", false));
        }
        System.out.println("---");

        for (String str : example) {
            System.out.println(classifier.classifyWithInlineXML(str));
        }
        System.out.println("---");

        for (String str : example) {
            System.out.println(classifier.classifyToString(str, "xml", true));
        }
        System.out.println("---");

        for (String str : example) {
            System.out.print(classifier.classifyToString(str, "tsv", false));
        }
        System.out.println("---");

        // This gets out entities with character offsets
        int j = 0;
        for (String str : example) {
            j++;
            List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(str);
            for (Triple<String, Integer, Integer> trip : triples) {
                System.out.printf("%s over character offsets [%d, %d) in sentence %d.%n", trip.first(),
                        trip.second(), trip.third(), j);
            }
        }
        System.out.println("---");

        // This prints out all the details of what is stored for each token
        int i = 0;
        for (String str : example) {
            for (List<CoreLabel> lcl : classifier.classify(str)) {
                for (CoreLabel cl : lcl) {
                    System.out.print(i++ + ": ");
                    System.out.println(cl.toShorterString());
                }
            }
        }

        System.out.println("---");

    }
}

From source file:org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnescorer.StanfordNlpNeScorerNodeModel.java

License:Open Source License

/**
 * {@inheritDoc}
 */
@Override
protected PortObject[] execute(final PortObject[] inObjects, final ExecutionContext exec) throws Exception {

    m_inputModelPortObject = (StanfordNERModelPortObject) inObjects[1];
    m_inputModel = m_inputModelPortObject.getNERModel();
    m_usedDict = m_inputModelPortObject.getDictSet();
    m_tag = m_inputModelPortObject.getTag();
    m_tokenizerName = m_inputModelPortObject.getTokenizerName();

    //create a BufferedDataContainer for the scoring values
    BufferedDataContainer accTable = exec.createDataContainer(new DataTableSpec(QUALITY_MEASURES_SPECS));

    // build pattern set from dictionary
    DataTableSpec docTableSpec = (DataTableSpec) inObjects[0].getSpec();
    BufferedDataTable docDataInput = (BufferedDataTable) inObjects[0];
    Set<Pattern> knownEntitiesPatternSet = new LinkedHashSet<Pattern>();
    for (String word : m_usedDict) {
        knownEntitiesPatternSet.add(Pattern.compile(word));
    }

    // create dictionary tagger to tag the input documents with the dictionary used for building the model
    MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet, m_tag,
            true, m_tokenizerName);

    // create UUID to add them to the file path to avoid cases where two instances of the node model used the same file path at the same time
    String tempDir = KNIMEConstants.getKNIMETempDir() + "/";
    String m_annotatedTestFilePath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv";

    // create the annotated test file
    File m_annotatedTestFile = new File(m_annotatedTestFilePath);
    PrintWriter sentenceFileWriter = new PrintWriter(m_annotatedTestFile, "UTF-8");

    int missingValueCounter = 0;

    // tag documents and transform sentences to strings while tagged terms get StanfordNLP annotation
    // iterate through columns
    for (int i = 0; i < docTableSpec.getNumColumns(); i++) {
        // iterate through rows if column with correct name has been found
        if (docTableSpec.getColumnSpec(i).getName().equals(m_docColumnModel.getStringValue())) {
            int counter = 0;
            Set<String> countMultiWordTerms = new HashSet<String>();
            for (DataRow row : docDataInput) {
                //set progress bar
                counter++;
                double progress = (counter / (double) docDataInput.size()) / (3.0);
                exec.setProgress(progress, "Preparing documents for validation");
                exec.checkCanceled();

                if (!row.getCell(i).isMissing() && row.getCell(i).getType().isCompatible(DocumentValue.class)) {
                    Document doc = ((DocumentValue) row.getCell(i)).getDocument();
                    Document taggedDoc = tagger.tag(doc);
                    Iterator<Sentence> si = taggedDoc.sentenceIterator();
                    while (si.hasNext()) {
                        Sentence s = si.next();
                        List<Term> termList = s.getTerms();
                        Iterator<Term> ti = termList.iterator();
                        while (ti.hasNext()) {
                            Term t = ti.next();
                            String termText = t.getText();
                            String termTextWithWsSuffix = t.getTextWithWsSuffix();
                            if (m_usedDict.contains(termText) || m_usedDict.contains(termTextWithWsSuffix)) {
                                if (t.getWords().size() > 1) {
                                    // multi-word terms should not be written in one line in the training file
                                    countMultiWordTerms.add(t.getText());

                                    // so skip it by splitting the term and writing each word in one line
                                    for (Word w : t.getWords()) {
                                        sentenceFileWriter.println(w.getText() + "\t" + m_tag.getTagValue());
                                    }
                                } else {
                                    sentenceFileWriter.println(termText + "\t" + m_tag.getTagValue());
                                }
                            } else {
                                sentenceFileWriter.println(termText + "\tO");
                            }
                        }
                    }
                } else {
                    missingValueCounter++;
                }
            }
        }
    }

    if (missingValueCounter == 1) {
        setWarningMessage(missingValueCounter + " row has been ignored due to missing value.");
    } else if (missingValueCounter > 1) {
        setWarningMessage(missingValueCounter + " rows have been ignored due to missing values.");
    }

    sentenceFileWriter.close();

    exec.setProgress(0.5, "Validate model");
    // create logger configuration and catch the scores which will be printed to the log file
    File tmpLogFile = new File(KNIMEConstants.getKNIMETempDir() + "/scores.log");
    RedwoodConfiguration conf = RedwoodConfiguration.empty();
    conf.handlers(Handlers.chain(Handlers.hideDebug, Handlers.file(tmpLogFile))).apply();

    // classify the documents with our model
    DocumentReaderAndWriter<CoreLabel> raw = m_inputModel.makeReaderAndWriter();
    Triple<Double, Double, Double> prfScores = m_inputModel.classifyAndWriteAnswers(m_annotatedTestFilePath,
            new ByteArrayOutputStream(), raw, true);

    DataRow stats = new DefaultRow(new RowKey("Row0"),
            new DataCell[] { DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell(),
                    DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell() });

    ReversedLinesFileReader logReader = new ReversedLinesFileReader(tmpLogFile, StandardCharsets.UTF_8);

    try {
        // get values from output stream
        String[] scores = logReader.readLine().split("\t");
        if (scores.length >= 7) {
            Double precision = prfScores.first() / 100;
            Double recall = prfScores.second() / 100;
            Double f1 = prfScores.third() / 100;
            int tp = Integer.parseInt(scores[4].trim());
            int fp = Integer.parseInt(scores[5].trim());
            int fn = Integer.parseInt(scores[6].trim());
            // create the scores row and add it to the BufferedDataContainer we created in the beginning
            stats = new DefaultRow(new RowKey("Row0"),
                    new DataCell[] { new DoubleCell(precision), new DoubleCell(recall), new DoubleCell(f1),
                            new IntCell(tp), new IntCell(fp), new IntCell(fn) });
            if (tp == 0 && fp == 0 && fn == 0 && precision == 0 && recall == 1 && f1 == 0) {
                setWarningMessage("Could not parse quality measures of model validation.");
            }
        }
    } catch (NumberFormatException e) {
        setWarningMessage("Could not parse quality measures of model validation.");
    } finally {
        logReader.close();
        tmpLogFile.delete();
        m_annotatedTestFile.delete();
    }
    accTable.addRowToTable(stats);

    accTable.close();

    return new BufferedDataTable[] { accTable.getTable() };
}

From source file:org.wso2.toolbox.nlp.CountryFunctionExecutor.java

License:Open Source License

public String recognizeEntity(String locationStr) {

    String countryCode = null;

    if (locationStr != null) {

        String locationUpperStr = locationStr.toUpperCase();

        List<Triple<String, Integer, Integer>> items = countryCodeTable.getClassifier()
                .classifyToCharacterOffsets(locationUpperStr);

        for (Triple<String, Integer, Integer> item : items) {
            if ("Location".equalsIgnoreCase(item.first())) {
                String countryName = locationUpperStr.substring(item.second(), item.third());

                if (countryCodeTable.getCountryCodeList().contains(countryName)) {
                    countryCode = countryName;
                } else {
                    countryCode = (String) countryCodeTable.getCountryCode(countryName);
                }
            }
            if (countryCode != null) {
                break;
            }
        }
    }

    if (countryCode != null) {
        return countryCode;
    } else {
        return "";
    }
}