Example usage for edu.stanford.nlp.ie AbstractSequenceClassifier classifyToCharacterOffsets

List of usage examples for edu.stanford.nlp.ie AbstractSequenceClassifier classifyToCharacterOffsets

Introduction

On this page you can find example usages of edu.stanford.nlp.ie.AbstractSequenceClassifier.classifyToCharacterOffsets.

Prototype

public List<Triple<String, Integer, Integer>> classifyToCharacterOffsets(String sentences) 

Source Link

Document

Classify the contents of a String to classified character offset spans.

Usage

From source file: com.bericotech.clavin.nerd.WorkflowDemoNERD.java

License: Open Source License

/**
 * Sometimes, you might already be using Stanford NER elsewhere in
 * your application, and you'd like to just pass the output from
 * Stanford NER directly into CLAVIN, without having to re-run the
 * input through Stanford NER just to use CLAVIN. This example
 * shows you how to very easily do exactly that.
 *
 * <p>The steps are: load the classifier properties from the classpath,
 * build the Stanford NER classifier, extract entity character offsets
 * from a sample document, convert them to CLAVIN location occurrences,
 * resolve them against the Lucene gazetteer, and print each result to
 * standard output.
 *
 * @throws IOException if the classifier properties resource is missing
 *         from the classpath or cannot be read
 * @throws ClavinException declared for the CLAVIN calls made here
 *         (presumably gazetteer/resolution failures -- confirm against CLAVIN)
 */
private static void resolveStanfordEntities() throws IOException, ClavinException {

    /*#####################################################################
     *
     * Start with Stanford NER -- no need to get CLAVIN involved for now.
     *
     *###################################################################*/

    // instantiate Stanford NER entity extractor
    final String propsResource = "models/english.all.3class.distsim.prop";
    Properties mp = new Properties();
    // try-with-resources guarantees the classpath stream is closed even if load() fails
    try (InputStream mpis = WorkflowDemoNERD.class.getClassLoader().getResourceAsStream(propsResource)) {
        if (mpis == null) {
            // getResourceAsStream signals "not found" with null; fail with a clear message
            // instead of an opaque NullPointerException inside Properties.load
            throw new IOException("Classifier properties not found on classpath: " + propsResource);
        }
        mp.load(mpis);
    }
    AbstractSequenceClassifier<CoreMap> namedEntityRecognizer = CRFClassifier
            .getJarClassifier("/models/english.all.3class.distsim.crf.ser.gz", mp);

    // Unstructured text file about Somalia to be geoparsed
    File inputFile = new File("src/test/resources/sample-docs/Somalia-doc.txt");

    // Grab the contents of the text file as a String
    String inputString = TextUtils.fileToString(inputFile);

    // extract entities from input text using Stanford NER
    List<Triple<String, Integer, Integer>> entitiesFromNER = namedEntityRecognizer
            .classifyToCharacterOffsets(inputString);

    /*#####################################################################
     *
     * Now, CLAVIN comes into play...
     *
     *###################################################################*/

    // convert Stanford NER output to ClavinLocationResolver input
    List<LocationOccurrence> locationsForCLAVIN = convertNERtoCLAVIN(entitiesFromNER, inputString);

    // instantiate the CLAVIN location resolver
    ClavinLocationResolver clavinLocationResolver = new ClavinLocationResolver(
            new LuceneGazetteer(new File("./IndexDirectory")));

    // resolve location entities extracted from input text
    // NOTE(review): the (1, 1, false) arguments are presumably hit depth, context
    // window and fuzzy matching -- confirm against ClavinLocationResolver.
    List<ResolvedLocation> resolvedLocations = clavinLocationResolver.resolveLocations(locationsForCLAVIN, 1, 1,
            false);

    // Display the ResolvedLocations found for the location names
    for (ResolvedLocation resolvedLocation : resolvedLocations) {
        System.out.println(resolvedLocation);
    }
}

From source file: fire.NERDemo.java

/**
 * Demonstrates several ways of running a Stanford NER sequence classifier.
 *
 * <p>With two arguments the named file is classified and the results are
 * printed as slash-tagged tokens (from a String and directly from the file),
 * as entities with character offsets, as the ten best labelings, and as
 * per-token probabilities. Otherwise two hard-coded example sentences are
 * classified and printed in a range of output formats.
 *
 * @param args optional; {@code args[0]} overrides the serialized classifier
 *             path, {@code args[1]} names a file to annotate
 * @throws Exception propagated from classifier loading, file reading or
 *                   classification
 */
public static void main(String[] args) throws Exception {

    // NOTE(review): this default is a machine-specific absolute Windows path;
    // on any other machine the classifier location must be passed as args[0].
    String serializedClassifier = "C:\\Users\\DIPANAKR\\Desktop\\Satanu\\fire\\stanford-ner-2015-04-20\\stanford-ner-2015-04-20\\classifiers\\english.all.3class.distsim.crf.ser.gz";

    if (args.length > 0) {
        serializedClassifier = args[0];
    }

    AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(serializedClassifier);

    /* For either a file to annotate or for the hardcoded text example, this
       demo file shows several ways to process the input, for teaching purposes.
    */

    if (args.length > 1) {

        /* For the file, it shows (1) how to run NER on a String, (2) how
           to get the entities in the String with character offsets, and
           (3) how to run NER on a whole file (without loading it into a String).
        */

        String fileContents = IOUtils.slurpFile(args[1]);

        // (1) NER over the file contents held in a String
        printTaggedSentences(classifier.classify(fileContents));

        System.out.println("---");
        // (3) NER straight from the file, without loading it into a String first
        printTaggedSentences(classifier.classifyFile(args[1]));

        System.out.println("---");
        // (2) entities with their [start, end) character offsets into fileContents
        List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(fileContents);
        for (Triple<String, Integer, Integer> item : list) {
            System.out.println(item.first() + ": " + fileContents.substring(item.second(), item.third()));
        }
        System.out.println("---");
        System.out.println("Ten best entity labelings");
        DocumentReaderAndWriter<CoreLabel> readerAndWriter = classifier.makePlainTextReaderAndWriter();
        classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);

        System.out.println("---");
        System.out.println("Per-token marginalized probabilities");
        classifier.printProbs(args[1], readerAndWriter);

        // -- This code prints out the first order (token pair) clique probabilities.
        // -- But that output is a bit overwhelming, so we leave it commented out by default.
        // System.out.println("---");
        // System.out.println("First Order Clique Probabilities");
        // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);

    } else {

        /* For the hard-coded String, it shows how to run it on a single
           sentence, and how to do this and produce several formats, including
           slash tags and an inline XML output format. It also shows the full
           contents of the {@code CoreLabel}s that are constructed by the
           classifier. And it shows getting out the probabilities of different
           assignments and an n-best list of classifications with probabilities.
        */

        String[] example = { "Good afternoon Rajat Raina, how are you today?",
                "I go to school at Stanford University, which is located in California." };
        for (String str : example) {
            System.out.println(classifier.classifyToString(str));
        }
        System.out.println("---");

        for (String str : example) {
            // This one puts in spaces and newlines between tokens, so just print not println.
            System.out.print(classifier.classifyToString(str, "slashTags", false));
        }
        System.out.println("---");

        for (String str : example) {
            // This one is best for dealing with the output as a TSV (tab-separated column) file.
            // The first column gives entities, the second their classes, and the third the remaining text in a document
            System.out.print(classifier.classifyToString(str, "tabbedEntities", false));
        }
        System.out.println("---");

        for (String str : example) {
            System.out.println(classifier.classifyWithInlineXML(str));
        }
        System.out.println("---");

        for (String str : example) {
            System.out.println(classifier.classifyToString(str, "xml", true));
        }
        System.out.println("---");

        for (String str : example) {
            System.out.print(classifier.classifyToString(str, "tsv", false));
        }
        System.out.println("---");

        // This gets out entities with character offsets
        int j = 0;
        for (String str : example) {
            j++; // 1-based sentence number used in the report line
            List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(str);
            for (Triple<String, Integer, Integer> trip : triples) {
                // use the accessor for the third element, consistent with first()/second()
                System.out.printf("%s over character offsets [%d, %d) in sentence %d.%n", trip.first(),
                        trip.second(), trip.third(), j);
            }
        }
        System.out.println("---");

        // This prints out all the details of what is stored for each token
        int i = 0; // running token index across both example sentences
        for (String str : example) {
            for (List<CoreLabel> lcl : classifier.classify(str)) {
                for (CoreLabel cl : lcl) {
                    System.out.print(i++ + ": ");
                    System.out.println(cl.toShorterString());
                }
            }
        }

        System.out.println("---");

    }
}

/**
 * Prints each sentence on its own line, every token rendered as
 * {@code word/answer} followed by a single space, where {@code answer} is the
 * token's {@code CoreAnnotations.AnswerAnnotation} value.
 *
 * @param sentences classified sentences, each a list of tokens
 */
private static void printTaggedSentences(List<List<CoreLabel>> sentences) {
    for (List<CoreLabel> sentence : sentences) {
        for (CoreLabel word : sentence) {
            System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
    }
}