edu.usc.ir.visualization.NLTKandCoreNLP.java Source code


Introduction

Here is the source code for edu.usc.ir.visualization.NLTKandCoreNLP.java. The class queries a Solr index, runs each document's content through Tika's NLTK-backed named-entity recogniser, tallies those entities alongside the entity fields already stored in the index (the CoreNLP side of the comparison), and writes the combined counts to a JSON file for visualization.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.usc.ir.visualization;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;

import javax.ws.rs.core.MediaType;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ner.NamedEntityParser;
import org.apache.tika.parser.ner.nltk.NLTKNERecogniser;
import org.xml.sax.SAXException;

import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class NLTKandCoreNLP {

    private HashSet<String> freq;
    private HashMap<String, Integer> nltk;
    private HashMap<String, Integer> nlp;
    private JsonNode datasetElement;
    private Tika tika;
    private Metadata md;
    private ObjectMapper mapper;

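    /**
     * Configures Tika to use the NLTK-backed named-entity recogniser and loads a
     * tika-config.xml resource from this package, which is expected to register
     * {@link NamedEntityParser} for the content being parsed.
     */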
    public NLTKandCoreNLP() {

        freq = new HashSet<String>();
        nltk = new HashMap<String, Integer>();
        nlp = new HashMap<String, Integer>();
        tika = null;
        datasetElement = null;
        mapper = new ObjectMapper();
        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
        try {
            tika = new Tika(new TikaConfig(NLTKandCoreNLP.class.getResourceAsStream("tika-config.xml")));
        } catch (TikaException | IOException | SAXException e) {
            System.out.println("Could not load Tika");
            e.printStackTrace();
        }
    }

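    /**
     * Expects four arguments: the Solr/Memex base URL, a username, a password, and
     * an output directory. Counts entities from both recognisers, then writes the
     * comparison JSON into the output directory.
     */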
    public static void main(String[] args) throws JsonParseException, JsonMappingException, IOException {

        String memexUrl = args[0];
        String username = args[1];
        String password = args[2];
        File destination = new File(args[3]);
        NLTKandCoreNLP obj = new NLTKandCoreNLP();
        obj.countNER(memexUrl, username, password);
        obj.createJSON(destination);

    }

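    /**
     * Pages through the first 200 results of a Solr query (q=gunsamerica, two pages
     * of 100 rows). For every document, the stored entity fields (locations, dates,
     * organizations, persons), presumably produced by CoreNLP, are tallied into
     * {@code nlp} via {@link #extract(String)}, while the raw content is re-parsed
     * with Tika's NLTK recogniser and the resulting NER_NAMES metadata values are
     * tallied into {@code nltk}.
     */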
    private void countNER(String memexUrl, String username, String password)
            throws JsonParseException, JsonMappingException, IOException {

        JsonNode node;
        JsonNode dataset = null;
        String url;
        String response;

        for (int c = 0; c < 101; c += 100) {

            url = memexUrl + "/select?q=gunsamerica&start=" + c
                    + "&rows=100&fl=content%2Corganizations%2Cpersons%2Cdates%2Clocations&wt=json&indent=true";
            response = WebClient.create(url, username, password, null).accept(MediaType.APPLICATION_JSON).get()
                    .readEntity(String.class);

            try {
                node = mapper.readTree(response);
                dataset = node.get("response").get("docs");
            } catch (IOException e) {
                // JsonProcessingException is a subclass of IOException, so one catch
                // covers both; skip this page rather than iterating a null dataset.
                e.printStackTrace();
                continue;
            }

            Iterator<JsonNode> datasetElements = dataset.iterator();

            while (datasetElements.hasNext()) {
                datasetElement = datasetElements.next();
                String content = datasetElement.get("content").asText();
                md = new Metadata();
                try (InputStream stream = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8))) {
                    tika.parse(stream, md);
                } catch (IOException e) {
                    e.printStackTrace();
                }

                if (datasetElement.has("locations")) {
                    extract("locations");
                }

                if (datasetElement.has("dates")) {
                    extract("dates");
                }

                if (datasetElement.has("organizations")) {
                    extract("organizations");
                }

                if (datasetElement.has("persons")) {
                    extract("persons");
                }

                for (String nerName : Arrays.asList(md.getValues("NER_NAMES"))) {
                    freq.add(nerName);
                    nltk.put(nerName, nltk.containsKey(nerName) ? nltk.get(nerName) + 1 : 1);
                }
            }
        }
    }

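    /**
     * Reads the given entity field of the current document (a JSON array of names)
     * and increments the per-name count in {@code nlp}, registering each name in
     * {@code freq}.
     */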
    private void extract(String ner) throws JsonParseException, JsonMappingException, IOException {
        String[] names = mapper.readValue(datasetElement.get(ner).toString(), String[].class);
        for (String name : names) {
            freq.add(name);
            nlp.put(name, nlp.containsKey(name) ? nlp.get(name) + 1 : 1);
        }
    }

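    /**
     * Assigns each entity name an overlap weight, sorts the names by descending
     * weight, and writes the labels together with the "nltk" and "nlp" count series
     * to nltk_vs_corenlp.json in the destination directory.
     */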
    private void createJSON(File destination) throws JsonGenerationException, JsonMappingException, IOException {
        ArrayList<Names> frequencies = new ArrayList<Names>();
        for (String value : freq) {
            int x = nltk.containsKey(value) ? nltk.get(value) : 0;
            int y = nlp.containsKey(value) ? nlp.get(value) : 0;
            // z = x + y - |x - y| is 2 * min(x, y): twice the count on which both
            // recognisers agree. Names found only by NLTK keep weight 0, and names
            // found only in the stored entity fields get a negative weight, so
            // sorting by descending weight puts the overlapping names first.
            int z = x + y - Math.abs(x - y);
            if (z == 0) {
                z = x > y ? 0 : -y;
            }
            frequencies.add(new Names(value, z));
        }

        Collections.sort(frequencies, maximumOverlap);
        ArrayList<String> final_labels = new ArrayList<String>();
        ArrayList<Integer> nltk_value = new ArrayList<Integer>();
        ArrayList<Integer> nlp_value = new ArrayList<Integer>();
        for (Names entry : frequencies) {
            final_labels.add(entry.name);
            nltk_value.add(nltk.containsKey(entry.name) ? nltk.get(entry.name) : 0);
            nlp_value.add(nlp.containsKey(entry.name) ? nlp.get(entry.name) : 0);
        }

        Series[] s = { new Series("nltk", nltk_value), new Series("nlp", nlp_value) };
        Labels labels = new Labels(final_labels, s);
        destination = new File(destination, "nltk_vs_corenlp.json");
        mapper.writerWithDefaultPrettyPrinter().writeValue(destination, labels);
        System.out.println("Json ready for Visualization: " + destination.getAbsolutePath());
    }

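    /** Orders names by descending overlap weight, so shared entities sort first. */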
    public static Comparator<Names> maximumOverlap = new Comparator<Names>() {
        public int compare(Names one, Names two) {
            return Integer.compare((int) two.strength, (int) one.strength);
        }
    };

}
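
Notes

The class is driven from the command line with four arguments: the Solr/Memex base URL, a username, a password, and an output directory; it writes nltk_vs_corenlp.json into that directory. The constructor also expects a tika-config.xml resource in the edu/usc/ir/visualization package (loaded via getResourceAsStream) that wires up Tika's NamedEntityParser; that file is not part of this listing.

The listing references three small helper beans, Names, Series, and Labels, which are not shown above. The sketch below is only an illustration of the shape they would need for the field access in createJSON() and the Jackson serialization to work; the field names are assumptions, not the original USC source.

// Hypothetical helper beans; field names are assumptions based on how the
// main listing uses them, not the original source.
class Names {
    public final String name;     // entity text, read directly in createJSON()
    public final int strength;    // overlap weight used by the maximumOverlap comparator
    public Names(String name, int strength) {
        this.name = name;
        this.strength = strength;
    }
}

class Series {
    public final String key;                      // series label, e.g. "nltk" or "nlp"
    public final java.util.List<Integer> values;  // per-entity counts, aligned with the labels
    public Series(String key, java.util.List<Integer> values) {
        this.key = key;
        this.values = values;
    }
}

class Labels {
    public final java.util.List<String> labels;   // entity names, sorted by overlap
    public final Series[] series;                 // one series per recogniser
    public Labels(java.util.List<String> labels, Series[] series) {
        this.labels = labels;
        this.series = series;
    }
}

Jackson picks up public fields by default, so writerWithDefaultPrettyPrinter().writeValue(destination, labels) would produce a JSON object with a "labels" array and a "series" array under this sketch.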