gr.demokritos.iit.textforms.json.Tweet.java Source code

Introduction

Here is the source code for gr.demokritos.iit.textforms.json.Tweet.java
Source

package gr.demokritos.iit.textforms.json;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonSyntaxException;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.commons.lang3.StringUtils;

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
/**
 * Model for the tweet, comes included with cleaning functions
 *
 * @author Gregory
 */
public class Tweet extends JsonForm {
    // Labels of fields we want in json
    public static final String ID_LABEL = "id_str";
    public static final String TEXT_LABEL = "text";
    public static final String CREATED_LABEL = "created_at";
    public static final String LANG_LABEL = "lang";
    public static final String ENTITIES_LABEL = "entities";
    public static final String HASHTAGS_LABEL = "hashtags";
    public static final String URLS_LABEL = "urls";
    public static final String MENTIONS_LABEL = "user_mentions";
    public static final String NAME_LABEL = "name";
    public static final String EXPURL_LABEL = "expanded_url";

    private ArrayList<String> mentions;
    private ArrayList<String> hashtags;
    private ArrayList<String> urls;
    private String lang;

    public Tweet(String id, String text, String date_created, String lang, JsonObject json) {
        super(id, text, date_created, json);
        this.lang = lang;
        this.mentions = new ArrayList();
        this.hashtags = new ArrayList();
        this.urls = new ArrayList();
    }

    public Tweet(String id, String text, String date_created, String lang, JsonObject json,
            ArrayList<String> mentions, ArrayList<String> hashtags, ArrayList<String> urls) {
        super(id, text, date_created, json);
        this.lang = lang;
        this.mentions = mentions;
        this.hashtags = hashtags;
        this.urls = urls;
    }

    /**
     * Pretty much self explanatory, print the tweet
     */
    public void prettyPrint() {
        System.out.println("tweetId: " + id);
        System.out.println("text: " + text);
        System.out.println("created_at: " + date_created);
        System.out.println("lang: " + lang);
        System.out.println("hashtags: " + hashtags.toString());
        System.out.println("mentions: " + mentions.toString());
        System.out.println("urls: " + urls.toString());
    }

    /**
     * Get a new instance of this class from a json representation of it.
     * @param json a json string containing the class attributes.
     * @return a new instance of Tweet.
     */
    public static Tweet fromFormat(String json) throws InvalidJsonException {
        try {
            JsonElement json_element = new JsonParser().parse(json);
            JsonObject json_object = json_element.getAsJsonObject();
            String id = json_object.get(ID_LABEL).getAsString();
            String text = json_object.get(TEXT_LABEL).getAsString();
            String date_created = json_object.get(CREATED_LABEL).getAsString();
            String lang = json_object.get(LANG_LABEL).getAsString();
            // populate tags stuff
            JsonObject entities = json_object.get(ENTITIES_LABEL).getAsJsonObject();
            // ---------------Start--------------------
            // added below to parse the change from array to dictionary
            // with indices
            // JsonObject mention_object = entities.get(MENTIONS_LABEL).getAsJsonObject();
            // ArrayList<String> mentions = new ArrayList();
            // int index = 0;
            // while we find indexes in mentions, continue
            // while(true){
            // String index_value = String.valueOf(index);
            // if(mention_object.has(index_value)){
            // JsonObject mention = mention_object.getAsJsonObject(index_value);
            // mentions.add(mention.get(NAME_LABEL).getAsString());
            // }
            // else{
            // break;
            // }
            // index++;
            // }
            // -------------------End----------------------------
            // Maybe below wasn't deprecated after all - replace from -Start- if
            // discovered otherwise
            // deprecated - mentions is no longer a list, it is a dictionary with indices
            ArrayList<String> mentions = extractInner(entities, MENTIONS_LABEL, NAME_LABEL);
            ArrayList<String> hashtags = extractInner(entities, HASHTAGS_LABEL, TEXT_LABEL);
            ArrayList<String> urls = extractInner(entities, URLS_LABEL, EXPURL_LABEL);
            return new Tweet(id, text, date_created, lang, json_object, mentions, hashtags, urls);
        } catch (NullPointerException | JsonSyntaxException e) {
            e.printStackTrace();
            throw new InvalidJsonException();
        }
    }

    /**
     * Helper function
     * @param entities
     * @param array_label
     * @param inner_label
     * @return 
     */
    private static ArrayList<String> extractInner(JsonObject entities, String array_label, String inner_label) {
        ArrayList<String> extracted = new ArrayList();
        JsonArray extract_array = entities.get(array_label).getAsJsonArray();
        for (JsonElement el : extract_array) {
            JsonObject extract = el.getAsJsonObject();
            extracted.add(extract.get(inner_label).toString());
        }
        return extracted;

    }

    @Override
    // TODO: decouple sentence split and cleaning - this used to be called 
    // cleanSentences.
    public ArrayList<String> sentenceSplit() {
        removeHTML();
        String[] tokens = tokenizeSentence();
        swapSymbols(tokens);
        // remove consecutive tags from the beginning
        cleanHead(tokens);
        // remove consecutive tags from the end
        cleanTail(tokens);
        // change tags in body for better ner
        cleanBody(tokens);
        return rebuildSentences(tokens);
    }

    private void swapSymbols(String[] tokens) {
        Iterator ith = this.hashtags.iterator();
        Iterator itm = this.mentions.iterator();

        for (int i = 0; i < tokens.length; i++) {
            if (tokens[i].startsWith("@") && itm.hasNext()) {
                tokens[i] = "@" + StringUtils.capitalize(itm.next().toString());
            } else if (tokens[i].startsWith("#") && ith.hasNext()) {
                tokens[i] = "#" + StringUtils.capitalize(ith.next().toString());
            }
        }
    }

    /**
     * Remove consecutive tags at the beginning of the sentence. Stops when no 
     * tag is found so it doesn't necessarily run for all the text.
     * @param tokens The "words" of the sentence
     */
    private void cleanHead(String[] tokens) { //Removing from the start
        for (int i = 0; i < tokens.length - 1; i++) {
            if (tokens[i].startsWith("#")) {
                tokens[i] = "";
            } else if (tokens[i].startsWith("@")) {
                //Don't touch
            } else if (tokens[i].matches("http.*")) {
                tokens[i] = "";
            } else if (tokens[i].matches("(RT)")) {
                tokens[i] = ""; // It is always followed by an @ mention
                if (i + 1 < tokens.length - 1) {
                    tokens[i + 1] = "";
                    i++;
                }
            } else {
                break;
            }
        }
    }

    /**
     * Remove consecutive tags from the end of the sentence. Stops when no
     * tag is found, so it doesn't necessarily run for all the text.
     * @param tokens The "words" of the sentence
     */
    private void cleanTail(String[] tokens) {
        for (int i = tokens.length - 1; i > 0; i--) {
            if (tokens[i].startsWith("#")) {
                tokens[i] = "";
            } else if (tokens[i].startsWith("@")) {
                //Don't touch
            } else if (tokens[i].matches("http.*")) {
                tokens[i] = "";
            } else {
                break;
            }
        }
    }

    /**
     * Replace the tags in the text with a nicer form for
     * Named Entity Recognition.
     * @param tokens The "words" of the sentence
     */
    private void cleanBody(String[] tokens) {
        for (int i = 0; i < tokens.length; i++) {
            if (tokens[i].startsWith("@")) {
                tokens[i] = removeCamelCase(tokens[i].replace("@", "")).trim();
            } else if (tokens[i].startsWith("#")) {
                tokens[i] = removeCamelCase(tokens[i].replace("#", "")).trim();
            } else if (tokens[i].startsWith("!") || tokens[i].startsWith("?")) {
                tokens[i] = tokens[i].replaceAll("!+", "");
                tokens[i] = tokens[i].replaceAll("\\?+", "");
                if (i - 1 >= 0) {
                    tokens[i - 1] = tokens[i - 1] + ".";
                }
            } else if (tokens[i].matches("http.*")) {
                tokens[i] = "";
            }
        }
    }
}