uk.ac.susx.tag.method51.text.Tokeniser.java Source code

Introduction

Here is the source code for uk.ac.susx.tag.method51.text.Tokeniser.java. The class tokenises tweet text with CMU ArkTweetNLP's Twokenize, with configurable lowercasing, stop-word filtering, URL-case preservation, and hashtag, mention and punctuation stripping.

Source

package uk.ac.susx.tag.method51.text;

/*
 * #%L
 * Tokeniser.java - method51 - University of Sussex - 2013
 * %%
 * Copyright (C) 2013 - 2014 University of Sussex
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import cmu.arktweetnlp.Twokenize;
import com.google.common.io.Resources;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import uk.ac.susx.mlcl.lib.io.Files;
import uk.ac.susx.tag.method51.core.params.Params;
import uk.ac.susx.tag.method51.twitter.params.PathParms;

import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Tokenises tweet text using CMU ArkTweetNLP's {@link Twokenize}, with
 * configurable lowercasing, stop-word filtering, URL-case preservation and
 * hashtag, mention and punctuation stripping.
 *
 * Created by sw206 on 13/06/2013.
 */
public class Tokeniser {

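    // Matches tokens made up entirely of ASCII punctuation
    // (U+0021-U+002F, U+003A-U+0040, U+005B-U+0060, U+007B-U+007E) or
    // characters from the Unicode General Punctuation block (U+2000-U+206F).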
    private static final Pattern punctuationPattern = Pattern
            .compile("^[\\u0021-\\u002f\\u003a-\\u0040\\u005b-\\u0060\\u007b-\\u007e\\u2000-\\u206f]+$");
    private static final Logger LOG = LoggerFactory.getLogger(Tokeniser.class);
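    // Weak cache from Document to its token list: repeated tokenisation of a
    // document is avoided without pinning documents in memory.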
    private static final Map<Document, List<String>> tokenisationCache = Collections
            .synchronizedMap(new WeakHashMap<Document, List<String>>());
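    // Global interning tables: vocab maps token -> index and bacov ("vocab"
    // reversed) maps index -> token.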
    private static final Object2IntMap<String> vocab = new Object2IntOpenHashMap<>();
    private static final Int2ObjectMap<String> bacov = new Int2ObjectOpenHashMap<>();
    private final boolean removeHashes;
    private final boolean removeAts;
    private final boolean removePunctuation;
    private final boolean preserveURLs;
    private final Set<String> stopwords;
    private boolean lowercase;
    private final boolean hashtagsOnly;

    public Tokeniser(Options options) {
        // Initialise lowercase from options up front: the stop-word list
        // below is lowercased only when lowercase mode is on, and the field
        // was previously never read from options at all.
        this.lowercase = options.lowercase.get();
        this.removeAts = options.removeAts.get();
        this.removePunctuation = options.removePunctuation.get();
        this.removeHashes = options.removeHashes.get();
        this.preserveURLs = options.preserveURLs.get();
        hashtagsOnly = options.hashtagsOnly.get();

        if (options.filterStopwords.get()) {
            try {

                File stopwordFile = options.stopwordFile.get();

                String raw = Files
                        .getText(new InputStreamReader(
                                Resources.getResource(this.getClass(), stopwordFile.getPath()).openStream()), true)
                        .toString();

                if (lowercase) {

                    raw = raw.toLowerCase();
                }

                String[] words = raw.split("\n");

                stopwords = new ObjectOpenHashSet<>(words);

                if (options.filterStopwordsIncKeywords.get()) {
                    stopwords.addAll(options.keywords.get());
                }

            } catch (IOException e) {

                LOG.error("could not open stop word list file!", e);
                throw new RuntimeException(e);
            }
        } else {
            stopwords = Collections.emptySet();
        }
    }

    public static Map<Document, List<String>> getTokenisationCache() {
        return tokenisationCache;
    }

    public static String getString(int idx) {

        return bacov.get(idx);
    }

    public static int getIndex(String token) {

        // Unsynchronised fast path for tokens that are already interned.
        if (vocab.containsKey(token)) {
            return vocab.getInt(token);
        }

        synchronized (vocab) {
            // Re-check under the lock: another thread may have interned the
            // token between the lookup above and acquiring the monitor, and
            // inserting twice would leave a stale bacov entry.
            if (vocab.containsKey(token)) {
                return vocab.getInt(token);
            }

            int idx = vocab.size();
            vocab.put(token, idx);
            bacov.put(idx, token);
            return idx;
        }
    }

    public static Int2ObjectMap<String> getBacov() {
        return bacov;
    }

    public static Object2IntMap<String> getVocab() {
        return vocab;
    }

    public List<String> tokenise(Document doc) {

        List<String> cached = tokenisationCache.get(doc);
        if (cached != null) {
            return cached;
        }

        // Cache miss: tokenise the document text and remember the result.
        // (The original checked the cache but never populated it.)
        List<String> tokens = tokenise(doc.getText(), null);
        tokenisationCache.put(doc, tokens);
        return tokens;
    }

    public List<String> tokenise(String text) {
        return tokenise(text, null);
    }

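    /**
     * Tokenises raw tweet text. When lowercase mode and preserveURLs are
     * both enabled, urls is expected to carry the original-case URLs,
     * space-separated, in the order they appear in the text.
     */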
    public List<String> tokenise(String text, String urls) {
        if (lowercase) {

            if (urls != null && preserveURLs && urls.length() > 0) {
                // Lowercase the text, then splice each original-case URL back
                // in over the corresponding "http..." span, so URL casing
                // survives lowercasing.
                String[] bits = urls.split(" ");

                text = text.toLowerCase();
                int indexBegin = 0;
                int indexEnd;
                for (int i = 0; i < bits.length; ++i) {

                    indexBegin = text.indexOf("http", indexBegin);
                    if (indexBegin < 0) {
                        // Fewer "http" spans in the text than URLs supplied.
                        break;
                    }
                    indexEnd = text.indexOf(" ", indexBegin);
                    indexEnd = indexEnd < 0 ? text.length() : indexEnd;
                    text = text.substring(0, indexBegin) + bits[i] + text.substring(indexEnd);
                    // Continue searching after the URL just spliced in, so it
                    // is not matched and overwritten on the next iteration.
                    indexBegin += bits[i].length();
                }

            } else {
                text = text.toLowerCase();
            }
        }

        List<String> tokens = Twokenize.tokenizeRawTweetText(text);

        //String[] tokens = text.split(" ");
        List<String> out = new ArrayList<>();

        for (String token : tokens) {

            if (hashtagsOnly && !token.startsWith("#")) {
                continue;
            }

            token = token.replaceAll("\n", "");
            //token = token.replaceAll("\"", "''");

            if (token.endsWith(".")) {
                token = token.substring(0, token.length() - 1);
            }

            if (removeHashes && token.startsWith("#")) {
                token = token.substring(1);
            }

            if (removeAts && token.startsWith("@")) {
                token = token.substring(1);
            }

            if (removePunctuation) {
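                // Skip tokens that consist entirely of punctuation.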
                Matcher m = punctuationPattern.matcher(token);
                String t = m.replaceAll("");
                if (t.length() == 0) {
                    continue;
                }
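                // Map backslash and the ','-'/' range (comma, hyphen, full
                // stop, slash) to spaces, then collapse repeated whitespace.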
                token = t.replaceAll("[\\\\,-/]", " ").replaceAll("\\s{2,}", " ").trim();
            }

            if (stopwords.isEmpty() || !stopwords.contains(lowercase ? token : token.toLowerCase())) {
                out.add(token);
            }
        }

        return out;
    }

    public void setLowercase(boolean lowercase) {
        this.lowercase = lowercase;
    }

    public static class Options extends Params {

        // globals
        private final Param<PathParms> pathParams = new Param<>();
        // local stuff
        public final Param<Boolean> lowercase = new Param<>();
        public final Param<File> stopwordFile = new Param<>();
        public final Param<Boolean> filterStopwords = new Param<>();
        public final Param<Boolean> filterStopwordsIncKeywords = new Param<>();
        public final Param<List<String>> keywords = new Param<>();
        public final Param<Boolean> preserveURLs = new Param<>();
        public final Param<Boolean> removeHashes = new Param<>();
        public final Param<Boolean> removeAts = new Param<>();
        public final Param<Boolean> removePunctuation = new Param<>();
        public final Param<Boolean> hashtagsOnly = new Param<>();

        {

            pathParams.defaultValue(Params.instance(PathParms.class));

            lowercase.defaultValue(false);
            lowercase.doc("work in lowercase mode");

            stopwordFile.defaultValue(new File("stopwords/www.twithawk.com-faq-stopwords.txt"));
            stopwordFile.doc("path to stopwords file (relative to data path)");

            filterStopwords.defaultValue(false);
            filterStopwords.doc("enable stop word filtering in tokeniser");

            filterStopwordsIncKeywords.defaultValue(false);
            filterStopwordsIncKeywords.doc("enable stop word filtering inc keywords in tokeniser");

            keywords.defaultValue(Collections.<String>emptyList());
            keywords.doc("list of keywords to track");

            preserveURLs.defaultValue(false);
            preserveURLs.doc("preserve URL casing in tokenisation");

            removeHashes.defaultValue(false);
            removeHashes.doc("strip the leading '#' from hashtag tokens");

            removeAts.defaultValue(false);
            removeAts.doc("strip the leading '@' from mention tokens");

            removePunctuation.defaultValue(false);
            removePunctuation.doc("remove characters matching the punctuation pattern");

            hashtagsOnly.doc("only pay attention to hashtags");
            hashtagsOnly.defaultValue(false);
        }

        public File getStopwordFile() {
            return pathParams.get().getDataFile(stopwordFile.get());
        }
    }
}
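
Example

A minimal usage sketch. It assumes the Params.instance(...) factory seen in
the listing above (which uses Params.instance(PathParms.class)) returns an
instance of the given Params subclass with its default values applied; the
TokeniserExample class is hypothetical scaffolding, not part of method51.

import uk.ac.susx.tag.method51.core.params.Params;
import uk.ac.susx.tag.method51.text.Tokeniser;

import java.util.List;

public class TokeniserExample {

    public static void main(String[] args) {
        // Build an Options instance with its default values, following the
        // Params.instance(...) pattern used inside the Tokeniser source.
        Tokeniser.Options options = Params.instance(Tokeniser.Options.class);

        Tokeniser tokeniser = new Tokeniser(options);
        tokeniser.setLowercase(true);

        // With the defaults, stop-word filtering and punctuation removal are
        // off, so this lowercases the text and tokenises it with Twokenize.
        List<String> tokens = tokeniser.tokenise("Loving the #weather today @friend");
        System.out.println(tokens);
    }
}

With the default options every flag is false, so the tokeniser behaves as a
thin wrapper around Twokenize.tokenizeRawTweetText; enabling flags such as
removeHashes or filterStopwords changes the post-processing applied to each
token, as shown in the tokenise(String, String) method above.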