// Java tutorial
package uk.ac.susx.tag.method51.text; /* * #%L * Tokeniser.java - method51 - University of Sussex - 2,013 * %% * Copyright (C) 2013 - 2014 University of Sussex * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import cmu.arktweetnlp.Twokenize; import com.google.common.io.Resources; import it.unimi.dsi.fastutil.ints.Int2ObjectMap; import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; import it.unimi.dsi.fastutil.objects.Object2IntMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import uk.ac.susx.mlcl.lib.io.Files; import uk.ac.susx.tag.method51.core.params.Params; import uk.ac.susx.tag.method51.twitter.params.PathParms; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Created with IntelliJ IDEA. * User: sw206 * Date: 13/06/2013 * Time: 13:45 * To change this template use File | Settings | File Templates. 
*/ public class Tokeniser { private static final Pattern punctuationPattern = Pattern .compile("^[\\u0021-\\u002f\\u003a-\\u0040\\u005b-\\u0060\\u007b-\\u007e\\u2000-\\u206f]+$"); private static final Logger LOG = LoggerFactory.getLogger(Tokeniser.class); private static final Map<Document, List<String>> tokenisationCache = Collections .synchronizedMap(new WeakHashMap<Document, List<String>>()); private static final Object2IntMap<String> vocab = new Object2IntOpenHashMap<>(); private static final Int2ObjectMap<String> bacov = new Int2ObjectOpenHashMap<>(); private final boolean removeHashes; private final boolean removeAts; private final boolean removePunctuation; private final boolean preserveURLs; private final Set<String> stopwords; private boolean lowercase; private final boolean hashtagsOnly; public Tokeniser(Options options) { this.removeAts = options.removeAts.get(); this.removePunctuation = options.removePunctuation.get(); this.removeHashes = options.removeHashes.get(); this.preserveURLs = options.preserveURLs.get(); hashtagsOnly = options.hashtagsOnly.get(); if (options.filterStopwords.get()) { try { File stopwordFile = options.stopwordFile.get(); String raw = Files .getText(new InputStreamReader( Resources.getResource(this.getClass(), stopwordFile.getPath()).openStream()), true) .toString(); if (lowercase) { raw = raw.toLowerCase(); } String[] words = raw.split("\n"); stopwords = new ObjectOpenHashSet<>(words); if (options.filterStopwordsIncKeywords.get()) { stopwords.addAll(options.keywords.get()); } } catch (IOException e) { LOG.error("could not open stop word list file!", e); throw new RuntimeException(e); } } else { stopwords = Collections.emptySet(); } } public static Map<Document, List<String>> getTokenisationCache() { return tokenisationCache; } public static String getString(int idx) { return bacov.get(idx); } public static int getIndex(String token) { if (vocab.containsKey(token)) { return vocab.getInt(token); } else { synchronized (vocab) { int 
idx = vocab.size(); vocab.put(token, idx); bacov.put(idx, token); return idx; } } } public static Int2ObjectMap<String> getBacov() { return bacov; } public static Object2IntMap<String> getVocab() { return vocab; } public List<String> tokenise(Document doc) { if (tokenisationCache.containsKey(doc)) { return tokenisationCache.get(doc); } else { String text = doc.getText(); return tokenise(text, null); } } public List<String> tokenise(String text) { return tokenise(text, null); } public List<String> tokenise(String text, String urls) { if (lowercase) { if (urls != null && preserveURLs && urls.length() > 0) { String[] bits = urls.split(" "); text = text.toLowerCase(); int indexBegin = 0; int indexEnd = 0; for (int i = 0; i < bits.length; ++i) { indexBegin = text.indexOf("http", indexBegin); indexEnd = text.indexOf(" ", indexBegin); indexEnd = indexEnd < 0 ? text.length() : indexEnd; text = text.substring(0, indexBegin) + bits[i] + text.substring(indexEnd); } } else { text = text.toLowerCase(); } } List<String> tokens = Twokenize.tokenizeRawTweetText(text); //String[] tokens = text.split(" "); List<String> out = new ArrayList<>(); for (String token : tokens) { if (hashtagsOnly && !token.startsWith("#")) { continue; } token = token.replaceAll("\n", ""); //token = token.replaceAll("\"", "''"); if (token.endsWith(".")) { token = token.substring(0, token.length() - 1); } if (removeHashes && token.startsWith("#")) { token = token.substring(1); } if (removeAts && token.startsWith("@")) { token = token.substring(1); } if (removePunctuation) { Matcher m = punctuationPattern.matcher(token); String t = m.replaceAll(""); if (t.length() == 0) { continue; } token = t.replaceAll("[\\\\,-/]", " ").replaceAll("\\s{2,}", " ").trim(); } if (stopwords.isEmpty() || !stopwords.contains(lowercase ? 
token : token.toLowerCase())) { out.add(token); } } return out; } public void setLowercase(boolean lowercase) { this.lowercase = lowercase; } public static class Options extends Params { // globals private final Param<PathParms> pathParams = new Param<>(); // local stuff public final Param<Boolean> lowercase = new Param<>(); public final Param<File> stopwordFile = new Param<>(); public final Param<Boolean> filterStopwords = new Param<>(); public final Param<Boolean> filterStopwordsIncKeywords = new Param<>(); public final Param<List<String>> keywords = new Param<>(); public final Param<Boolean> preserveURLs = new Param<>(); public final Param<Boolean> removeHashes = new Param<>(); public final Param<Boolean> removeAts = new Param<>(); public final Param<Boolean> removePunctuation = new Param<>(); public final Param<Boolean> hashtagsOnly = new Param<>(); { pathParams.defaultValue(Params.instance(PathParms.class)); lowercase.defaultValue(false); lowercase.doc("work in lowercase mode"); stopwordFile.defaultValue(new File("stopwords/www.twithawk.com-faq-stopwords.txt")); stopwordFile.doc("path to stopwords file (relative to data path)"); filterStopwords.defaultValue(false); filterStopwords.doc("enable stop word filtering in tokeniser"); filterStopwordsIncKeywords.defaultValue(false); filterStopwordsIncKeywords.doc("enable stop word filtering inc keywords in tokeniser"); keywords.defaultValue(Collections.<String>emptyList()); keywords.doc("List of keywords to track "); preserveURLs.defaultValue(false); preserveURLs.doc("preserve URL casing in tokenisation"); removeHashes.defaultValue(false); removeHashes.doc("remove #'s"); removeAts.defaultValue(false); removeAts.doc("remove @'s"); removePunctuation.defaultValue(false); removePunctuation.doc("remove characters matching"); hashtagsOnly.doc("only pay attention to hashtags"); hashtagsOnly.defaultValue(false); } public File getStopwordFile() { return pathParams.get().getDataFile(stopwordFile.get()); } } }