Java tutorial
/** ======================================================================== * handytrowel: src/main/java/nlp/CommonWordAnnotator.java * Stip out stemmed versions of the 10,000 most common English words. * ======================================================================== * Copyright (c) 2014, Asim Ihsan, All rights reserved. * <http://www.asimihsan.com> * https://github.com/asimihsan/handytrowel/blob/master/LICENSE * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * ======================================================================== */ /** * Inspired by: https://github.com/jconwell/coreNlp/blob/master/src/main/java/intoxicant/analytics/coreNlp/StopwordAnnotator.java */ package com.asimihsan.handytrowel.nlp; import java.util.Collections; import java.util.List; import java.util.Properties; import java.util.Set; import edu.stanford.nlp.ling.CoreAnnotation; import edu.stanford.nlp.pipeline.Annotator; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.util.Version; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.util.Pair; /** * User: jconwell * CoreNlp Annotator that checks if in coming token is a stopword */ public class StopwordAnnotator implements Annotator, CoreAnnotation<Pair<Boolean, Boolean>> { /** * stopword annotator class name used in annotators property */ public static final String ANNOTATOR_CLASS = "stopword"; public static final String STANFORD_STOPWORD = ANNOTATOR_CLASS; public static final Requirement STOPWORD_REQUIREMENT = new Requirement(STANFORD_STOPWORD); /** * Property key to specify the comma delimited list of custom stopwords */ public static final String STOPWORDS_LIST = "stopword-list"; /** * Property key to specify if stopword list is case insensitive */ public static final String IGNORE_STOPWORD_CASE = "ignore-stopword-case"; /** * Property key to specify of StopwordAnnotator should check word lemma as stopword */ public static final String CHECK_LEMMA = "check-lemma"; private static Class<? extends Pair> boolPair = Pair.makePair(true, true).getClass(); private Properties props; private CharArraySet stopwords; private boolean checkLemma; public StopwordAnnotator(String annotatorClass, Properties props) { this.props = props; this.checkLemma = Boolean.parseBoolean(props.getProperty(CHECK_LEMMA, "false")); if (this.props.containsKey(STOPWORDS_LIST)) { String stopwordList = props.getProperty(STOPWORDS_LIST); boolean ignoreCase = Boolean.parseBoolean(props.getProperty(IGNORE_STOPWORD_CASE, "false")); this.stopwords = getStopWordList(Version.LUCENE_36, stopwordList, ignoreCase); } else { this.stopwords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET; } } @Override public void annotate(Annotation annotation) { if (stopwords != null && stopwords.size() > 0 && annotation.containsKey(TokensAnnotation.class)) { List<CoreLabel> tokens = annotation.get(TokensAnnotation.class); for (CoreLabel token : tokens) { boolean isWordStopword = stopwords.contains(token.word().toLowerCase()); boolean isLemmaStopword = checkLemma ? stopwords.contains(token.lemma().toLowerCase()) : false; Pair<Boolean, Boolean> pair = Pair.makePair(isWordStopword, isLemmaStopword); token.set(StopwordAnnotator.class, pair); } } } @Override public Set<Requirement> requirementsSatisfied() { return Collections.singleton(STOPWORD_REQUIREMENT); } @Override public Set<Requirement> requires() { if (checkLemma) { return TOKENIZE_SSPLIT_POS_LEMMA; } else { return TOKENIZE_AND_SSPLIT; } } @Override @SuppressWarnings("unchecked") public Class<Pair<Boolean, Boolean>> getType() { return (Class<Pair<Boolean, Boolean>>) boolPair; } public static CharArraySet getStopWordList(Version luceneVersion, String stopwordList, boolean ignoreCase) { String[] terms = stopwordList.split(","); CharArraySet stopwordSet = new CharArraySet(luceneVersion, terms.length, ignoreCase); for (String term : terms) { stopwordSet.add(term); } return CharArraySet.unmodifiableSet(stopwordSet); } }