Java tutorial: Stanford CoreNLP's TokenizerAnnotator
package edu.stanford.nlp.pipeline;

import java.io.Reader;
import java.io.StringReader;
import java.util.*;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.*;
import edu.stanford.nlp.international.spanish.process.SpanishTokenizer;
import edu.stanford.nlp.international.french.process.FrenchTokenizer;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.ReflectionLoading;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * This class will PTB tokenize the input. It assumes that the original
 * String is under the CoreAnnotations.TextAnnotation field
 * and it will add the output from the
 * InvertiblePTBTokenizer ({@code List<CoreLabel>}) under
 * CoreAnnotations.TokensAnnotation.
 *
 * @author Jenny Finkel
 * @author Christopher Manning
 * @author Ishita Prasad
 */
public class TokenizerAnnotator implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(TokenizerAnnotator.class);

  /**
   * Enum to identify the different TokenizerTypes. To add a new
   * TokenizerType, add it to the list with a default options string
   * and add a clause in getTokenizerType to identify it.
   */
  public enum TokenizerType {
    Unspecified(null, null, "invertible,ptb3Escaping=true"),
    Arabic("ar", null, ""),
    Chinese("zh", null, ""),
    Spanish("es", "SpanishTokenizer", "invertible,splitAll=false"),
    English("en", "PTBTokenizer", "invertible"),
    German("de", null, "invertible,ptb3Escaping=false,splitHyphenated=true"),
    French("fr", "FrenchTokenizer", "invertible,splitCompounds=false,splitContractions=false,quotes=ORIGINAL"),
    Whitespace(null, "WhitespaceTokenizer", "");

    private final String abbreviation;
    private final String className;
    private final String defaultOptions;

    TokenizerType(String abbreviation, String className, String defaultOptions) {
      this.abbreviation = abbreviation;
      this.className = className;
      this.defaultOptions = defaultOptions;
    }

    public String getDefaultOptions() {
      return defaultOptions;
    }

    private static final Map<String, TokenizerType> nameToTokenizerMap = initializeNameMap();

    private static Map<String, TokenizerType> initializeNameMap() {
      Map<String, TokenizerType> map = Generics.newHashMap();
      for (TokenizerType type : TokenizerType.values()) {
        if (type.abbreviation != null) {
          map.put(type.abbreviation.toUpperCase(), type);
        }
        map.put(type.toString().toUpperCase(), type);
      }
      return Collections.unmodifiableMap(map);
    }

    private static final Map<String, TokenizerType> classToTokenizerMap = initializeClassMap();

    private static Map<String, TokenizerType> initializeClassMap() {
      Map<String, TokenizerType> map = Generics.newHashMap();
      for (TokenizerType type : TokenizerType.values()) {
        if (type.className != null) {
          map.put(type.className.toUpperCase(), type);
        }
      }
      return Collections.unmodifiableMap(map);
    }
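    // How the two lookup maps resolve, sketched with assumed inputs
    // (both maps are keyed on upper-cased strings, so callers upper-case first):
    //   nameToTokenizerMap.get("ES")            -> Spanish  (by abbreviation)
    //   nameToTokenizerMap.get("FRENCH")        -> French   (by enum name)
    //   classToTokenizerMap.get("PTBTOKENIZER") -> English  (by tokenizer class name)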
    /**
     * Get TokenizerType based on what's in the properties.
     *
     * @param props Properties to find tokenizer options in
     * @return An element of the TokenizerType enum indicating the tokenizer to use
     */
    public static TokenizerType getTokenizerType(Properties props) {
      String tokClass = props.getProperty("tokenize.class", null);
      boolean whitespace = Boolean.parseBoolean(props.getProperty("tokenize.whitespace", "false"));
      String language = props.getProperty("tokenize.language", "en");

      if (whitespace) {
        return Whitespace;
      }

      if (tokClass != null) {
        TokenizerType type = classToTokenizerMap.get(tokClass.toUpperCase());
        if (type == null) {
          throw new IllegalArgumentException("TokenizerAnnotator: unknown tokenize.class property " + tokClass);
        }
        return type;
      }

      if (language != null) {
        TokenizerType type = nameToTokenizerMap.get(language.toUpperCase());
        if (type == null) {
          throw new IllegalArgumentException("TokenizerAnnotator: unknown tokenize.language property " + language);
        }
        return type;
      }

      return Unspecified;
    }

  } // end enum TokenizerType

  @SuppressWarnings("WeakerAccess")
  public static final String EOL_PROPERTY = "tokenize.keepeol";
  @SuppressWarnings("WeakerAccess")
  public static final String KEEP_NL_OPTION = "tokenizeNLs,";

  private final boolean VERBOSE;
  private final TokenizerFactory<CoreLabel> factory;

  /** new segmenter properties **/
  private final boolean useSegmenter;
  private final Annotator segmenterAnnotator;

  /** run a custom post processor after the lexer **/
  private final boolean usePostProcessor;
  private final CoreLabelProcessor postProcessor;

  // CONSTRUCTORS

  /** Gives a non-verbose, English tokenizer. */
  public TokenizerAnnotator() {
    this(false);
  }

  private static String computeExtraOptions(Properties properties) {
    String extraOptions = null;
    boolean keepNewline = Boolean.parseBoolean(
        properties.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false")); // ssplit.eolonly

    // Only possibly put in *NL* if not NEVER (the Boolean method treats null as false).
    // We used to also check for the ssplit annotator being present, but
    // that was wrong in the case where a tokenizer model was
    // preloaded (such as in the case of segmenters) and we didn't
    // want to need to reload the model when the ssplit was later added.
    if (!Boolean.parseBoolean(properties.getProperty("ssplit.isOneSentence"))) {
      // Set to { NEVER, ALWAYS, TWO_CONSECUTIVE } based on ssplit.newlineIsSentenceBreak
      String nlsbString = properties.getProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY,
                                                 StanfordCoreNLP.DEFAULT_NEWLINE_IS_SENTENCE_BREAK);
      WordToSentenceProcessor.NewlineIsSentenceBreak nlsb =
          WordToSentenceProcessor.stringToNewlineIsSentenceBreak(nlsbString);
      if (nlsb != WordToSentenceProcessor.NewlineIsSentenceBreak.NEVER) {
        keepNewline = true;
      }
    }

    if (keepNewline) {
      extraOptions = KEEP_NL_OPTION;
    }
    return extraOptions;
  }

  public TokenizerAnnotator(Properties properties) {
    this(false, properties, computeExtraOptions(properties));
  }

  public TokenizerAnnotator(boolean verbose) {
    this(verbose, TokenizerType.English);
  }

  public TokenizerAnnotator(String lang) {
    this(true, lang, null);
  }

  public TokenizerAnnotator(boolean verbose, TokenizerType lang) {
    this(verbose, lang.toString());
  }

  public TokenizerAnnotator(boolean verbose, String lang) {
    this(verbose, lang, null);
  }
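  // Constructor chaining, sketched with an assumed language value: every
  // convenience constructor above funnels into the (boolean, Properties, String)
  // constructor below, e.g.
  //   new TokenizerAnnotator("es")
  //     -> this(true, "es", null)
  //     -> this(true, PropertiesUtils.asProperties("tokenize.language", "es"), null)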
  public TokenizerAnnotator(boolean verbose, String lang, String options) {
    this(verbose,
         lang == null ? null : PropertiesUtils.asProperties("tokenize.language", lang),
         options);
  }

  public TokenizerAnnotator(boolean verbose, Properties props) {
    this(verbose, props, null);
  }

  public TokenizerAnnotator(boolean verbose, Properties props, String options) {
    if (props == null) {
      props = new Properties();
    }

    // check if segmenting must be done (Chinese or Arabic, and not tokenizing on whitespace)
    boolean whitespace = Boolean.parseBoolean(props.getProperty("tokenize.whitespace", "false"));
    if (props.getProperty("tokenize.language") != null &&
        LanguageInfo.isSegmenterLanguage(props.getProperty("tokenize.language")) &&
        !whitespace) {
      useSegmenter = true;
      if (LanguageInfo.getLanguageFromString(props.getProperty("tokenize.language"))
          == LanguageInfo.HumanLanguage.ARABIC) {
        segmenterAnnotator = new ArabicSegmenterAnnotator("segment", props);
      } else if (LanguageInfo.getLanguageFromString(props.getProperty("tokenize.language"))
                 == LanguageInfo.HumanLanguage.CHINESE) {
        segmenterAnnotator = new ChineseSegmenterAnnotator("segment", props);
      } else {
        segmenterAnnotator = null;
        throw new RuntimeException("No segmenter implemented for: " +
            LanguageInfo.getLanguageFromString(props.getProperty("tokenize.language")));
      }
    } else {
      useSegmenter = false;
      segmenterAnnotator = null;
    }

    // load any custom token post processing
    String postProcessorClass = props.getProperty("tokenize.postProcessor", "");
    try {
      if (!postProcessorClass.equals("")) {
        postProcessor = ReflectionLoading.loadByReflection(postProcessorClass);
        usePostProcessor = true;
      } else {
        postProcessor = null;
        usePostProcessor = false;
      }
    } catch (Exception e) {
      throw new RuntimeException("Loading: " + postProcessorClass + " failed with: " + e.getMessage());
    }

    VERBOSE = PropertiesUtils.getBool(props, "tokenize.verbose", verbose);
    TokenizerType type = TokenizerType.getTokenizerType(props);
    factory = initFactory(type, props, options);
    if (VERBOSE) {
      log.info("Initialized tokenizer factory: " + factory);
    }
  }
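  // How initFactory (below) merges option strings, with assumed values:
  //   tokenize.options = "ptb3Escaping=false" and extraOptions = "tokenizeNLs,"
  // yields "tokenizeNLs,ptb3Escaping=false" -- extraOptions is prepended, and a
  // comma is only inserted when extraOptions doesn't already end with one.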
  /**
   * initFactory returns the right type of TokenizerFactory based on the options in the properties file
   * and the type. When adding a new Tokenizer, modify TokenizerType.getTokenizerType() to retrieve
   * your tokenizer from the properties file, and then add a case to the switch structure here to
   * instantiate the new Tokenizer type.
   *
   * @param type the TokenizerType
   * @param props the properties file
   * @param extraOptions extra things that should be passed into the tokenizer constructor
   */
  private static TokenizerFactory<CoreLabel> initFactory(TokenizerType type, Properties props, String extraOptions)
      throws IllegalArgumentException {
    TokenizerFactory<CoreLabel> factory;
    String options = props.getProperty("tokenize.options", null);

    // set it to the equivalent of both extraOptions and options
    // TODO: maybe we should always have getDefaultOptions() and
    // expect the user to turn off default options. That would
    // require all options to have negated options, but
    // currently there are some which don't have that.
    if (options == null) {
      options = type.getDefaultOptions();
    }
    if (extraOptions != null) {
      if (extraOptions.endsWith(",")) {
        options = extraOptions + options;
      } else {
        options = extraOptions + ',' + options;
      }
    }

    switch (type) {
      case Arabic:
      case Chinese:
        factory = null;
        break;

      case Spanish:
        factory = SpanishTokenizer.factory(new CoreLabelTokenFactory(), options);
        break;

      case French:
        factory = FrenchTokenizer.factory(new CoreLabelTokenFactory(), options);
        break;

      case Whitespace:
        boolean eolIsSignificant = Boolean.parseBoolean(props.getProperty(EOL_PROPERTY, "false"));
        eolIsSignificant = eolIsSignificant || KEEP_NL_OPTION.equals(computeExtraOptions(props));
        factory = new WhitespaceTokenizer.WhitespaceTokenizerFactory<>(new CoreLabelTokenFactory(), eolIsSignificant);
        break;

      case English:
      case German:
        factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
        break;

      case Unspecified:
        log.info("No tokenizer type provided. Defaulting to PTBTokenizer.");
        factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
        break;

      default:
        throw new IllegalArgumentException("No valid tokenizer type provided.\n" +
            "Use -tokenize.language, -tokenize.class, or -tokenize.whitespace\n" +
            "to specify a tokenizer.");
    }
    return factory;
  }

  /**
   * Returns a thread-safe tokenizer.
   */
  public Tokenizer<CoreLabel> getTokenizer(Reader r) {
    return factory.getTokenizer(r);
  }

  /**
   * Helper method to set the TokenBeginAnnotation and TokenEndAnnotation of every token.
   */
  private static void setTokenBeginTokenEnd(List<CoreLabel> tokensList) {
    int tokenIndex = 0;
    for (CoreLabel token : tokensList) {
      token.set(CoreAnnotations.TokenBeginAnnotation.class, tokenIndex);
      token.set(CoreAnnotations.TokenEndAnnotation.class, tokenIndex + 1);
      tokenIndex++;
    }
  }

  /**
   * Set the IsNewlineAnnotation (isNewline()) of every token.
   */
  private static void setNewlineStatus(List<CoreLabel> tokensList) {
    // label newlines
    for (CoreLabel token : tokensList) {
      if (token.word().equals(AbstractTokenizer.NEWLINE_TOKEN) &&
          (token.endPosition() - token.beginPosition() == 1)) {
        token.set(CoreAnnotations.IsNewlineAnnotation.class, true);
      } else {
        token.set(CoreAnnotations.IsNewlineAnnotation.class, false);
      }
    }
  }
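  // The annotate flow below, sketched for the non-segmenter path:
  //   TextAnnotation -> getTokenizer(reader).tokenize() -> setNewlineStatus
  //     -> setTokenBeginTokenEnd -> (optional postProcessor) -> TokensAnnotation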
  /**
   * Does the actual work of splitting TextAnnotation into CoreLabels,
   * which are then attached to the TokensAnnotation.
   */
  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      log.info("Beginning tokenization");
    }

    // for Arabic and Chinese, use a segmenter instead
    if (useSegmenter) {
      segmenterAnnotator.annotate(annotation);
      // set indexes into the document-wide tokens list
      setTokenBeginTokenEnd(annotation.get(CoreAnnotations.TokensAnnotation.class));
      setNewlineStatus(annotation.get(CoreAnnotations.TokensAnnotation.class));
      return;
    }

    if (annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
      String text = annotation.get(CoreAnnotations.TextAnnotation.class);
      // don't wrap in BufferedReader. It gives you nothing for an in-memory String
      // unless you need the readLine() method!
      Reader r = new StringReader(text);

      List<CoreLabel> tokens = getTokenizer(r).tokenize();
      // cdm 2010-05-15: This is now unnecessary, as it is done in CoreLabelTokenFactory
      // for (CoreLabel token : tokens) {
      //   token.set(CoreAnnotations.TextAnnotation.class, token.get(CoreAnnotations.TextAnnotation.class));
      // }

      // label newlines
      setNewlineStatus(tokens);

      // set indexes into the document-wide token list
      setTokenBeginTokenEnd(tokens);

      // run post processing
      if (usePostProcessor) {
        tokens = postProcessor.process(tokens);
      }

      // add the tokens list to the annotation
      annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);

      if (VERBOSE) {
        log.info("Tokenized: " + annotation.get(CoreAnnotations.TokensAnnotation.class));
      }
    } else {
      throw new RuntimeException("Tokenizer unable to find text in annotation: " + annotation);
    }
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.emptySet();
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.BeforeAnnotation.class,
        CoreAnnotations.AfterAnnotation.class,
        CoreAnnotations.TokenBeginAnnotation.class,
        CoreAnnotations.TokenEndAnnotation.class,
        CoreAnnotations.PositionAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.OriginalTextAnnotation.class,
        CoreAnnotations.ValueAnnotation.class,
        CoreAnnotations.IsNewlineAnnotation.class));
  }

}
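To see the annotator end to end, here is a minimal usage sketch. The constructor and annotation keys come from the class above; the driver class name, sample sentence, and property values are made up for illustration.

// TokenizerAnnotatorDemo.java -- hypothetical driver, not part of CoreNLP
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.TokenizerAnnotator;

public class TokenizerAnnotatorDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("tokenize.language", "en");   // resolves to TokenizerType.English

    TokenizerAnnotator annotator = new TokenizerAnnotator(false, props);
    Annotation annotation = new Annotation("Stanford CoreNLP tokenizes text.");
    annotator.annotate(annotation);

    // each CoreLabel carries its word plus character offsets into the original string
    for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
      System.out.println(token.word() + " [" + token.beginPosition() + "," + token.endPosition() + ")");
    }
  }
}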