Java tutorial: caching OpenNLP model objects with Lucene's OpenNLPOpsFactory
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.opennlp.tools;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerModel;

import org.apache.lucene.analysis.util.ResourceLoader;

/**
 * Supplies OpenNLP-based operations (sentence detection, tokenization, POS tagging,
 * chunking, named entity recognition, and lemmatization).
 * Caches model file objects. Assumes model files are thread-safe.
 */
public class OpenNLPOpsFactory {
  private static Map<String, SentenceModel> sentenceModels = new ConcurrentHashMap<>();
  private static Map<String, TokenizerModel> tokenizerModels = new ConcurrentHashMap<>();
  private static Map<String, POSModel> posTaggerModels = new ConcurrentHashMap<>();
  private static Map<String, ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
  private static Map<String, TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
  private static Map<String, LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
  private static Map<String, String> lemmaDictionaries = new ConcurrentHashMap<>();

  public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
    if (modelName != null) {
      SentenceModel model = sentenceModels.get(modelName);
      return new NLPSentenceDetectorOp(model);
    } else {
      return new NLPSentenceDetectorOp();
    }
  }

  public static SentenceModel getSentenceModel(String modelName, ResourceLoader loader) throws IOException {
    SentenceModel model = sentenceModels.get(modelName);
    if (model == null) {
      try (InputStream resource = loader.openResource(modelName)) {
        model = new SentenceModel(resource);
      }
      sentenceModels.put(modelName, model);
    }
    return model;
  }

  public static NLPTokenizerOp getTokenizer(String modelName) throws IOException {
    if (modelName == null) {
      return new NLPTokenizerOp();
    } else {
      TokenizerModel model = tokenizerModels.get(modelName);
      return new NLPTokenizerOp(model);
    }
  }

  public static TokenizerModel getTokenizerModel(String modelName, ResourceLoader loader) throws IOException {
    TokenizerModel model = tokenizerModels.get(modelName);
    if (model == null) {
      try (InputStream resource = loader.openResource(modelName)) {
        model = new TokenizerModel(resource);
      }
      tokenizerModels.put(modelName, model);
    }
    return model;
  }

  public static NLPPOSTaggerOp getPOSTagger(String modelName) throws IOException {
    POSModel model = posTaggerModels.get(modelName);
    return new NLPPOSTaggerOp(model);
  }

  public static POSModel getPOSTaggerModel(String modelName, ResourceLoader loader) throws IOException {
    POSModel model = posTaggerModels.get(modelName);
    if (model == null) {
      try (InputStream resource = loader.openResource(modelName)) {
        model = new POSModel(resource);
      }
      posTaggerModels.put(modelName, model);
    }
    return model;
  }

  public static NLPChunkerOp getChunker(String modelName) throws IOException {
    ChunkerModel model = chunkerModels.get(modelName);
    return new NLPChunkerOp(model);
  }

  public static ChunkerModel getChunkerModel(String modelName, ResourceLoader loader) throws IOException {
    ChunkerModel model = chunkerModels.get(modelName);
    if (model == null) {
      try (InputStream resource = loader.openResource(modelName)) {
        model = new ChunkerModel(resource);
      }
      chunkerModels.put(modelName, model);
    }
    return model;
  }

  public static NLPNERTaggerOp getNERTagger(String modelName) throws IOException {
    TokenNameFinderModel model = nerModels.get(modelName);
    return new NLPNERTaggerOp(model);
  }

  public static TokenNameFinderModel getNERTaggerModel(String modelName, ResourceLoader loader) throws IOException {
    TokenNameFinderModel model = nerModels.get(modelName);
    if (model == null) {
      try (InputStream resource = loader.openResource(modelName)) {
        model = new TokenNameFinderModel(resource);
      }
      nerModels.put(modelName, model);
    }
    return model;
  }

  public static NLPLemmatizerOp getLemmatizer(String dictionaryFile, String lemmatizerModelFile) throws IOException {
    assert dictionaryFile != null || lemmatizerModelFile != null : "At least one parameter must be non-null";
    InputStream dictionaryInputStream = null;
    if (dictionaryFile != null) {
      String dictionary = lemmaDictionaries.get(dictionaryFile);
      dictionaryInputStream = new ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8));
    }
    LemmatizerModel lemmatizerModel =
        lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
    return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel);
  }

  public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader) throws IOException {
    String dictionary = lemmaDictionaries.get(dictionaryFile);
    if (dictionary == null) {
      try (Reader reader = new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8)) {
        StringBuilder builder = new StringBuilder();
        char[] chars = new char[8092];
        int numRead = 0;
        do {
          numRead = reader.read(chars, 0, chars.length);
          if (numRead > 0) {
            builder.append(chars, 0, numRead);
          }
        } while (numRead > 0);
        dictionary = builder.toString();
        lemmaDictionaries.put(dictionaryFile, dictionary);
      }
    }
    return dictionary;
  }

  public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader) throws IOException {
    LemmatizerModel model = lemmatizerModels.get(modelName);
    if (model == null) {
      try (InputStream resource = loader.openResource(modelName)) {
        model = new LemmatizerModel(resource);
      }
      lemmatizerModels.put(modelName, model);
    }
    return model;
  }

  // keeps unit test from blowing out memory
  public static void clearModels() {
    sentenceModels.clear();
    tokenizerModels.clear();
    posTaggerModels.clear();
    chunkerModels.clear();
    nerModels.clear();
    lemmatizerModels.clear(); // also clear cached lemmatizer models, not just the dictionaries
    lemmaDictionaries.clear();
  }
}
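The factory follows a two-step pattern: a get*Model(...) (or getLemmatizerDictionary(...)) call loads a resource through Lucene's ResourceLoader and caches it under its name, and the matching get*(...) call then builds an operation object by consulting only that cache, so the model must be loaded first. Below is a minimal usage sketch of that pattern, not part of the class above. It assumes Lucene's ClasspathResourceLoader from the analyzers-common module, and the file names "en-sent.bin" and "en-lemmatizer.dict" are illustrative placeholders for OpenNLP binary model and lemma dictionary resources on the classpath.

package org.apache.lucene.analysis.opennlp.tools;

import java.io.IOException;

import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;

public class OpenNLPOpsFactoryExample {
  public static void main(String[] args) throws IOException {
    // Resolve resources from the classpath relative to this class.
    ResourceLoader loader = new ClasspathResourceLoader(OpenNLPOpsFactoryExample.class);

    // Step 1: load the sentence model and cache it under its resource name.
    // "en-sent.bin" is a hypothetical/typical OpenNLP model file name.
    OpenNLPOpsFactory.getSentenceModel("en-sent.bin", loader);

    // Step 2: build the operation; this reads only the cache populated in step 1.
    NLPSentenceDetectorOp sentenceOp = OpenNLPOpsFactory.getSentenceDetector("en-sent.bin");
    System.out.println("Sentence detector ready: " + (sentenceOp != null));

    // The lemmatizer follows the same pattern; here it is built from a cached
    // plain-text dictionary only (second argument null means no statistical model).
    OpenNLPOpsFactory.getLemmatizerDictionary("en-lemmatizer.dict", loader);
    NLPLemmatizerOp lemmatizerOp = OpenNLPOpsFactory.getLemmatizer("en-lemmatizer.dict", null);
    System.out.println("Lemmatizer ready: " + (lemmatizerOp != null));
  }
}

Because the caches are static ConcurrentHashMaps keyed by resource name, repeated requests for the same model reuse one in-memory copy across analyzers and threads; clearModels() exists mainly so tests can release that memory.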