de.tudarmstadt.ukp.dkpro.core.lingpipe.LingPipeNamedEntityRecognizer.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.core.lingpipe.LingPipeNamedEntityRecognizer.java
Source

/**
 * Copyright 2007-2014
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package de.tudarmstadt.ukp.dkpro.core.lingpipe;

import static java.util.Arrays.asList;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.toText;
import static org.apache.uima.util.Level.INFO;

import java.io.InputStream;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.reflect.FieldUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import com.aliasi.chunk.AbstractCharLmRescoringChunker;
import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunker;
import com.aliasi.chunk.Chunking;
import com.aliasi.chunk.HmmChunker;
import com.aliasi.chunk.TokenShapeChunker;
import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.symbol.SymbolTable;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * LingPipe named entity recognizer.
 *
 * <p>
 * Loads a serialized LingPipe {@link Chunker} through a {@link ModelProviderBase}, runs it over
 * the document text of each CAS and creates one {@link NamedEntity} annotation per chunk that
 * the chunker returns. The UIMA type of each annotation is looked up from the chunk type through
 * a {@link MappingProvider}.
 * </p>
 */
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, outputs = {
        "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" })
public class LingPipeNamedEntityRecognizer extends JCasAnnotator_ImplBase {
    /**
     * Log the tag set(s) when a model is loaded.
     */
    public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
    @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false")
    protected boolean printTagSet;

    /**
     * Use this language instead of the document language to resolve the model.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    protected String language;

    /**
     * Variant of the model. Used to address a specific model if there are multiple models for
     * one language.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    protected String variant;

    /**
     * Location from which the model is read.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    protected String modelLocation;

    /**
     * Location of the mapping file for named entity tags to UIMA types.
     */
    public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false)
    protected String mappingLocation;

    /**
     * Prefixes that are removed from HMM state names to obtain the plain entity tag.
     */
    private static final List<String> HMM_STATE_PREFIXES = asList("BB_O_", "EE_O_", "WW_O_", "B_", "M_",
            "E_", "W_");

    private ModelProviderBase<Chunker> modelProvider;
    private MappingProvider mappingProvider;

    /**
     * Sets up the model provider, which deserializes the chunker and records its tag set, and the
     * mapping provider, which resolves chunk types to UIMA types.
     *
     * @param aContext
     *            the UIMA context handed to the super class.
     * @throws ResourceInitializationException
     *             if the super class initialization fails.
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);

        modelProvider = new ModelProviderBase<Chunker>(this, "lingpipe", "ner") {
            @Override
            protected Chunker produceResource(InputStream aStream) throws Exception {
                // NOTE(review): native Java deserialization runs on whatever the model location
                // resolves to - only point this component at trusted model files.
                // The wrapper is not closed here: closing it would also close aStream, which
                // this method did not open.
                ObjectInputStream ois = new ObjectInputStream(aStream);
                Chunker chunker = (Chunker) ois.readObject();

                // Collect the tags the model can produce. How they are stored depends on the
                // concrete chunker implementation; unknown implementations yield an empty set.
                SingletonTagset tags = new SingletonTagset(NamedEntity.class, null);
                if (chunker instanceof HmmChunker) {
                    HiddenMarkovModel hmm = ((HmmChunker) chunker).getDecoder().getHmm();
                    SymbolTable stateTable = hmm.stateSymbolTable();

                    for (int n = 0; n < stateTable.numSymbols(); n++) {
                        String tag = LingPipeNamedEntityRecognizer
                                .stripHmmStatePrefix(stateTable.idToSymbol(n));

                        if ("BOS".equals(tag) || "MM_O".equals(tag)) {
                            // BOS is reserved by the system
                            continue;
                        }

                        tags.add(tag);
                    }
                } else if (chunker instanceof TokenShapeChunker) {
                    // The tag table is not exposed through an accessor, so it is read
                    // reflectively from the chunker's private fields.
                    Object decoder = FieldUtils.readField(chunker, "mDecoder", true);
                    Object estimator = FieldUtils.readField(decoder, "mEstimator", true);
                    SymbolTable tagTable = (SymbolTable) FieldUtils.readField(estimator, "mTagSymbolTable", true);
                    for (int n = 0; n < tagTable.numSymbols(); n++) {
                        String tag = tagTable.idToSymbol(n);
                        // Handle BIO encoding
                        if (tag.startsWith("B-") || tag.startsWith("I-")) {
                            tag = tag.substring(2);
                        }
                        if ("O".equals(tag)) {
                            continue;
                        }
                        tags.add(tag);
                    }
                } else if (chunker instanceof AbstractCharLmRescoringChunker) {
                    // The keys of the private type-to-character map are the entity tags.
                    @SuppressWarnings("unchecked")
                    Map<String, Character> typeToChar = (Map<String, Character>) FieldUtils.readField(chunker,
                            "mTypeToChar", true);
                    for (String tag : typeToChar.keySet()) {
                        tags.add(tag);
                    }
                }
                addTagset(tags);

                if (printTagSet) {
                    getContext().getLogger().log(INFO, tags.toString());
                }

                return chunker;
            }
        };

        mappingProvider = new MappingProvider();
        mappingProvider
                .setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/lingpipe/lib/ner-default-variants.map");
        mappingProvider.setDefault(MappingProvider.LOCATION,
                "classpath:/de/tudarmstadt/ukp/dkpro/" + "core/lingpipe/lib/ner-${language}-${variant}.map");
        mappingProvider.setDefault(MappingProvider.BASE_TYPE, NamedEntity.class.getName());
        mappingProvider.setOverride(MappingProvider.LOCATION, mappingLocation);
        mappingProvider.setOverride(MappingProvider.LANGUAGE, language);
        mappingProvider.setOverride(MappingProvider.VARIANT, variant);
    }

    /**
     * Runs the chunker over the document text and adds one {@link NamedEntity} annotation per
     * chunk, using the chunk's character offsets and storing the chunk type as the annotation
     * value. A CAS without document text is left untouched.
     *
     * @param aJCas
     *            the CAS to annotate.
     * @throws AnalysisEngineProcessException
     *             declared by the overridden method.
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        CAS cas = aJCas.getCas();
        modelProvider.configure(cas);
        mappingProvider.configure(cas);

        String text = cas.getDocumentText();
        if (text == null) {
            // A CAS without document text has nothing to chunk.
            return;
        }

        Chunking chunking = modelProvider.getResource().chunk(text);

        // get the named entities and their character offsets
        for (Chunk namedEntity : chunking.chunkSet()) {
            Type type = mappingProvider.getTagType(namedEntity.type());
            NamedEntity neAnno = (NamedEntity) cas.createAnnotation(type, namedEntity.start(), namedEntity.end());
            neAnno.setValue(namedEntity.type());
            neAnno.addToIndexes();
        }
    }

    /**
     * Removes the first matching entry of {@link #HMM_STATE_PREFIXES} from an HMM state name.
     * The prefix is only removed when something remains after it, so a state that consists of
     * nothing but a prefix is returned unchanged instead of causing an out-of-bounds substring.
     *
     * @param aState
     *            the state name as stored in the HMM state symbol table, may be {@code null}.
     * @return the state name without its prefix, the unchanged state name if no prefix applies,
     *         or {@code null} if {@code aState} is {@code null}.
     */
    private static String stripHmmStatePrefix(String aState) {
        if (aState == null) {
            return null;
        }
        for (String prefix : HMM_STATE_PREFIXES) {
            if (aState.length() > prefix.length() && aState.startsWith(prefix)) {
                return aState.substring(prefix.length());
            }
        }
        return aState;
    }
}