hu.ppke.itk.nlpg.purepos.model.internal.HashSuffixGuesser.java Source code

Java tutorial

Introduction

Here is the source code for hu.ppke.itk.nlpg.purepos.model.internal.HashSuffixGuesser.java

Source

/*******************************************************************************
 * Copyright (c) 2012 György Orosz, Attila Novák.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser Public License v3
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/
 * 
 * This file is part of PurePos.
 * 
 * PurePos is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * PurePos is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 * 
 * Contributors:
 *     György Orosz - initial API and implementation
 ******************************************************************************/
package hu.ppke.itk.nlpg.purepos.model.internal;

import hu.ppke.itk.nlpg.purepos.model.ITagMapper;
import hu.ppke.itk.nlpg.purepos.model.SuffixGuesser;

import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang3.tuple.MutablePair;

/**
 * Suffix guesser implementation for String suffixes with a hash table
 * representation. This representation may save some memory but needs a
 * bit more time to calculate the probability for a word and a tag.
 * 
 * @author György Orosz
 * 
 * @param <T>
 *            type of the tags
 */
// TODO: is it worth to create an implementation which recalculates the
// probabilities for all the suffixes?
public class HashSuffixGuesser<T> extends SuffixGuesser<String, T> {
    /** Serialization version identifier. */
    private static final long serialVersionUID = -8813089059654810794L;

    /**
     * Suffix statistics: maps a suffix to a pair whose left side holds the
     * per-tag counts and whose right side is the count each per-tag count is
     * divided by when a relative frequency is computed.
     */
    private final HashMap<String, MutablePair<HashMap<T, Integer>, Integer>> freqTable;
    /** Weight of a suffix's relative frequency in every interpolation step. */
    private final double theta;
    /** Cached {@code theta + 1}, the divisor of every interpolation step. */
    private final double thetaPlusOne;
    /**
     * Optional tag mapper. When set, {@code getTagProbability} maps the
     * requested tag through it before the lookup.
     */
    protected ITagMapper<T> mapper = null;
    /** Neither read nor written by this class. */
    protected String lemmaMapper = null;

    /**
     * Sets the mapper applied to tags in {@code getTagProbability}.
     * 
     * @param mapper
     *            the mapper to use, or {@code null} to disable mapping
     */
    @Override
    public void setTagMapper(ITagMapper<T> mapper) {
        this.mapper = mapper;
    }

    /**
     * Creates a guesser over an already built suffix frequency table. The
     * table is stored by reference, it is not copied.
     * 
     * @param freqTable
     *            suffix to (per-tag counts, total count) table
     * @param theta
     *            interpolation weight
     */
    HashSuffixGuesser(HashMap<String, MutablePair<HashMap<T, Integer>, Integer>> freqTable,
            double theta) {
        this.freqTable = freqTable;
        this.theta = theta;
        this.thetaPlusOne = theta + 1;
    }

    /**
     * Returns the natural logarithm of every probability produced by
     * {@link #getTagProbabilities(String)} for the word.
     * 
     * @param word
     *            the word to analyse
     * @return a new map from tag to log probability
     */
    @Override
    public Map<T, Double> getTagLogProbabilities(String word) {
        return toLogProbabilities(getTagProbabilities(word));
    }

    /**
     * Currently computes exactly the same values as
     * {@link #getTagLogProbabilities(String)}; no extra smoothing step is
     * applied.
     * 
     * @param word
     *            the word to analyse
     * @return a new map from tag to log probability
     */
    @Override
    public Map<T, Double> getSmoothedTagLogProbabilities(String word) {
        return toLogProbabilities(getTagProbabilities(word));
    }

    /**
     * Copies a probability map into a new map holding the natural logarithm
     * of each value.
     */
    private Map<T, Double> toLogProbabilities(Map<T, Double> probs) {
        // Default capacity on purpose: the capacity influences HashMap
        // iteration order, which callers may observe.
        Map<T, Double> logProbs = new HashMap<T, Double>();
        for (Map.Entry<T, Double> entry : probs.entrySet()) {
            logProbs.put(entry.getKey(), Math.log(entry.getValue()));
        }
        return logProbs;
    }

    /**
     * One interpolation step: merges the value accumulated so far with the
     * relative frequency observed for the current suffix.
     */
    private double interpolate(double accumulated, double relFreq) {
        return (accumulated + (relFreq * theta)) / thetaPlusOne;
    }

    /**
     * Calculates a probability for every tag seen with any suffix of the
     * word. Suffixes are visited from the empty suffix up to the whole word;
     * suffixes missing from the frequency table are skipped. For each tag
     * stored with a suffix the running value of the tag (initially 0) is
     * interpolated with the tag's relative frequency for that suffix.
     * 
     * @param word
     *            the word to analyse
     * @return a new map from tag to probability; tags never seen with any
     *         suffix of the word are absent
     */
    public Map<T, Double> getTagProbabilities(String word) {
        Map<T, Double> probs = new HashMap<T, Double>();
        for (int start = word.length(); start >= 0; --start) {
            MutablePair<HashMap<T, Integer>, Integer> suffixValue = freqTable.get(word.substring(start));
            if (suffixValue == null) {
                continue;
            }
            for (Map.Entry<T, Integer> entry : suffixValue.getLeft().entrySet()) {
                T tag = entry.getKey();
                double relFreq = entry.getValue().doubleValue() / suffixValue.getRight();
                Double previous = probs.get(tag);
                double accumulated = 0.0;
                if (previous != null) {
                    accumulated = previous;
                }
                probs.put(tag, interpolate(accumulated, relFreq));
            }
        }
        return probs;
    }

    /**
     * Returns the natural logarithm of
     * {@link #getTagProbability(Object, Object)}.
     * 
     * @param word
     *            the word to analyse
     * @param tag
     *            the tag whose log probability is requested
     * @return the log probability ({@code -Infinity} for probability 0)
     */
    @Override
    public double getTagLogProbability(String word, T tag) {
        return Math.log(getTagProbability(word, tag));
    }

    /**
     * Returns the probability of the tag for the word as computed by
     * {@link #getTagProbabilities(String)}. If a tag mapper is set, the tag
     * is mapped through it first.
     * 
     * @param word
     *            the word to analyse
     * @param tag
     *            the tag whose probability is requested
     * @return the probability, or 0 if the (mapped) tag has no entry
     */
    @Override
    public double getTagProbability(String word, T tag) {
        T lookupTag = tag;
        if (mapper != null) {
            lookupTag = mapper.map(tag);
        }
        // TODO: are you sure to calculate with the empty suffix as well?
        // (Brants does this, but how about Halacsy?)
        Double prob = getTagProbabilities(word).get(lookupTag);
        if (prob == null) {
            return 0.0;
        }
        return prob;
    }

    /**
     * Interpolates the relative frequencies of one tag over the suffixes
     * {@code word.substring(start)}, {@code word.substring(start - 1)}, ...,
     * {@code word.substring(0)}. Suffixes missing from the table are skipped;
     * the walk stops at the first known suffix the tag was not seen with.
     */
    private double accumulateTowardsLongerSuffixes(String word, T tag, int start) {
        double prob = 0.0;
        for (int i = start; i >= 0; --i) {
            MutablePair<HashMap<T, Integer>, Integer> suffixValue = freqTable.get(word.substring(i));
            if (suffixValue == null) {
                continue;
            }
            Integer tagSufFreq = suffixValue.getLeft().get(tag);
            if (tagSufFreq == null) {
                break;
            }
            prob = interpolate(prob, tagSufFreq.doubleValue() / suffixValue.getRight());
        }
        return prob;
    }

    /**
     * Same computation as {@link #getTagProbHunPOS(String, Object)}, but the
     * {@code offset} shortest suffixes are left out.
     * 
     * @param word
     *            the word to analyse
     * @param tag
     *            the tag whose probability is requested
     * @param offset
     *            number of shortest suffixes (starting with the empty one)
     *            to skip
     * @return the interpolated probability
     * @deprecated not used by {@code getTagProbability}
     */
    @Deprecated
    protected double getTagProbBoosted(String word, T tag, Integer offset) {
        return accumulateTowardsLongerSuffixes(word, tag, word.length() - offset);
    }

    /**
     * Interpolates the relative frequencies of a single tag, visiting the
     * suffixes of the word from the empty suffix towards the whole word and
     * stopping at the first known suffix the tag was not seen with.
     * 
     * @param word
     *            the word to analyse
     * @param tag
     *            the tag whose probability is requested
     * @return the interpolated probability
     */
    protected double getTagProbHunPOS(String word, T tag) {
        return accumulateTowardsLongerSuffixes(word, tag, word.length());
    }

    /**
     * Interpolates the relative frequencies of a single tag, visiting the
     * suffixes of the word from the whole word down to the empty suffix.
     * Suffixes that are unknown, or that the tag was not seen with, are
     * skipped without stopping the walk.
     * 
     * @param word
     *            the word to analyse
     * @param tag
     *            the tag whose probability is requested
     * @return the interpolated probability
     * @deprecated not used by {@code getTagProbability}
     */
    @Deprecated
    protected double getTagProbRevHunPOS(String word, T tag) {
        double prob = 0.0;
        for (int i = 0; i <= word.length(); ++i) {
            MutablePair<HashMap<T, Integer>, Integer> suffixValue = freqTable.get(word.substring(i));
            if (suffixValue == null) {
                continue;
            }
            Integer tagSufFreq = suffixValue.getLeft().get(tag);
            if (tagSufFreq != null) {
                prob = interpolate(prob, tagSufFreq.doubleValue() / suffixValue.getRight());
            }
        }
        return prob;
    }

    /**
     * Calculates the probability for a given suffix and tag recursively: the
     * relative frequency of the tag for {@code word.substring(index)} is
     * interpolated with the result computed for the next longer suffix
     * ({@code index - 1}). If the tag was not seen with the suffix, the
     * suffix contributes a relative frequency of 0 and no longer suffix is
     * consulted.
     * 
     * @param word
     *            the word which has the suffix
     * @param index
     *            start position of the suffix within the word
     * @param tag
     *            POS tag
     * @return the interpolated probability; 0 if {@code index} is negative
     *         or the suffix has no entry in the frequency table
     * @deprecated not used by {@code getTagProbability}
     */
    @Deprecated
    protected double getTagProbTnT(String word, int index, T tag) {
        if (index < 0) {
            return 0;
        }
        // Single lookup instead of containsKey + get on the same key.
        MutablePair<HashMap<T, Integer>, Integer> suffixValue = freqTable.get(word.substring(index));
        if (suffixValue == null) {
            return 0;
        }

        Integer tagSufFreq = suffixValue.getLeft().get(tag);
        double tagSufFreqD;
        int nextIndex;
        if (tagSufFreq == null) {
            // A negative index terminates the recursion on the next call.
            nextIndex = -1;
            tagSufFreqD = 0.0;
        } else {
            nextIndex = index - 1;
            tagSufFreqD = tagSufFreq.doubleValue();
        }
        double relFreq = tagSufFreqD / suffixValue.getRight();
        double longerSuffixProb = getTagProbTnT(word, nextIndex, tag);

        return interpolate(longerSuffixProb, relFreq);
    }

    /**
     * Passes the log probabilities of the word to the inherited
     * {@code getMaxProbabilityTag} overload that takes a map.
     * 
     * @param word
     *            the word to analyse
     * @return the result of the inherited overload
     * @deprecated marked deprecated in the original source without a stated
     *             replacement
     */
    @Deprecated
    @Override
    public T getMaxProbabilityTag(String word) {
        return getMaxProbabilityTag(getTagLogProbabilities(word));
    }

    /**
     * @return the string form of the underlying suffix frequency table
     */
    @Override
    public String toString() {
        return freqTable.toString();
    }

    /**
     * @return the tag mapper currently set, or {@code null} if none
     */
    @Override
    public ITagMapper<T> getMapper() {
        return mapper;
    }

}