de.linguatools.disco.Compositionality.java Source code


Introduction

Here is the source code for de.linguatools.disco.Compositionality.java

Source

/*******************************************************************************
 *   Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013 Peter Kolb
 *   peter.kolb@linguatools.org
 *
 *   Licensed under the Apache License, Version 2.0 (the "License"); you may not
 *   use this file except in compliance with the License. You may obtain a copy
 *   of the License at 
 *   
 *        http://www.apache.org/licenses/LICENSE-2.0 
 *
 *   Unless required by applicable law or agreed to in writing, software 
 *   distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *   WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 *   License for the specific language governing permissions and limitations
 *   under the License.
 *
 ******************************************************************************/

package de.linguatools.disco;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;

/**
 * This class contains support for compositional distributional semantics.
 * There are methods to compute the similarity between multi-word terms,
 * phrases, sentences, or even paragraphs by composing the vectors of the
 * individual words.
 * @author peter
 * @version 1.4
 */
public class Compositionality {

    /**
     * Implemented methods of vector composition.
     */
    public enum VectorCompositionMethod {
        /**
         * Simple vector addition.
         */
        ADDITION,
        /**
         * Entry-wise multiplication.
         */
        MULTIPLICATION,
        /**
         * Parameterized combination of addition and multiplication, see 
         * equation (11) in J. Mitchell and M. Lapata: 
         * <a href="http://aclweb.org/anthology-new/P/P08/P08-1028.pdf">Vector-based Models of
         * Semantic Composition</a>. Proceedings of ACL-08: HLT.<br/>
         * Compose vectors wv1 and wv2 by a combination of addition and 
         * multiplication:
         * <blockquote>p = alpha*wv1 + beta*wv2 + gamma*wv1*wv2</blockquote>
         * The contribution of multiplication and addition, as well
         * as the contribution of each of the two vectors can be controlled by the
         * three parameters alpha, beta and gamma.<br/>
         * For instance, in Mitchell and Lapata 2008 where wv1 is a verb and wv2 is
         * a noun, the parameters alpha, beta and gamma are set as follows:
         * <blockquote>alpha = 0.95<br/>
         * beta = 0<br/>
         * gamma = 0.05.</blockquote>
         * If one of alpha, beta, gamma is null, then these default values are used.
         */
        COMBINED,
        /**
         * Dilate word vector u along the direction of word vector v: 
         * <blockquote>v' = (u*u)v + (lambda-1)(u*v)u</blockquote>
         * where * is the dot product (scalar product).<br/>
         * The default value for lambda (used if lambda is null) is 2.0.<br/>
         * Contrary to the other composition methods, this operation is not
         * symmetric.<br/>
         * See chapter 4 of J. Mitchell: Composition in Distributional Models of
         * Semantics. PhD, Edinburgh, 2011.
         */
        DILATION;
    }
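
    /*
     * Worked example (illustration only): for two tiny vectors
     * wv1 = {a:2, b:1} and wv2 = {b:3, c:4} the methods yield
     *
     *   ADDITION:       {a:2, b:4, c:4}
     *   MULTIPLICATION: {b:3}                    (only shared features survive)
     *   COMBINED:       0.95*wv1 + 0*wv2 + 0.05*wv1*wv2 = {a:1.9, b:1.1, c:0.0}
     *                                            (with the default parameters)
     *   DILATION:       (wv1*wv1)wv2 + (lambda-1)(wv1*wv2)wv1
     *                   = 5*{b:3, c:4} + 1*3*{a:2, b:1} = {a:6, b:18, c:20}
     *                                            (with the default lambda = 2)
     */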

    /**
     * Available measures for vector comparison. 
     */
    public enum SimilarityMeasures {
        /**
         * The well-known cosine similarity measure.
         */
        COSINE,
        /**
         * The similarity measure described in the paper <a 
         * href="http://hdl.handle.net/10062/9731">Experiments on the difference
         * between semantic similarity and relatedness</a>. This is DISCO's
         * default measure of semantic similarity.
         */
        KOLB
    }

    /***************************************************************************
     * Returns the collocations with their exact positions and their significance
     * values -- in other words the word vector representing the input word.
     * @param word input word (a single token, must not contain white space)
     * @param disco DISCO word space to look the word up in
     * @return HashMap containing word vector or null
     * @throws java.io.IOException
     */
    private HashMap<String, Float> getWordvector(String word, DISCO disco) throws IOException {

        Document doc = disco.searchIndex(word);
        if (doc == null)
            return null;
        // fetch the compressed contents of the fields "kol1"/"kol1Sig"
        // through "kol6"/"kol6Sig"
        HashMap<String, Float> result = new HashMap<>();
        String[] wordsBuffer;
        String[] valuesBuffer;
        for (int rel = 1; rel <= 6; rel++) {
            wordsBuffer = doc.get("kol" + Integer.toString(rel)).split(" ");
            valuesBuffer = doc.get("kol" + Integer.toString(rel) + "Sig").split(" ");
            // BUG: TODO: in the index the kol fields start with a leading space,
            // which is why the loop below starts at i = 1!
            for (int i = 1; i < wordsBuffer.length; i++) {
                // store word+relation --> value
                result.put(wordsBuffer[i] + Integer.toString(rel), Float.parseFloat(valuesBuffer[i]));
            }
        }
        return result;
    }
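
    /*
     * Note on the returned data structure: each key of the HashMap encodes a
     * feature as collocate word plus relation number (1 to 6), the value is
     * the significance score. A purely hypothetical vector for "coffee" could
     * therefore look like {"cup1": 4.2, "drink3": 2.7, "hot2": 1.9}.
     */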

    /**
     * Compute the dot product (inner product, scalar product) of wv1 and wv2.
     * @param wv1 first word vector
     * @param wv2 second word vector
     * @return result (a scalar, not a vector)
     */
    private float computeDotProduct(HashMap<String, Float> wv1, HashMap<String, Float> wv2) {

        float sp = 0.0F;

        for (String w : wv1.keySet()) {
            if (wv2.containsKey(w)) {
                sp += wv1.get(w) * wv2.get(w);
            }
        }
        return sp;
    }

    /**
     * The following formula is used:
     * <blockquote>(wv1*wv1)wv2 + (lambda-1)(wv1*wv2)wv1</blockquote>
     * The default value for lambda (used if lambda is null) is 2.0.<br/>
     * This composition method only works with the SimilarityMeasures.COSINE
     * similarity measure. 
     * @param wv1 first word vector
     * @param wv2 second word vector
     * @param lambda dilation factor; if <code>null</code>, the default 2.0 is used
     * @return the combined word vector
     */
    private HashMap<String, Float> composeVectorsByDilation(HashMap<String, Float> wv1, HashMap<String, Float> wv2,
            Float lambda) {

        if (lambda == null)
            lambda = 2.0F;

        // documented formula: (wv1*wv1)wv2 + (lambda-1)(wv1*wv2)wv1,
        // where * between two vectors denotes the dot product
        float uu = computeDotProduct(wv1, wv1);
        float uv = computeDotProduct(wv1, wv2);
        HashMap<String, Float> f1 = multiplicateWordVectorWithScalar(wv2, uu);
        HashMap<String, Float> f2 = multiplicateWordVectorWithScalar(wv1, uv * (lambda - 1));
        return composeVectorsByAddition(f1, f2);
    }
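
    /*
     * Unlike addition and multiplication this operation is not symmetric
     * (illustration only): with wv1 = {a:2, b:1} and wv2 = {b:3, c:4},
     * composing (wv1, wv2) yields {a:6, b:18, c:20}, whereas composing
     * (wv2, wv1) yields {a:50, b:34, c:12} (lambda = 2 in both cases).
     */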

    /**
     * Multiply all values in the word vector hash by the given scalar. The
     * input vector is not modified.
     * @param wv word vector
     * @param scalar scalar factor
     * @return a new word vector with every value multiplied by <code>scalar</code>
     */
    private HashMap<String, Float> multiplicateWordVectorWithScalar(HashMap<String, Float> wv, float scalar) {

        // build a new map so that the caller's vector is left untouched
        HashMap<String, Float> result = new HashMap<>();
        for (String w : wv.keySet()) {
            result.put(w, wv.get(w) * scalar);
        }
        return result;
    }

    /**
     * Compose vectors wv1 and wv2 by a combination of addition and 
     * multiplication:
     * <blockquote>p = alpha*wv1 + beta*wv2 + gamma*wv1*wv2</blockquote>
     * The contribution of multiplication and addition, as well
     * as the contribution of each of the two vectors can be controlled by the
     * three parameters alpha, beta and gamma.<br/>
     * For instance, in Mitchell and Lapata 2008 where wv1 is a verb and wv2 is
     * a noun, the parameters alpha, beta and gamma are set as follows:
     * <blockquote>alpha = 0.95<br/>
     * beta = 0<br/>
     * gamma = 0.05.</blockquote>
     * If one of alpha, beta, gamma is null, then these default values are used.
     * @param wv1 first word vector
     * @param wv2 second word vector
     * @param alpha weight of additive contribution of first word vector
     * @param beta weight of additive contribution of second word vector
     * @param gamma weight of multiplicative contribution of both word vectors
     * @return the combined word vector
     */
    private HashMap<String, Float> composeVectorsByCombinedMultAdd(HashMap<String, Float> wv1,
            HashMap<String, Float> wv2, Float alpha, Float beta, Float gamma) {

        if (alpha == null || beta == null || gamma == null) {
            alpha = 0.95F;
            beta = 0.0F;
            gamma = 0.05F;
        }

        // Formula: result = a*wv1 + b*wv2 + c*wv1*wv2
        // m = wv1 * wv2
        HashMap<String, Float> m = composeVectorsByMultiplication(wv1, wv2);
        // m = c * m
        m = multiplicateWordVectorWithScalar(m, gamma);
        // k = a * wv1
        HashMap<String, Float> k = multiplicateWordVectorWithScalar(wv1, alpha);
        // l = b * wv2
        HashMap<String, Float> l = multiplicateWordVectorWithScalar(wv2, beta);
        // result = k + l + m
        return composeVectorsByAddition(composeVectorsByAddition(k, l), m);
    }
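
    /*
     * Two boundary cases that follow directly from the formula above
     * (up to zero-valued entries): alpha = beta = 1 and gamma = 0 reduces
     * the method to plain ADDITION, alpha = beta = 0 and gamma = 1 reduces
     * it to plain MULTIPLICATION.
     */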

    /**
     * Combines two word vectors by multiplication.
     * @param wv1 word vector #1
     * @param wv2 word vector #2
     * @return the combined word vector
     */
    private HashMap<String, Float> composeVectorsByMultiplication(HashMap<String, Float> wv1,
            HashMap<String, Float> wv2) {

        HashMap<String, Float> result = new HashMap<>();

        for (String feature : wv1.keySet()) {
            if (wv2.containsKey(feature)) {
                result.put(feature, wv1.get(feature) * wv2.get(feature));
            }
        }
        return result;
    }

    /**
     * Combines two word vectors by addition.
     * @param wv1 first word vector
     * @param wv2 second word vector
     * @return the combined word vector
     */
    private HashMap<String, Float> composeVectorsByAddition(HashMap<String, Float> wv1,
            HashMap<String, Float> wv2) {

        HashMap<String, Float> result = new HashMap<>();

        // copy those features from wv1 to result that do not occur in wv2
        for (String w : wv1.keySet()) {
            if (!wv2.containsKey(w)) {
                result.put(w, wv1.get(w));
            }
        }

        // run through wv2 and add all common features of wv1 and wv2 to result,
        // and also those features from wv2 that do not occur in wv1
        for (String w : wv2.keySet()) {
            if (wv1.containsKey(w)) {
                result.put(w, wv1.get(w) + wv2.get(w));
            } else {
                result.put(w, wv2.get(w));
            }
        }

        return result;
    }
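
    /*
     * In terms of feature sets: addition keeps the union of the two vectors'
     * features, whereas the entry-wise multiplication above keeps only their
     * intersection.
     */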

    /**
     * Compose two word vectors by the composition method given in 
     * <code>compositionMethod</code>.
     * @param wv1 word vector #1
     * @param wv2 word vector #2
     * @param compositionMethod One of the methods in 
     * <code>VectorCompositionMethod</code>.
     * @param alpha weight of the first vector; only used with <code>COMBINED</code>, may be <code>null</code>
     * @param beta weight of the second vector; only used with <code>COMBINED</code>, may be <code>null</code>
     * @param gamma weight of the multiplicative part; only used with <code>COMBINED</code>, may be <code>null</code>
     * @param lambda dilation factor; only used with <code>DILATION</code>, may be <code>null</code>
     * @return the resulting word vector or <code>null</code>.
     */
    public HashMap<String, Float> composeWordVectors(HashMap<String, Float> wv1, HashMap<String, Float> wv2,
            VectorCompositionMethod compositionMethod, Float alpha, Float beta, Float gamma, Float lambda) {

        if (compositionMethod == VectorCompositionMethod.ADDITION) {
            return composeVectorsByAddition(wv1, wv2);
        } else if (compositionMethod == VectorCompositionMethod.MULTIPLICATION) {
            return composeVectorsByMultiplication(wv1, wv2);
        } else if (compositionMethod == VectorCompositionMethod.COMBINED) {
            return composeVectorsByCombinedMultAdd(wv1, wv2, alpha, beta, gamma);
        } else if (compositionMethod == VectorCompositionMethod.DILATION) {
            return composeVectorsByDilation(wv1, wv2, lambda);
        } else {
            return null;
        }
    }
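
    /*
     * Usage sketch (illustration only; how the word vectors v1, v2 and v3 are
     * obtained from a DISCO word space is omitted here):
     *
     *   Compositionality c = new Compositionality();
     *   HashMap<String, Float> composed = c.composeWordVectors(v1, v2,
     *           VectorCompositionMethod.ADDITION, null, null, null, null);
     *   float sim = c.semanticSimilarity(composed, v3, SimilarityMeasures.KOLB);
     */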

    /**
     * Compose two or more word vectors by the composition method given in 
     * <code>compositionMethod</code>.
     * @param wordvectorList a list of word vectors to be combined. The list has
     * to have at least two elements. For the symmetric methods ADDITION and
     * MULTIPLICATION the ordering of the list has no influence on the result.
     * @param compositionMethod One of the methods in 
     * <code>VectorCompositionMethod</code>.
     * @param alpha see <code>VectorCompositionMethod.COMBINED</code>; may be <code>null</code>
     * @param beta see <code>VectorCompositionMethod.COMBINED</code>; may be <code>null</code>
     * @param gamma see <code>VectorCompositionMethod.COMBINED</code>; may be <code>null</code>
     * @param lambda see <code>VectorCompositionMethod.DILATION</code>; may be <code>null</code>
     * @return the resulting word vector or <code>null</code>.
     */
    public HashMap<String, Float> composeWordVectors(ArrayList<HashMap<String, Float>> wordvectorList,
            VectorCompositionMethod compositionMethod, Float alpha, Float beta, Float gamma, Float lambda) {

        if (wordvectorList.size() < 2)
            return null;

        // combine the first two vectors in the list
        HashMap<String, Float> wv = composeWordVectors(wordvectorList.get(0), wordvectorList.get(1),
                compositionMethod, alpha, beta, gamma, lambda);

        for (int i = 2; i < wordvectorList.size(); i++) {
            wv = composeWordVectors(wv, wordvectorList.get(i), compositionMethod, alpha, beta, gamma, lambda);
        }
        return wv;
    }

    /**
     * Utility function. Prints the word vector to standard output.
     * @param wordvector 
     */
    public void printWordVector(HashMap<String, Float> wordvector) {

        for (String w : wordvector.keySet()) {
            System.out.println(w + "\t" + wordvector.get(w));
        }
    }

    /**
     * This method compares two word vectors using the similarity measure
     * SimilarityMeasures.KOLB that is described in the paper
     * <blockquote>Peter Kolb. <a href="http://hdl.handle.net/10062/9731">Experiments
     * on the difference between semantic similarity and relatedness</a>. In 
     * <i>Proceedings of the <a href="http://beta.visl.sdu.dk/nodalida2009/">17th
     * Nordic Conference on Computational Linguistics - NODALIDA '09</a></i>, 
     * Odense, Denmark, May 2009.</blockquote>
     * @param wv1 a word vector
     * @param wv2 another word vector
     * @return the similarity between the two word vectors; a value between 0.0F
     * and 1.0F.
     */
    private float computeSimilarityKolb(HashMap<String, Float> wv1, HashMap<String, Float> wv2) {

        // denominator: sum over all values of wv1 (wv2 is added in the loop below)
        float nenner = 0;
        for (float v : wv1.values()) {
            nenner += v;
        }

        // numerator: sum over the values of all features shared by wv1 and wv2
        float zaehler = 0;
        for (String w : wv2.keySet()) {
            float v = wv2.get(w);
            if (wv1.containsKey(w)) {
                zaehler += (v + wv1.get(w));
            }
            nenner += v;
        }
        return 2 * zaehler / nenner; // Dice coefficient!
    }
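
    /*
     * Worked example (illustration only): for wv1 = {a:2, b:1} and
     * wv2 = {b:3, c:4} the only shared feature is b, so zaehler = 1 + 3 = 4
     * and nenner = (2 + 1) + (3 + 4) = 10, giving 2 * 4 / 10 = 0.8.
     */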

    /**
     * This method compares two word vectors using the similarity measure
     * SimilarityMeasures.COSINE.
     * @param wv1 a word vector
     * @param wv2 another word vector
     * @return the similarity between the two word vectors; a value between -1.0F
     * and 1.0F.
     */
    private float computeSimilarityCosine(HashMap<String, Float> wv1, HashMap<String, Float> wv2) {

        // squared norm of wv1
        float nenner1 = 0.0F;
        for (float v : wv1.values()) {
            nenner1 += v * v;
        }

        // squared norm of wv2 and dot product of wv1 and wv2
        float nenner2 = 0, zaehler = 0;
        for (String w : wv2.keySet()) {
            float v = wv2.get(w);
            if (wv1.containsKey(w)) {
                zaehler += (v * wv1.get(w));
            }
            nenner2 += v * v;
        }
        return (float) (zaehler / Math.sqrt(nenner1 * nenner2));
    }

    /**
     * Computes the semantic similarity (according to the vector similarity 
     * measure <code>simMeasure</code>) between the two input word 
     * vectors.<br/>
     * @param wordvector1 first word vector
     * @param wordvector2 second word vector
     * @param simMeasure One of the similarity measures enumerated in
     * <code>Compositionality.SimilarityMeasures</code>.
     * @return The similarity between the two input word vectors; depending on
     * the chosen similarity measure a value between 0.0F and 1.0F, or -1.0F and 
     * 1.0F. In case the <code>simMeasure</code> is unknown the return
     * value is -3.0F.
     */
    public float semanticSimilarity(HashMap<String, Float> wordvector1, HashMap<String, Float> wordvector2,
            SimilarityMeasures simMeasure) {

        if (simMeasure == SimilarityMeasures.KOLB) {
            return computeSimilarityKolb(wordvector1, wordvector2);
        } else if (simMeasure == SimilarityMeasures.COSINE) {
            return computeSimilarityCosine(wordvector1, wordvector2);
        } else {
            return -3.0F;
        }
    }

    /***************************************************************************
     * Computes the semantic similarity (according to the vector similarity 
     * measure <code>SimilarityMeasures.KOLB</code> which is described in 
     * <a href="http://hdl.handle.net/10062/9731">Kolb 2009</a>) between the 
     * two input word vectors.
     * @param wordvector1 first word vector
     * @param wordvector2 second word vector
     * @return The similarity between the two input word vectors; a value
     * between 0.0F and 1.0F.
     */
    public float semanticSimilarity(HashMap<String, Float> wordvector1, HashMap<String, Float> wordvector2) {

        return computeSimilarityKolb(wordvector1, wordvector2);
    }

    /**
     * This method computes the semantic similarity between two multi-word terms,
     * phrases, sentences or paragraphs. This is done by composition of the word
     * vectors of the constituent words.<br/>
     * Each of the two input strings is split at whitespace, and the wordvectors
     * of the individual tokens (constituent words) are retrieved. Then the
     * word vectors are combined using the method <code>composeWordVectors()</code>.
     * The two resulting vectors are then compared with 
     * <code>Compositionality.semanticSimilarity()</code>.
     * @param multiWords1 a tokenized string containing a multi-word term, phrase,
     * sentence or paragraph.
     * @param multiWords2 a tokenized string containing a multi-word term, phrase,
     * sentence or paragraph.
     * @param compositionMethod vector composition method to use
     * @param simMeasure vector similarity measure to use
     * @param disco a DISCO word space
     * @param alpha The parameters <code>alpha</code>, <code>beta</code> and 
     * <code>gamma</code> are used in the composition method 
     * <code>VectorCompositionMethod.COMBINED</code> (see there for more 
     * information). If one of the values is
     * <code>null</code> default values are used.
     * @param beta See description for parameter <code>alpha</code>!
     * @param gamma See description for parameter <code>alpha</code>!
     * @param lambda Used by <code>VectorCompositionMethod.DILATION</code> (see
     * there for more information). If
     * <code>lambda</code> is <code>null</code>, a default value is used.
     * @return the distributional similarity between <code>multiWords1</code> and
     * <code>multiWords2</code>.
     * @throws java.io.IOException
     * @see de.linguatools.disco.Compositionality.VectorCompositionMethod
     * @see de.linguatools.disco.Compositionality.SimilarityMeasures
     */
    public float compositionalSemanticSimilarity(String multiWords1, String multiWords2,
            VectorCompositionMethod compositionMethod, SimilarityMeasures simMeasure, DISCO disco, Float alpha,
            Float beta, Float gamma, Float lambda) throws IOException {

        multiWords1 = multiWords1.trim();
        multiWords2 = multiWords2.trim();
        String[] multi1 = multiWords1.split("\\s+");
        String[] multi2 = multiWords2.split("\\s+");

        // compute word vector #1
        HashMap<String, Float> wv1 = new HashMap<>();
        if (multi1.length == 1) {
            wv1 = getWordvector(multi1[0], disco);
        } else if (multi1.length == 2) {
            wv1 = composeWordVectors(getWordvector(multi1[0], disco), getWordvector(multi1[1], disco),
                    compositionMethod, alpha, beta, gamma, lambda);
        } else {
            wv1 = composeWordVectors(getWordvector(multi1[0], disco), getWordvector(multi1[1], disco),
                    compositionMethod, alpha, beta, gamma, lambda);
            for (int i = 2; i < multi1.length; i++) {
                wv1 = composeWordVectors(wv1, getWordvector(multi1[i], disco), compositionMethod, alpha, beta,
                        gamma, lambda);
            }
        }

        // compute word vector #2
        HashMap<String, Float> wv2 = new HashMap<>();
        if (multi2.length == 1) {
            wv2 = getWordvector(multi2[0], disco);
        } else if (multi2.length == 2) {
            wv2 = composeWordVectors(getWordvector(multi2[0], disco), getWordvector(multi2[1], disco),
                    compositionMethod, alpha, beta, gamma, lambda);
        } else {
            wv2 = composeWordVectors(getWordvector(multi2[0], disco), getWordvector(multi2[1], disco),
                    compositionMethod, alpha, beta, gamma, lambda);
            for (int i = 2; i < multi2.length; i++) {
                wv2 = composeWordVectors(wv2, getWordvector(multi2[i], disco), compositionMethod, alpha, beta,
                        gamma, lambda);
            }
        }

        // compute similarity between the two word vectors
        return semanticSimilarity(wv1, wv2, simMeasure);
    }
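
    /*
     * Usage sketch (illustration only; "red car" and "automobile" are
     * hypothetical inputs that are assumed to be covered by the word space):
     *
     *   DISCO disco = ... // open a DISCO word space
     *   Compositionality c = new Compositionality();
     *   float sim = c.compositionalSemanticSimilarity("red car", "automobile",
     *           VectorCompositionMethod.ADDITION, SimilarityMeasures.KOLB,
     *           disco, null, null, null, null);
     */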

    /**
     * Find the most similar words in the DISCO word space for an input word 
     * vector. While the word vector can represent a multi-token word (if it was
     * produced by one of the methods 
     * <code>Compositionality.composeWordVectors()</code>) the most
     * similar words will only be single-token words from the index.<br/>
     * <b>Warning</b>: This method is very time consuming and should only be
     * used with word spaces that have been loaded into memory!
     * @param wordvector input word vector
     * @param disco DISCO word space
     * @param simMeasure vector similarity measure to use
     * @return List of all words (with their similarity values) whose similarity
     * with the <code>wordvector</code> is greater than zero, ordered by 
     * similarity value (highest value first).
     * @throws java.io.IOException
     */
    public ArrayList<ReturnDataCol> similarWords(HashMap<String, Float> wordvector, DISCO disco,
            SimilarityMeasures simMeasure) throws IOException {

        // get an IndexReader for the index directory
        IndexReader ir = disco.getIndexReader();

        // iterate over all documents in the index
        ArrayList<ReturnDataCol> result = new ArrayList<>();
        for (int i = 0; i < ir.numDocs(); i++) {
            Document doc = null;
            try {
                doc = ir.document(i);
            } catch (CorruptIndexException ex) {
                continue;
            } catch (IOException ex) {
                continue;
            }
            // fetch the word vector for word no. i
            String word = doc.get("word");
            HashMap<String, Float> wv = getWordvector(word, disco);
            // compute the similarity between the two word vectors
            float sim = semanticSimilarity(wordvector, wv, simMeasure);
            if (sim > 0.0F) {
                ReturnDataCol r = new ReturnDataCol(word, sim);
                result.add(r);
            }
        }

        // sort so that the highest similarity value comes first
        Collections.sort(result, new ValueComparator());

        return result;
    }
}
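
Example

The following sketch shows one way the public API of this class might be used. It is an illustration, not part of the library: the example class, the words "red car" and "automobile", and the assumption that all query tokens are contained in the word space are hypothetical; opening the DISCO word space itself is left to the caller because the exact constructor depends on the DISCO version.

package de.linguatools.disco;

import java.io.IOException;

/**
 * Hypothetical usage sketch for Compositionality (not part of the library).
 */
public class CompositionalityExample {

    /**
     * Compares the two-word phrase "red car" with the single word "automobile"
     * using vector addition and DISCO's default similarity measure KOLB.
     * @param disco an already opened DISCO word space
     * @return the compositional semantic similarity of the two inputs
     * @throws java.io.IOException
     */
    public static float phraseSimilarity(DISCO disco) throws IOException {

        Compositionality c = new Compositionality();
        return c.compositionalSemanticSimilarity("red car", "automobile",
                Compositionality.VectorCompositionMethod.ADDITION,
                Compositionality.SimilarityMeasures.KOLB,
                disco, null, null, null, null);
    }
}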