de.uni_koeln.phil_fak.iv.tm.p4.classification.WekaAdapter.java Source code

Java tutorial

Introduction

Here is the source code for de.uni_koeln.phil_fak.iv.tm.p4.classification.WekaAdapter.java

Source

/**
 * Material for the course 'Text-Mining', University of Cologne.
 * (http://www.spinfo.phil-fak.uni-koeln.de/spinfo-textmining.html)
 * <p/>
 * Copyright (C) 2008-2009 Fabian Steeg
 * <p/>
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 3 of the License, or (at your option) any later
 * version.
 * <p/>
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 * <p/>
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, see <http://www.gnu.org/licenses/>.
 */
package de.uni_koeln.phil_fak.iv.tm.p4.classification;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import weka.classifiers.Classifier;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import de.uni_koeln.phil_fak.iv.ir.p5.features.FeatureVector;
import de.uni_koeln.phil_fak.iv.tm.p1.corpus.Corpus;
import de.uni_koeln.phil_fak.iv.tm.p1.corpus.Document;
import de.uni_koeln.phil_fak.iv.tm.p3.classification.ClassifierStrategy;

/**
 * Adapter that exposes a Weka {@link Classifier} through the
 * {@link ClassifierStrategy} interface.
 * <p/>
 * Every document is converted into a Weka {@link Instance} whose attribute 0 is
 * the nominal class attribute and whose attributes 1..n carry the values of the
 * document's feature vector. The wrapped classifier is (re)built lazily on the
 * first call to {@link #classify(Document)} after the training set has changed.
 * @author Fabian Steeg (fsteeg)
 */
public class WekaAdapter implements ClassifierStrategy {

    private final Classifier wekaClassifier;
    private final int vectorSize;
    private final List<String> classes;
    private final Instances trainingSet;
    private final Corpus corpus;
    /* True while the wrapped classifier reflects the current training set. */
    private boolean classifierBuilt = false;

    /**
     * Creates the adapter and derives the instance structure (feature vector
     * size and possible class labels) from the given training documents. The
     * documents themselves are not added as training examples here; that
     * happens through {@link #train(Document, String)}.
     * @param wekaClassifier The Weka classifier to adapt
     * @param trainingData The training documents; must contain at least one
     *            document
     * @param corpus The corpus
     * @throws IllegalArgumentException if trainingData is null or empty
     */
    public WekaAdapter(Classifier wekaClassifier, Set<Document> trainingData, Corpus corpus) {
        if (trainingData == null || trainingData.isEmpty()) {
            throw new IllegalArgumentException(
                    "trainingData must contain at least one document to derive the feature vector size");
        }
        this.wekaClassifier = wekaClassifier;
        this.corpus = corpus;
        // Weka needs a few things up front:
        // 1. The size of the feature vector (taken from an arbitrary document):
        FeatureVector vector = trainingData.iterator().next().getVector(corpus);
        this.vectorSize = vector.getValues().size();
        // 2. The possible classes:
        this.classes = collectClasses(trainingData);
        // 3. The structure of the training data:
        this.trainingSet = initTrainingSet();
    }

    /**
     * Collects the distinct topics of the given documents.
     * @param trainingData The training documents
     * @return The distinct topics, in the iteration order of the intermediate
     *         hash set. The same list defines the nominal class attribute and
     *         maps predicted indices back to labels, so both stay consistent.
     */
    private List<String> collectClasses(Set<Document> trainingData) {
        Set<String> classes = new HashSet<String>();
        for (Document document : trainingData) {
            classes.add(document.getTopic());
        }
        return new ArrayList<String>(classes);
    }

    /**
     * Builds the empty training set that describes the instance structure.
     * @return An empty instance container with the class attribute at index 0
     *         followed by one numeric attribute per feature vector position
     */
    private Instances initTrainingSet() {
        /* This FastVector holds the attributes (features): */
        FastVector structureVector = new FastVector(vectorSize + 1);
        /* In Weka the class is described by an attribute as well: */
        FastVector classesVector = new FastVector(this.classes.size());
        for (String c : classes) {
            /*
             * The class is not numeric, so all possible values have to be
             * declared:
             */
            classesVector.addElement(c);
        }
        /* Position 0 of the overall structure holds the class attribute: */
        structureVector.addElement(new Attribute("Ressort", classesVector));
        for (int i = 0; i < vectorSize; i++) {
            /*
             * Every position of our feature vector is a numeric feature
             * (represented as an Attribute) that is named after its index.
             * The original author's note suggests the values are TF-IDF
             * weights.
             */
            structureVector.addElement(new Attribute(i + ""));
        }
        /*
         * Finally we create the container for our training examples, which
         * will hold instances of the attributes described above:
         */
        Instances result = new Instances("InstanceStructure", structureVector, vectorSize + 1);
        /*
         * And we have to state at which position of the instances the class
         * can be found:
         */
        result.setClassIndex(0);
        return result;
    }

    /**
     * Adds the document as a training example with the given label and marks
     * the classifier as stale, so it is rebuilt on the next classification.
     * @param document The document to learn from
     * @param label The class label of the document; presumably has to be one
     *            of the topics seen in the constructor's training data, since
     *            only those were declared for the class attribute
     * @return This adapter, to allow chained calls
     * @throws IllegalArgumentException if the document's feature vector does
     *             not have the size the adapter was set up with
     * @see de.uni_koeln.phil_fak.iv.tm.p3.classification.ClassifierStrategy#train(de.uni_koeln.phil_fak.iv.tm.p1.corpus.Document,
     *      java.lang.String)
     */
    @Override
    public ClassifierStrategy train(Document document, String label) {
        trainingSet.add(instance(document, label));
        classifierBuilt = false;
        return this;
    }

    /**
     * Converts a document into a Weka instance bound to the training set
     * structure.
     * @param document The document to convert
     * @param label The class label, or null if the class is unknown (i.e.
     *            during classification)
     * @return The instance for the document
     * @throws IllegalArgumentException if the document's feature vector does
     *             not have the expected size
     */
    private Instance instance(Document document, String label) {
        List<Float> values = document.getVector(corpus).getValues();
        /*
         * Fail fast on a size mismatch: the instance would otherwise not match
         * the attributes declared for the training set.
         */
        if (values.size() != vectorSize) {
            throw new IllegalArgumentException(String.format(
                    "Feature vector has %d values, but the training set structure expects %d",
                    values.size(), vectorSize));
        }
        /* The instance holds all features plus the class (at index 0): */
        double[] vals = new double[vectorSize + 1];
        for (int i = 0; i < vectorSize; i++) {
            vals[i + 1] = values.get(i);
        }
        Instance instance = new Instance(1, vals);
        /*
         * The instance has to know what its values mean, which is what we
         * described in our training set:
         */
        instance.setDataset(trainingSet);
        /*
         * During training we have instances with a known class label, during
         * classification the class is unknown:
         */
        if (label == null) {
            instance.setClassMissing(); // during classification
        } else {
            instance.setClassValue(label); // during training
        }
        return instance;
    }

    /**
     * Classifies the document with the wrapped Weka classifier, building the
     * classifier first if the training set changed since the last build.
     * @param document The document to classify
     * @return The predicted class label, or null if the classifier could not
     *         be built, the classification failed, or the classifier made no
     *         prediction (failures are reported on standard error)
     * @see de.uni_koeln.phil_fak.iv.tm.p3.classification.ClassifierStrategy#classify(de.uni_koeln.phil_fak.iv.tm.p1.corpus.Document)
     */
    @Override
    public String classify(Document document) {
        if (!ensureClassifierBuilt()) {
            /* Never query a classifier whose (re)build just failed. */
            return null;
        }
        try {
            Instance instance = instance(document, null);
            double prediction = wekaClassifier.classifyInstance(instance);
            // Alternative: wekaClassifier.distributionForInstance(instance)
            if (Double.isNaN(prediction)) {
                /*
                 * Weka reports "no prediction" as a missing value (NaN);
                 * casting that to int would silently select class 0.
                 */
                return null;
            }
            return classes.get((int) prediction);
        } catch (Exception e) {
            /* Weka declares a plain Exception, so nothing narrower can be caught. */
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Builds the wrapped classifier from the current training set unless that
     * has already been done since the last change.
     * @return true if the classifier is ready for use, false if building it
     *         failed
     */
    private boolean ensureClassifierBuilt() {
        if (!classifierBuilt) {
            try {
                wekaClassifier.buildClassifier(trainingSet);
                classifierBuilt = true;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return classifierBuilt;
    }

    /**
     * @return The simple names of this adapter and of the wrapped classifier
     */
    @Override
    public String toString() {
        return String.format("%s for %s", getClass().getSimpleName(), wekaClassifier.getClass().getSimpleName());
    }

}