edu.ub.ahstfg.kmeans.document.DocumentCentroid.java Source code

Java tutorial

Introduction

Here is the source code for edu.ub.ahstfg.kmeans.document.DocumentCentroid.java

Source

/*
 * DocumentCentroid.java is part of HHCluster.
 *
 * HHCluster is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HHCluster is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HHCluster.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ub.ahstfg.kmeans.document;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
import org.apache.log4j.Logger;

import edu.ub.ahstfg.io.WritableConverter;
import edu.ub.ahstfg.kmeans.Centroid;
import edu.ub.ahstfg.utils.Metrics;
import edu.ub.ahstfg.utils.Utils;

/**
 * Centroid implementation for document features.
 *
 * <p>A centroid holds two frequency vectors (keywords and terms) stored as
 * {@code short} arrays, plus the distance from the previous centroid. The
 * class is a Hadoop {@link Writable}: {@link #write(DataOutput)} and
 * {@link #readFields(DataInput)} (de)serialize the keyword vector, the term
 * vector and the distance, in that order.
 *
 * @author Alberto Huelamo Segura
 */
public class DocumentCentroid implements Centroid, Writable {

    private static final Logger LOG = Logger.getLogger(DocumentCentroid.class);

    /**
     * Flag value that requests random initialization when passed to
     * {@link #DocumentCentroid(boolean, int, int)}.
     */
    public static final boolean RANDOM = true;

    /**
     * Upper bound handed to {@code Utils.randomIntRange} during random
     * initialization (whether it is inclusive depends on that helper).
     */
    public static final int RANDOM_MAX_RANGE = 10;

    /** Lower bound handed to {@code Utils.randomIntRange} during random initialization. */
    private static final int RANDOM_MIN_RANGE = 5;

    private short[] keywordVector;
    private short[] termVector;

    private double distance; //distance from previous centroid

    /**
     * Default constructor. Creates a centroid with ten keywords and ten
     * terms; {@link #readFields(DataInput)} replaces both vectors when the
     * instance is deserialized.
     */
    public DocumentCentroid() {
        this(10, 10);
    }

    /**
     * Parametrized constructor.
     * @param random True for random init; false leaves both vectors zeroed.
     * @param keywords Number of keywords.
     * @param terms Number of terms.
     */
    public DocumentCentroid(boolean random, int keywords, int terms) {
        this(keywords, terms);
        if (random) {
            for (int i = 0; i < keywords; i++) {
                keywordVector[i] = Utils.randomIntRange(RANDOM_MIN_RANGE, RANDOM_MAX_RANGE);
            }
            for (int i = 0; i < terms; i++) {
                termVector[i] = Utils.randomIntRange(RANDOM_MIN_RANGE, RANDOM_MAX_RANGE);
            }
        }
        distance = 0.0;
    }

    /**
     * Parametrized constructor. Both vectors are allocated and left zeroed.
     * @param keywords Number of keywords.
     * @param terms Number of terms.
     */
    public DocumentCentroid(int keywords, int terms) {
        keywordVector = new short[keywords];
        termVector = new short[terms];
    }

    /**
     * Parametrized constructor. The given arrays are stored as-is (no copy
     * is made), so later changes to them are visible through this centroid.
     * @param keywordVector Keyword frequency.
     * @param termVector Term frequency.
     */
    public DocumentCentroid(short[] keywordVector, short[] termVector) {
        this.keywordVector = keywordVector;
        this.termVector = termVector;
    }

    /**
     * Gets keyword frequency vector.
     * @return The internal array with keyword frequency (not a copy).
     */
    public short[] getKeywordVector() {
        return keywordVector;
    }

    /**
     * Gets term frequency vector.
     * @return The internal array with term frequency (not a copy).
     */
    public short[] getTermVector() {
        return termVector;
    }

    /**
     * Gets the stored distance from the previous centroid.
     * @return The distance.
     */
    @Override
    public double getDistance() {
        return distance;
    }

    /**
     * Sets the stored distance from the previous centroid.
     * @param distance The distance.
     */
    @Override
    public void setDistance(double distance) {
        this.distance = distance;
    }

    /**
     * Serializes the keyword vector, the term vector and the distance, in
     * that order.
     * @param out Destination of the serialized centroid.
     * @throws IOException If writing to {@code out} fails.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        WritableConverter.shortArray2ArrayWritable(keywordVector).write(out);
        WritableConverter.shortArray2ArrayWritable(termVector).write(out);
        DoubleWritable dist = new DoubleWritable(distance);
        dist.write(out);
    }

    /**
     * Restores the centroid from the layout produced by
     * {@link #write(DataOutput)}: keyword vector, term vector, distance.
     * @param in Source of the serialized centroid.
     * @throws IOException If reading from {@code in} fails.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        // NOTE(review): assumes WritableConverter encodes the short values as
        // IntWritable elements - confirm against shortArray2ArrayWritable.
        ArrayWritable k = new ArrayWritable(IntWritable.class);
        k.readFields(in);
        keywordVector = WritableConverter.arrayWritable2ShortArray(k);
        ArrayWritable t = new ArrayWritable(IntWritable.class);
        t.readFields(in);
        termVector = WritableConverter.arrayWritable2ShortArray(t);
        DoubleWritable dist = new DoubleWritable();
        dist.readFields(in);
        distance = dist.get();
    }

    /**
     * Writes this centroid to the given HDFS path.
     * @param path Destination file.
     * @throws IOException If the file cannot be created or written.
     */
    @Override
    public void toHDFS(Path path) throws IOException {
        FileSystem fs = Utils.accessHDFS();
        FSDataOutputStream out = fs.create(path);
        try {
            write(out);
        } finally {
            // Close even when write() fails so the stream is not leaked.
            out.close();
        }
    }

    /**
     * Loads this centroid from the given HDFS path.
     * @param path Source file.
     * @throws IOException If the file cannot be opened or read.
     */
    @Override
    public void fromHDFS(Path path) throws IOException {
        FileSystem fs = Utils.accessHDFS();
        FSDataInputStream in = fs.open(path);
        try {
            readFields(in);
        } finally {
            // Close even when readFields() fails so the stream is not leaked.
            in.close();
        }
    }

    /**
     * Computes the distance with other centroid as the weighted sum of the
     * keyword-vector distance and the term-vector distance.
     * @param other The other centroid.
     * @param wk Keyword distance weight.
     * @param wt Term distance weight.
     * @return The distance.
     */
    public double distance(DocumentCentroid other, float wk, float wt) {
        double keyDistance = Metrics.euclideanDistance(keywordVector, other.keywordVector);
        double termDistance = Metrics.euclideanDistance(termVector, other.termVector);
        return wk * keyDistance + wt * termDistance;
    }

    /**
     * Computes new centroid using mean of the frequencies.
     *
     * <p>Each component is the integer mean (truncated) of that component
     * over all assigned vectors. When no vectors are assigned, the
     * corresponding centroid vector is all zeros instead of failing with a
     * division by zero.
     *
     * @param nKeywords Number of keywords.
     * @param nTerms Number of terms.
     * @param keys Keyword frequency assigned to centroid.
     * @param terms Term frequency assigned to centroid.
     * @return The new centroid.
     */
    public static DocumentCentroid calculateCentroid(int nKeywords, int nTerms, ArrayList<short[]> keys,
            ArrayList<short[]> terms) {
        short[] keyFreq = meanVector(nKeywords, keys);
        short[] termFreq = meanVector(nTerms, terms);
        return new DocumentCentroid(keyFreq, termFreq);
    }

    /**
     * Computes the component-wise integer mean of the given vectors.
     * @param length Number of components to average.
     * @param vectors Vectors to average; each must have at least {@code length} components.
     * @return A new array of {@code length} means, or all zeros when {@code vectors} is empty.
     */
    private static short[] meanVector(int length, ArrayList<short[]> vectors) {
        short[] mean = new short[length];
        int count = vectors.size();
        if (count == 0) {
            // An empty cluster has no mean; avoid "/ by zero" and keep zeros.
            LOG.warn("No vectors assigned to centroid; using a zero vector.");
            return mean;
        }
        for (int j = 0; j < length; j++) {
            // Accumulate in a long so the sum of many shorts cannot overflow.
            long sum = 0;
            for (int i = 0; i < count; i++) {
                sum += vectors.get(i)[j];
            }
            mean[j] = (short) (sum / count);
        }
        return mean;
    }

}