com.datumbox.framework.core.machinelearning.featureselection.continuous.PCA.java Source code

Introduction

Here is the source code for com.datumbox.framework.core.machinelearning.featureselection.continuous.PCA.java
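
PCA learns the principal components of a Dataframe during training and then projects every record onto them, optionally whitening the output or dropping the components with low variance. A short, hypothetical usage sketch follows the listing.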

Source

/**
 * Copyright (C) 2013-2016 Vasilis Vryniotis <bbriniotis@datumbox.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datumbox.framework.core.machinelearning.featureselection.continuous;

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.common.concurrency.ForkJoinStream;
import com.datumbox.framework.common.concurrency.StreamMethods;
import com.datumbox.framework.common.dataobjects.AssociativeArray;
import com.datumbox.framework.common.dataobjects.Dataframe;
import com.datumbox.framework.common.dataobjects.MatrixDataframe;
import com.datumbox.framework.common.dataobjects.Record;
import com.datumbox.framework.common.persistentstorage.interfaces.BigMap;
import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector;
import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector.MapType;
import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector.StorageHint;
import com.datumbox.framework.common.utilities.PHPMethods;
import com.datumbox.framework.core.machinelearning.common.abstracts.AbstractTrainer;
import com.datumbox.framework.core.machinelearning.common.abstracts.featureselectors.AbstractContinuousFeatureSelector;
import com.datumbox.framework.core.machinelearning.common.interfaces.Parallelizable;
import org.apache.commons.math3.linear.BlockRealMatrix;
import org.apache.commons.math3.linear.DiagonalMatrix;
import org.apache.commons.math3.linear.EigenDecomposition;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.stat.StatUtils;

import java.util.HashMap;
import java.util.Map;

/**
 * Implementation of Principal Component Analysis. The method can be used to project
 * the Dataframe onto an orthogonal space and to eliminate the components with low variance.
 * 
 * WARNING: This class copies the Dataframe into a RealMatrix, which forces all of the
 * data to be loaded into memory.
 * 
 * References: 
 * Intelligent Data Analysis and Probabilistic Inference, slide 15
 * Advanced Statistical Machine Learning and Pattern Recognition, slides 2; tutorial 3; CW 1 Matlab code
 * 
 * @author Vasilis Vryniotis <bbriniotis@datumbox.com>
 */
public class PCA extends AbstractContinuousFeatureSelector<PCA.ModelParameters, PCA.TrainingParameters>
        implements Parallelizable {

    /** {@inheritDoc} */
    public static class ModelParameters extends AbstractContinuousFeatureSelector.AbstractModelParameters {
        private static final long serialVersionUID = 1L;

        @BigMap(mapType = MapType.HASHMAP, storageHint = StorageHint.IN_MEMORY, concurrent = false)
        private Map<Object, Integer> featureIds;

        private int rows; //rows of the eigenvector matrix
        private int cols; //cols of the eigenvector matrix

        private double[] mean; //mean values for each column
        private double[] eigenValues; //eigenvalues

        private double[][] components; //components weights 

        /** 
         * @param dbc
         * @see AbstractTrainer.AbstractModelParameters#AbstractModelParameters(DatabaseConnector)
         */
        protected ModelParameters(DatabaseConnector dbc) {
            super(dbc);
        }

        /**
         * Getter for the mapping of the column names to column ids. The implementation
         * internally converts the data into double[] and as a result we need to 
         * estimate and store the mapping between the column names and their 
         * positions in the array. This mapping is estimated during training.
         * 
         * @return 
         */
        public Map<Object, Integer> getFeatureIds() {
            return featureIds;
        }

        /**
         * Setter for the mapping of the column names to column ids. 
         * 
         * @param featureIds 
         */
        protected void setFeatureIds(Map<Object, Integer> featureIds) {
            this.featureIds = featureIds;
        }

        /**
         * Getter for the number of rows of the eigenvector matrix.
         * 
         * @return 
         */
        public int getRows() {
            return rows;
        }

        /**
         * Setter for the number of rows of the eigenvector matrix.
         * 
         * @param rows 
         */
        protected void setRows(int rows) {
            this.rows = rows;
        }

        /**
         * Getter for the number of columns of the eigenvector matrix.
         * 
         * @return 
         */
        public int getCols() {
            return cols;
        }

        /**
         * Setter for the number of columns of the eigenvector matrix.
         * 
         * @param cols 
         */
        protected void setCols(int cols) {
            this.cols = cols;
        }

        /**
         * Getter for the mean values of each column.
         * 
         * @return 
         */
        public double[] getMean() {
            return PHPMethods.array_clone(mean);
        }

        /**
         * Setter for the mean values of each column.
         * 
         * @param mean 
         */
        protected void setMean(double[] mean) {
            this.mean = PHPMethods.array_clone(mean);
        }

        /**
         * Getter for the array with the Eigenvalues.
         * 
         * @return 
         */
        public double[] getEigenValues() {
            return PHPMethods.array_clone(eigenValues);
        }

        /**
         * Setter for the array with the Eigenvalues.
         * 
         * @param eigenValues 
         */
        protected void setEigenValues(double[] eigenValues) {
            this.eigenValues = PHPMethods.array_clone(eigenValues);
        }

        /**
         * Getter of the components matrix.
         * 
         * @return 
         */
        public double[][] getComponents() {
            return PHPMethods.array_clone(components);
        }

        /**
         * Setter of the components matrix.
         * 
         * @param components 
         */
        protected void setComponents(double[][] components) {
            this.components = PHPMethods.array_clone(components);
        }

    }

    /** {@inheritDoc} */
    public static class TrainingParameters extends AbstractContinuousFeatureSelector.AbstractTrainingParameters {
        private static final long serialVersionUID = 1L;

        private boolean whitened = false;
        private Integer maxDimensions = null;
        private Double variancePercentageThreshold = null;

        /**
         * Getter for whether we should run whitened PCA.
         * 
         * @return 
         */
        public boolean isWhitened() {
            return whitened;
        }

        /**
         * Setter for whether we should run whitened PCA.
         * 
         * @param whitened 
         */
        public void setWhitened(boolean whitened) {
            this.whitened = whitened;
        }

        /**
         * Getter for the maximum number of dimensions/components that should be 
         * kept by the algorithm.
         * 
         * @return 
         */
        public Integer getMaxDimensions() {
            return maxDimensions;
        }

        /**
         * Setter for the maximum number of dimensions/components that should be 
         * kept by the algorithm.
         * 
         * @param maxDimensions 
         */
        public void setMaxDimensions(Integer maxDimensions) {
            this.maxDimensions = maxDimensions;
        }

        /**
         * Getter for the variance percentage threshold. Setting this value causes
         * the algorithm to keep only the first X components whose cumulative
         * variance percentage exceeds this number. For example, if the components
         * explain 50%, 30% and 20% of the variance and the threshold is 0.8,
         * only the first two components are kept.
         * 
         * @return 
         */
        public Double getVariancePercentageThreshold() {
            return variancePercentageThreshold;
        }

        /**
         * Setter for the variance percentage threshold. Setting this value causes
         * the algorithm to keep only the first X components whose cumulative
         * variance percentage exceeds this number.
         * 
         * @param variancePercentageThreshold 
         */
        public void setVariancePercentageThreshold(Double variancePercentageThreshold) {
            this.variancePercentageThreshold = variancePercentageThreshold;
        }

    }

    /**
     * Public constructor of the algorithm.
     * 
     * @param dbName
     * @param conf 
     */
    public PCA(String dbName, Configuration conf) {
        super(dbName, conf, PCA.ModelParameters.class, PCA.TrainingParameters.class);
        streamExecutor = new ForkJoinStream(kb().getConf().getConcurrencyConfig());
    }

    private boolean parallelized = true;

    /**
     * This executor is used for the parallel processing of streams with a custom
     * thread pool.
     */
    protected final ForkJoinStream streamExecutor;

    /** {@inheritDoc} */
    @Override
    public boolean isParallelized() {
        return parallelized;
    }

    /** {@inheritDoc} */
    @Override
    public void setParallelized(boolean parallelized) {
        this.parallelized = parallelized;
    }

    /** {@inheritDoc} */
    @Override
    protected void _fit(Dataframe originalData) {
        ModelParameters modelParameters = kb().getModelParameters();

        int n = modelParameters.getN();
        int d = modelParameters.getD();

        //convert data into matrix
        Map<Object, Integer> featureIds = modelParameters.getFeatureIds();
        MatrixDataframe matrixDataset = MatrixDataframe.newInstance(originalData, false, null, featureIds);
        RealMatrix X = matrixDataset.getX();
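        //X is the n x d data matrix (one row per record, one column per feature);
        //newInstance() also fills featureIds with the column-name -> column-position mapping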

        //calculate the mean of each column and subtract it from the data (center the matrix)
        double[] meanValues = new double[d];
        for (Integer columnId : featureIds.values()) {

            meanValues[columnId] = 0.0;
            for (double v : X.getColumn(columnId)) {
                meanValues[columnId] += v;
            }
            meanValues[columnId] /= n;

            for (int row = 0; row < n; row++) {
                X.addToEntry(row, columnId, -meanValues[columnId]);
            }
        }
        modelParameters.setMean(meanValues);

        RealMatrix components;
        double[] eigenValues;

        //dxd covariance matrix of the centered data: C = X^T * X / (n - 1)
        RealMatrix covarianceDD = (X.transpose().multiply(X)).scalarMultiply(1.0 / (n - 1.0));

        EigenDecomposition decomposition = new EigenDecomposition(covarianceDD);
        eigenValues = decomposition.getRealEigenvalues();

        components = decomposition.getV();

        //whiten the components W = U * L^0.5, i.e. scale every eigenvector by the square root of its eigenvalue
        if (kb().getTrainingParameters().isWhitened()) {

            double[] sqrtEigenValues = new double[eigenValues.length];
            for (int i = 0; i < eigenValues.length; i++) {
                sqrtEigenValues[i] = Math.sqrt(eigenValues[i]);
            }

            components = components.multiply(new DiagonalMatrix(sqrtEigenValues));
        }

        //the eigenvalues and their eigenvectors are already sorted in descending order, so there is no need to re-sort them
        Integer maxDimensions = kb().getTrainingParameters().getMaxDimensions();
        Double variancePercentageThreshold = kb().getTrainingParameters().getVariancePercentageThreshold();
        if (variancePercentageThreshold != null && variancePercentageThreshold <= 1) {
            double sum = 0.0;
            double totalVariance = StatUtils.sum(eigenValues);
            int varCounter = 0;
            for (double l : eigenValues) {
                sum += l / totalVariance;
                varCounter++;
                if (sum >= variancePercentageThreshold) {
                    break;
                }
            }

            if (maxDimensions == null || maxDimensions > varCounter) {
                maxDimensions = varCounter;
            }
        }

        if (maxDimensions != null && maxDimensions < d) {
            //keep only the top maxDimensions eigenvalues
            double[] newEigenValues = new double[maxDimensions];
            System.arraycopy(eigenValues, 0, newEigenValues, 0, maxDimensions);
            eigenValues = newEigenValues;

            //keep only the eigenvectors of the top maxDimensions eigenvalues
            components = components.getSubMatrix(0, components.getRowDimension() - 1, 0, maxDimensions - 1);
        }

        modelParameters.setRows(components.getRowDimension());
        modelParameters.setCols(components.getColumnDimension());

        modelParameters.setEigenValues(eigenValues);
        modelParameters.setComponents(components.getData());
    }

    /** {@inheritDoc} */
    @Override
    protected void filterFeatures(Dataframe dataset) {
        ModelParameters modelParameters = kb().getModelParameters();

        //convert data into matrix
        Map<Object, Integer> featureIds = modelParameters.getFeatureIds();

        Map<Integer, Integer> recordIdsReference = new HashMap<>();
        MatrixDataframe matrixDataset = MatrixDataframe.parseDataset(dataset, recordIdsReference, featureIds);

        RealMatrix components = new BlockRealMatrix(modelParameters.getComponents());

        //project the data on the components: (n x d) * (d x k) -> (n x k)
        final RealMatrix X = matrixDataset.getX().multiply(components);

        streamExecutor.forEach(StreamMethods.stream(dataset.entries(), isParallelized()), e -> {
            Integer rId = e.getKey();
            Record r = e.getValue();
            int rowId = recordIdsReference.get(rId);

            AssociativeArray xData = new AssociativeArray();
            int componentId = 0;
            for (double value : X.getRow(rowId)) {
                xData.put(componentId++, value);
            }

            Record newR = new Record(xData, r.getY(), r.getYPredicted(), r.getYPredictedProbabilities());

            //_unsafe_set() is acceptable here because recalculateMeta() is called below
            dataset._unsafe_set(rId, newR);
        });

        dataset.recalculateMeta();
    }

}
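
Usage example

Below is a minimal, hypothetical sketch of how the class could be wired up. It is not taken from the file above: Configuration.getConfiguration(), the Dataframe(conf) and two-argument Record constructors, and the fit_transform()/transform()/close() calls are assumptions based on datumbox 0.7.x API conventions, so verify them against the version you use.

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.common.dataobjects.AssociativeArray;
import com.datumbox.framework.common.dataobjects.Dataframe;
import com.datumbox.framework.common.dataobjects.Record;
import com.datumbox.framework.core.machinelearning.featureselection.continuous.PCA;

public class PCAUsageExample {

    public static void main(String[] args) {
        //assumed factory that loads the datumbox configuration; adjust to your setup
        Configuration conf = Configuration.getConfiguration();

        //build a tiny Dataframe with three numerical columns (constructors assumed, see above)
        Dataframe trainingData = new Dataframe(conf);
        double[][] rows = {{1.0, 2.1, 2.9}, {2.0, 3.9, 6.1}, {3.0, 6.0, 9.2}, {4.0, 8.1, 11.8}};
        for (double[] row : rows) {
            AssociativeArray xData = new AssociativeArray();
            xData.put("x1", row[0]);
            xData.put("x2", row[1]);
            xData.put("x3", row[2]);
            trainingData.add(new Record(xData, null));
        }

        //keep at most 2 components, or fewer if they already explain 99% of the variance
        PCA.TrainingParameters params = new PCA.TrainingParameters();
        params.setWhitened(false);
        params.setMaxDimensions(2);
        params.setVariancePercentageThreshold(0.99);

        PCA pca = new PCA("myPCAModel", conf);
        pca.fit_transform(trainingData, params); //learn the components and project the data in place

        for (Record r : trainingData) {
            System.out.println(r.getX()); //each record now holds its coordinates in the component space
        }

        //pca.transform(newData);  //would project unseen records on the same components
        pca.close();               //assumed: releases the storage engine resources
    }
}

As filterFeatures() above shows, the transformation replaces each record's features with the numbered components of its projection, so the original column names are not preserved.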