onlinenewspopularity.DataFormatter.java Source code

Java tutorial

Introduction

Here is the source code for onlinenewspopularity.DataFormatter.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package onlinenewspopularity;

import Jama.Matrix;
import java.io.FileReader;
import java.io.File;
import java.io.FileWriter;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

/**
 * Reads a CSV data set and turns it into a feature list, a target column name,
 * and randomly ordered training and test matrices. Also offers helpers to
 * re-shuffle a data set restricted to a subset of features and to write a data
 * set back to a CSV file.
 * @author neeth
 */
public class DataFormatter {

    private static final Logger LOGGER = Logger.getLogger(DataFormatter.class.getName());

    /** Path of the CSV file parsed by {@link #readData()}. */
    private final String fileName;

    /**
     * Buffer exposed through {@link #getDataStat()}. {@link #readData()}
     * allocates it as 2 rows by (number of header features, before empty
     * features are dropped) columns; this class never writes values into it.
     */
    private double[][] trainStat;

    /**
     * Creates a formatter bound to the given CSV file.
     * @param fileName path of the CSV file to read in {@link #readData()}
     */
    public DataFormatter(String fileName) {
        this.fileName = fileName;
    }

    /**
     * Reads the file and randomly populates the data.
     * <p>
     * The first record is treated as the header. Feature names are taken from
     * header columns {@code Constants.INITIAL_FEATURE_INDEX} up to (but not
     * including) the last column; the last column is the target. A constant
     * column of 1.0 named {@code Constants.FEATURE_COL1_NAME} is prepended as
     * feature 0. At most {@code Constants.SIZE} data records are read and each
     * one is stored at a distinct, randomly chosen row. If the file holds fewer
     * than {@code Constants.SIZE} records the unused rows stay all-zero.
     * Features whose value is 0 in every record read are removed.
     * @return matrix list
     * The list has the following elements:
     * 1. List of features (mx1 ArrayList)
     * 2. Target column name
     * 3. Data for training (n1xm matrix)
     * 4. Target values for training data (n1x1 matrix)
     * 5. Test data (n2xm matrix)
     * 6. Target values for test data (n2x1 matrix)
     * NOTE: n1 is the length of training data set.
     *       n2 is the length of test data set.
     *       n2 = Constants.SIZE*Constants.TEST_SET_RATIO
     *       n1 = Constants.SIZE-n2
     * NOTE: despite the declared element type, elements 1 and 2 are NOT
     *       Matrix instances; callers must read the list positionally and cast.
     * @throws Exception if the file cannot be read or parsed, or a value is
     *         not a valid number; the exception is logged and rethrown
     */
    public List<Matrix> readData() throws Exception {
        try (Reader br = new FileReader(new File(fileName))) {
            Iterable<CSVRecord> records = CSVFormat.DEFAULT.parse(br);

            // One iterator serves both the header and the data rows, so the
            // code does not depend on separate iterators sharing parser state.
            Iterator<CSVRecord> itr = records.iterator();
            CSVRecord header = itr.next();

            List<Object> features = new ArrayList<>();
            features.add(Constants.FEATURE_COL1_NAME);
            for (int i = Constants.INITIAL_FEATURE_INDEX; i < header.size() - 1; i++) {
                features.add(header.get(i).trim());
            }
            String predictColName = header.get(header.size() - 1).trim();
            int featureTotal = features.size();

            trainStat = new double[2][featureTotal];

            double[][] data = new double[Constants.SIZE][featureTotal];
            double[][] res = new double[Constants.SIZE][1];

            // A feature is "valid" once a non-zero value has been seen for it.
            boolean[] validFeature = new boolean[featureTotal];
            validFeature[0] = true; //theta_0 is a valid feature
            int featureCount = 1;

            // targetRow[k] is the row in which the k-th record read is stored.
            int[] targetRow = randomPermutation(Constants.SIZE, new Random());
            int rowsRead = 0;
            while (rowsRead < Constants.SIZE && itr.hasNext()) {
                CSVRecord record = itr.next();
                int row = targetRow[rowsRead];

                data[row][0] = 1.0;
                for (int j = 1; j < featureTotal; j++) {
                    double value = Double.parseDouble(record.get(j + Constants.INITIAL_FEATURE_INDEX - 1));
                    data[row][j] = value;
                    if (value != 0 && !validFeature[j]) {
                        validFeature[j] = true;
                        featureCount++;
                    }
                }
                res[row][0] = Double.parseDouble(record.get(record.size() - 1));
                rowsRead++;
            }

            //Remove empty features
            if (featureCount < featureTotal) {
                List<Object> keptFeatures = new ArrayList<>(featureCount);
                double[][] compactData = new double[Constants.SIZE][featureCount];
                int k = 0;

                for (int j = 0; j < featureTotal; j++) {
                    if (validFeature[j]) {
                        for (int r = 0; r < Constants.SIZE; r++) {
                            compactData[r][k] = data[r][j];
                        }
                        keptFeatures.add(features.get(j));
                        k++;
                    } else {
                        LOGGER.log(Level.INFO, "Removing empty feature: {0}", features.get(j));
                    }
                }

                features = keptFeatures;
                data = compactData;
            }

            int testLen = (int) (Constants.TEST_SET_RATIO * Constants.SIZE);
            int trainLen = Constants.SIZE - testLen;

            Matrix allX = new Matrix(data);
            Matrix allY = new Matrix(res);
            int lastRow = allX.getRowDimension() - 1;
            int lastXCol = allX.getColumnDimension() - 1;
            int lastYCol = allY.getColumnDimension() - 1;

            List<Object> result = new ArrayList<>(6);
            result.add(features);
            result.add(predictColName);
            result.add(allX.getMatrix(0, trainLen - 1, 0, lastXCol));
            result.add(allY.getMatrix(0, trainLen - 1, 0, lastYCol));
            result.add(allX.getMatrix(trainLen, lastRow, 0, lastXCol));
            result.add(allY.getMatrix(trainLen, lastRow, 0, lastYCol));

            // The public signature promises List<Matrix> although the list is
            // heterogeneous; the cast keeps that signature intact for callers.
            @SuppressWarnings("unchecked")
            List<Matrix> typedResult = (List<Matrix>) (List<?>) result;
            return typedResult;
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, "{0}: {1}", new Object[] { e.getClass().getName(), e.getMessage() });
            throw e;
        }
    }

    /**
     * Builds a randomly re-ordered copy of a data set restricted to the
     * selected features.
     * <p>
     * Output column {@code k} holds input column {@code featureIndices.get(k)}
     * and is named {@code features.get(featureIndices.get(k))}, so names and
     * columns stay aligned whatever the order of {@code featureIndices}.
     * @param data source data, one row per sample
     * @param res source target values; column 0 is used, one row per sample
     * @param featureIndices Integer column indices to keep, in output order
     * @param features names of the columns of {@code data}
     * @param predictColumn name of the target column, passed through unchanged
     * @return list holding, in order: the selected feature names, the target
     *         column name, the shuffled data matrix and the shuffled target
     *         matrix
     * @throws IndexOutOfBoundsException if a feature index lies outside
     *         {@code features} or the columns of {@code data}
     */
    public List resetData(Matrix data, Matrix res, List featureIndices, List features, String predictColumn) {
        try {
            int n = data.getRowDimension();
            int selected = featureIndices.size();
            Matrix newData = new Matrix(n, selected);
            Matrix newRes = new Matrix(n, 1);

            List<Object> newFeatures = new ArrayList<>(selected);
            int[] sourceColumn = new int[selected];
            for (int k = 0; k < selected; k++) {
                sourceColumn[k] = (int) featureIndices.get(k);
                newFeatures.add(features.get(sourceColumn[k]));
            }

            // targetRow[i] is the output row that receives input row i.
            int[] targetRow = randomPermutation(n, new Random());
            for (int i = 0; i < n; i++) {
                int row = targetRow[i];
                for (int k = 0; k < selected; k++) {
                    newData.set(row, k, data.get(i, sourceColumn[k]));
                }
                newRes.set(row, 0, res.get(i, 0));
            }

            List<Object> result = new ArrayList<>(4);
            result.add(newFeatures);
            result.add(predictColumn);
            result.add(newData);
            result.add(newRes);

            return result;
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, "{0}: {1}", new Object[] { e.getClass().getName(), e.getMessage() });
            throw e;
        }
    }

    /**
     * Writes a data set as comma-separated text: one header line with the
     * feature names followed by the target column name, then one line per row
     * of {@code mat1} with the matching value from column 0 of {@code mat2}
     * appended.
     * <p>
     * The arguments are validated before the file is opened, so an invalid
     * call does not create or truncate the target file.
     * @param fileName path of the file to write (overwritten if it exists)
     * @param features column names, one per column of {@code mat1}
     * @param predicColumn name of the target column
     * @param mat1 feature values
     * @param mat2 target values; needs at least as many rows as {@code mat1}
     * @throws Exception if the number of headers differs from the number of
     *         data columns, if {@code mat2} has fewer rows than {@code mat1},
     *         or if writing fails; the exception is logged and rethrown
     */
    public void writeDataSetToFile(String fileName, List features, String predicColumn, Matrix mat1, Matrix mat2)
            throws Exception {
        try {
            if (mat1.getColumnDimension() != features.size()) {
                throw new Exception("Number of headers and data columns do not " + "match. headers: "
                        + features.size() + " | columns" + mat1.getColumnDimension());
            }
            if (mat2.getRowDimension() < mat1.getRowDimension()) {
                throw new IllegalArgumentException("Target matrix has fewer rows than data matrix. data rows: "
                        + mat1.getRowDimension() + " | target rows: " + mat2.getRowDimension());
            }

            try (FileWriter fw = new FileWriter(new File(fileName))) {
                StringBuilder line = new StringBuilder();
                for (int i = 0; i < features.size(); i++) {
                    line.append(features.get(i)).append(",");
                }
                line.append(predicColumn).append("\n");
                fw.write(line.toString());

                for (int i = 0; i < mat1.getRowDimension(); i++) {
                    line.setLength(0);
                    for (int j = 0; j < mat1.getColumnDimension(); j++) {
                        line.append(mat1.get(i, j)).append(",");
                    }
                    line.append(mat2.get(i, 0)).append("\n");
                    fw.write(line.toString());
                }
            }
            // Logged only after the writer has been closed successfully.
            LOGGER.log(Level.INFO, "Data written to {0}", fileName);
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, "{0}: {1}", new Object[] { e.getClass().getName(), e.getMessage() });
            throw e;
        }
    }

    /**
     * Returns the statistics buffer allocated by {@link #readData()}.
     * @return the internal array itself (not a copy), or {@code null} if
     *         {@link #readData()} has not reached the allocation yet
     */
    public double[][] getDataStat() {
        return trainStat;
    }

    /**
     * Produces a uniformly random permutation of 0..n-1 (Fisher-Yates shuffle).
     * @param n number of elements; 0 yields an empty array
     * @param rng source of randomness
     * @return array of length n containing every value in 0..n-1 exactly once
     */
    private static int[] randomPermutation(int n, Random rng) {
        int[] perm = new int[n];
        for (int i = 0; i < n; i++) {
            perm[i] = i;
        }
        for (int i = n - 1; i > 0; i--) {
            int j = rng.nextInt(i + 1);
            int tmp = perm[i];
            perm[i] = perm[j];
            perm[j] = tmp;
        }
        return perm;
    }
}