Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.instructions.spark.utils;

import java.io.IOException;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
import org.apache.spark.mllib.util.NumericParser;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixCell;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue;
import org.apache.sysml.runtime.matrix.mapred.ReblockBuffer;
import org.apache.sysml.runtime.util.FastStringTokenizer;

import scala.Tuple2;

/**
 * NOTE: These are experimental converter utils. Once thoroughly tested, they
 * can be moved to RDDConverterUtils.
 */
@SuppressWarnings("unused")
public class RDDConverterUtilsExt {
	public enum RDDConverterTypes {
		TEXT_TO_MATRIX_CELL, MATRIXENTRY_TO_MATRIXCELL
	}

	/**
	 * Example usage:
	 * <pre><code>
	 * import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt
	 * import org.apache.sysml.runtime.matrix.MatrixCharacteristics
	 * import org.apache.spark.api.java.JavaSparkContext
	 * import org.apache.spark.mllib.linalg.distributed.MatrixEntry
	 * import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
	 * val matRDD = sc.textFile("ratings.text").map(_.split(" ")).map(x => new MatrixEntry(x(0).toLong, x(1).toLong, x(2).toDouble)).filter(_.value != 0).cache
	 * require(matRDD.filter(x => x.i == 0 || x.j == 0).count == 0, "Expected 1-based ratings file")
	 * val nnz = matRDD.count
	 * val numRows = matRDD.map(_.i).max
	 * val numCols = matRDD.map(_.j).max
	 * val coordinateMatrix = new CoordinateMatrix(matRDD, numRows, numCols)
	 * val mc = new MatrixCharacteristics(numRows, numCols, 1000, 1000, nnz)
	 * val binBlocks = RDDConverterUtilsExt.coordinateMatrixToBinaryBlock(new JavaSparkContext(sc), coordinateMatrix, mc, true)
	 * </code></pre>
	 *
	 * @param sc java spark context
	 * @param input coordinate matrix
	 * @param mcIn matrix characteristics
	 * @param outputEmptyBlocks if true, inject empty blocks if necessary
	 * @return matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> coordinateMatrixToBinaryBlock(JavaSparkContext sc,
			CoordinateMatrix input, MatrixCharacteristics mcIn, boolean outputEmptyBlocks) throws DMLRuntimeException {
		//convert matrix entry rdd to binary block rdd (w/ partial blocks)
		JavaPairRDD<MatrixIndexes, MatrixBlock> out = input.entries().toJavaRDD()
				.mapPartitionsToPair(new MatrixEntryToBinaryBlockFunction(mcIn));

		//inject empty blocks (if necessary)
		if (outputEmptyBlocks && mcIn.mightHaveEmptyBlocks()) {
			out = out.union(SparkUtils.getEmptyBlockRDD(sc, mcIn));
		}

		//aggregate partial matrix blocks
		out = RDDAggregateUtils.mergeByKey(out, false);

		return out;
	}

	public static JavaPairRDD<MatrixIndexes, MatrixBlock> coordinateMatrixToBinaryBlock(SparkContext sc,
			CoordinateMatrix input, MatrixCharacteristics mcIn, boolean outputEmptyBlocks) throws DMLRuntimeException {
		//forward the caller's output-empty-blocks flag to the JavaSparkContext variant
		return coordinateMatrixToBinaryBlock(new JavaSparkContext(sc), input, mcIn, outputEmptyBlocks);
	}

	public static Dataset<Row> projectColumns(Dataset<Row> df, ArrayList<String> columns) throws DMLRuntimeException {
		ArrayList<String> columnToSelect = new ArrayList<String>();
		for (int i = 1; i < columns.size(); i++) {
			columnToSelect.add(columns.get(i));
		}
		return df.select(columns.get(0), scala.collection.JavaConversions.asScalaBuffer(columnToSelect).toList());
	}

	public static MatrixBlock convertPy4JArrayToMB(byte[] data, long rlen, long clen) throws DMLRuntimeException {
		return convertPy4JArrayToMB(data, (int) rlen, (int) clen, false);
	}

	public static MatrixBlock convertPy4JArrayToMB(byte[] data, int rlen, int clen) throws DMLRuntimeException {
		return convertPy4JArrayToMB(data, rlen, clen, false);
	}

	public static MatrixBlock convertSciPyCOOToMB(byte[] data, byte[] row, byte[] col, long rlen, long clen, long nnz)
			throws DMLRuntimeException {
		return convertSciPyCOOToMB(data, row, col, (int) rlen, (int) clen, (int) nnz);
	}

	public static MatrixBlock convertSciPyCOOToMB(byte[] data, byte[] row, byte[] col, int rlen, int clen, int nnz)
			throws DMLRuntimeException {
		MatrixBlock mb = new MatrixBlock(rlen, clen, true);
		mb.allocateSparseRowsBlock(false);
		ByteBuffer buf1 = ByteBuffer.wrap(data);
		buf1.order(ByteOrder.nativeOrder());
		ByteBuffer buf2 = ByteBuffer.wrap(row);
		buf2.order(ByteOrder.nativeOrder());
		ByteBuffer buf3 = ByteBuffer.wrap(col);
		buf3.order(ByteOrder.nativeOrder());
		for (int i = 0; i < nnz; i++) {
			double val = buf1.getDouble();
			int rowIndex = buf2.getInt();
			int colIndex = buf3.getInt();
			mb.setValue(rowIndex, colIndex, val);
		}
		mb.recomputeNonZeros();
		mb.examSparsity();
		return mb;
	}

	public static MatrixBlock convertPy4JArrayToMB(byte[] data, long rlen, long clen, boolean isSparse)
			throws DMLRuntimeException {
		return convertPy4JArrayToMB(data, (int) rlen, (int) clen, isSparse);
	}

	public static MatrixBlock allocateDenseOrSparse(int rlen, int clen, boolean isSparse) {
		MatrixBlock ret = new MatrixBlock(rlen, clen, isSparse);
		ret.allocateBlock();
		return ret;
	}

	public static MatrixBlock allocateDenseOrSparse(long rlen, long clen, boolean isSparse) throws DMLRuntimeException {
		if (rlen > Integer.MAX_VALUE || clen > Integer.MAX_VALUE) {
			throw new DMLRuntimeException(
					"Dimensions of matrix are too large to be passed via NumPy/SciPy:" + rlen + " X " + clen);
		}
		//dimensions fit into int, so delegate to the int-based overload
		return allocateDenseOrSparse((int) rlen, (int) clen, isSparse);
	}

	public static void copyRowBlocks(MatrixBlock mb, int rowIndex, MatrixBlock ret, int numRowsPerBlock, int rlen,
			int clen) throws DMLRuntimeException {
		copyRowBlocks(mb, (long) rowIndex, ret, (long) numRowsPerBlock, (long) rlen, (long) clen);
	}

	public static void copyRowBlocks(MatrixBlock mb, long rowIndex, MatrixBlock ret, int numRowsPerBlock, int rlen,
			int clen) throws DMLRuntimeException {
		copyRowBlocks(mb, (long) rowIndex, ret, (long) numRowsPerBlock, (long) rlen, (long) clen);
	}

	public static void copyRowBlocks(MatrixBlock mb, int rowIndex, MatrixBlock ret, long numRowsPerBlock, long rlen,
			long clen) throws DMLRuntimeException {
		copyRowBlocks(mb, (long) rowIndex, ret, (long) numRowsPerBlock, (long) rlen, (long) clen);
	}

	public static void copyRowBlocks(MatrixBlock mb, long rowIndex, MatrixBlock ret, long numRowsPerBlock, long rlen,
			long clen) throws DMLRuntimeException {
		// TODO: Double-check if synchronization is required here.
		// synchronized (RDDConverterUtilsExt.class) {
		ret.copy((int) (rowIndex * numRowsPerBlock), (int) Math.min((rowIndex + 1) * numRowsPerBlock - 1, rlen - 1), 0,
				(int) (clen - 1), mb, false);
		// }
	}

	public static void postProcessAfterCopying(MatrixBlock ret) throws DMLRuntimeException {
		ret.recomputeNonZeros();
		ret.examSparsity();
	}

	public static MatrixBlock convertPy4JArrayToMB(byte[] data, int rlen, int clen, boolean isSparse)
			throws DMLRuntimeException {
		MatrixBlock mb = new MatrixBlock(rlen, clen, isSparse, -1);
		if (isSparse) {
			throw new DMLRuntimeException("Conversion to sparse format not supported");
		} else {
			//cast before multiplying to avoid int overflow for large dense matrices
			long limit = (long) rlen * clen;
			if (limit > Integer.MAX_VALUE)
				throw new DMLRuntimeException(
						"Dense NumPy array of size " + limit + " cannot be converted to MatrixBlock");
			double[] denseBlock = new double[(int) limit];
			ByteBuffer buf = ByteBuffer.wrap(data);
			buf.order(ByteOrder.nativeOrder());
			for (int i = 0; i < limit; i++) {
				denseBlock[i] = buf.getDouble();
			}
			mb.init(denseBlock, rlen, clen);
		}
		mb.recomputeNonZeros();
		mb.examSparsity();
		return mb;
	}

	public static byte[] convertMBtoPy4JDenseArr(MatrixBlock mb) throws DMLRuntimeException {
		byte[] ret = null;
		if (mb.isInSparseFormat()) {
			mb.sparseToDense();
		}

		long limit = (long) mb.getNumRows() * mb.getNumColumns();
		int times = Double.SIZE / Byte.SIZE;
		if (limit > Integer.MAX_VALUE / times)
			throw new DMLRuntimeException(
					"MatrixBlock of size " + limit + " cannot be converted to dense numpy array");
		ret = new byte[(int) (limit * times)];

		double[] denseBlock = mb.getDenseBlock();
		if (mb.isEmptyBlock()) {
			for (int i = 0; i < limit; i++) {
				ByteBuffer.wrap(ret, i * times, times).order(ByteOrder.nativeOrder()).putDouble(0);
			}
		} else if (denseBlock == null) {
			throw new DMLRuntimeException("Error while dealing with empty blocks.");
		} else {
			for (int i = 0; i < denseBlock.length; i++) {
				ByteBuffer.wrap(ret, i * times, times).order(ByteOrder.nativeOrder()).putDouble(denseBlock[i]);
			}
		}
		return ret;
	}

	public static class AddRowID implements Function<Tuple2<Row, Long>, Row> {
		private static final long serialVersionUID = -3733816995375745659L;

		@Override
		public Row call(Tuple2<Row, Long> arg0) throws Exception {
			int oldNumCols = arg0._1.length();
			Object[] fields = new Object[oldNumCols + 1];
			for (int i = 0; i < oldNumCols; i++) {
				fields[i] = arg0._1.get(i);
			}
			fields[oldNumCols] = new Double(arg0._2 + 1);
			return RowFactory.create(fields);
		}
	}

	/**
	 * Add element indices as new column to DataFrame
	 *
	 * @param df input data frame
	 * @param sparkSession the Spark Session
	 * @param nameOfCol name of index column
	 * @return new data frame
	 */
	public static Dataset<Row> addIDToDataFrame(Dataset<Row> df, SparkSession sparkSession, String nameOfCol) {
		StructField[] oldSchema = df.schema().fields();
		StructField[] newSchema = new StructField[oldSchema.length + 1];
		for (int i = 0; i < oldSchema.length; i++) {
			newSchema[i] = oldSchema[i];
		}
		newSchema[oldSchema.length] = DataTypes.createStructField(nameOfCol, DataTypes.DoubleType, false);
		// JavaRDD<Row> newRows = df.rdd().toJavaRDD().map(new AddRowID());
		JavaRDD<Row> newRows = df.rdd().toJavaRDD().zipWithIndex().map(new AddRowID());
		return sparkSession.createDataFrame(newRows, new StructType(newSchema));
	}
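
	/*
	 * Example usage of addIDToDataFrame (a minimal sketch; the input DataFrame "df"
	 * and the index column name "__INDEX" are placeholders, not part of this class):
	 *
	 *   SparkSession spark = SparkSession.builder().appName("example").getOrCreate();
	 *   Dataset<Row> df = spark.read().csv("data.csv");
	 *   Dataset<Row> dfWithID = RDDConverterUtilsExt.addIDToDataFrame(df, spark, "__INDEX");
	 *   dfWithID.show();   // each row now carries a 1-based, double-valued index column
	 */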
	private static class MatrixEntryToBinaryBlockFunction
			implements PairFlatMapFunction<Iterator<MatrixEntry>, MatrixIndexes, MatrixBlock> {
		private static final long serialVersionUID = 4907483236186747224L;
		private IJVToBinaryBlockFunctionHelper helper = null;

		public MatrixEntryToBinaryBlockFunction(MatrixCharacteristics mc) throws DMLRuntimeException {
			helper = new IJVToBinaryBlockFunctionHelper(mc);
		}

		@Override
		public Iterator<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<MatrixEntry> arg0) throws Exception {
			return helper.convertToBinaryBlock(arg0, RDDConverterTypes.MATRIXENTRY_TO_MATRIXCELL).iterator();
		}
	}

	private static class IJVToBinaryBlockFunctionHelper implements Serializable {
		private static final long serialVersionUID = -7952801318564745821L;

		//internal buffer size (aligned w/ default matrix block size)
		private static final int BUFFER_SIZE = 4 * 1000 * 1000; //4M elements (32MB)
		private int _bufflen = -1;

		private long _rlen = -1;
		private long _clen = -1;
		private int _brlen = -1;
		private int _bclen = -1;

		public IJVToBinaryBlockFunctionHelper(MatrixCharacteristics mc) throws DMLRuntimeException {
			if (!mc.dimsKnown()) {
				throw new DMLRuntimeException(
						"The dimensions need to be known in given MatrixCharacteristics for given input RDD");
			}
			_rlen = mc.getRows();
			_clen = mc.getCols();
			_brlen = mc.getRowsPerBlock();
			_bclen = mc.getColsPerBlock();

			//determine upper bounded buffer len
			_bufflen = (int) Math.min(_rlen * _clen, BUFFER_SIZE);
		}

		// ----------------------------------------------------
		// Can extend this by having type hierarchy
		public Tuple2<MatrixIndexes, MatrixCell> textToMatrixCell(Text txt) {
			FastStringTokenizer st = new FastStringTokenizer(' ');
			//get input string (ignore matrix market comments)
			String strVal = txt.toString();
			if (strVal.startsWith("%"))
				return null;

			//parse input ijv triple
			st.reset(strVal);
			long row = st.nextLong();
			long col = st.nextLong();
			double val = st.nextDouble();
			MatrixIndexes indx = new MatrixIndexes(row, col);
			MatrixCell cell = new MatrixCell(val);
			return new Tuple2<MatrixIndexes, MatrixCell>(indx, cell);
		}

		public Tuple2<MatrixIndexes, MatrixCell> matrixEntryToMatrixCell(MatrixEntry entry) {
			MatrixIndexes indx = new MatrixIndexes(entry.i(), entry.j());
			MatrixCell cell = new MatrixCell(entry.value());
			return new Tuple2<MatrixIndexes, MatrixCell>(indx, cell);
		}

		// ----------------------------------------------------

		Iterable<Tuple2<MatrixIndexes, MatrixBlock>> convertToBinaryBlock(Object arg0, RDDConverterTypes converter)
				throws Exception {
			ArrayList<Tuple2<MatrixIndexes, MatrixBlock>> ret = new ArrayList<Tuple2<MatrixIndexes, MatrixBlock>>();
			ReblockBuffer rbuff = new ReblockBuffer(_bufflen, _rlen, _clen, _brlen, _bclen);

			Iterator<?> iter = (Iterator<?>) arg0;
			while (iter.hasNext()) {
				Tuple2<MatrixIndexes, MatrixCell> cell = null;
				switch (converter) {
					case MATRIXENTRY_TO_MATRIXCELL:
						cell = matrixEntryToMatrixCell((MatrixEntry) iter.next());
						break;
					case TEXT_TO_MATRIX_CELL:
						cell = textToMatrixCell((Text) iter.next());
						break;
					default:
						throw new Exception("Invalid converter for IJV data:" + converter.toString());
				}
				if (cell == null) {
					continue;
				}

				//flush buffer if necessary
				if (rbuff.getSize() >= rbuff.getCapacity())
					flushBufferToList(rbuff, ret);

				//add value to reblock buffer
				rbuff.appendCell(cell._1.getRowIndex(), cell._1.getColumnIndex(), cell._2.getValue());
			}

			//final flush buffer
			flushBufferToList(rbuff, ret);

			return ret;
		}

		private static void flushBufferToList(ReblockBuffer rbuff, ArrayList<Tuple2<MatrixIndexes, MatrixBlock>> ret)
				throws IOException, DMLRuntimeException {
			//temporary list of indexed matrix values to prevent library dependencies
			ArrayList<IndexedMatrixValue> rettmp = new ArrayList<IndexedMatrixValue>();
			rbuff.flushBufferToBinaryBlocks(rettmp);
			ret.addAll(SparkUtils.fromIndexedMatrixBlock(rettmp));
		}
	}

	/**
	 * Convert a dataframe of
	 * comma-separated string rows to a dataframe of ml.linalg.Vector rows.
	 *
	 * <p>
	 * Example input rows:<br>
	 *
	 * <code>
	 * ((1.2, 4.3, 3.4))<br>
	 * (1.2, 3.4, 2.2)<br>
	 * [[1.2, 34.3, 1.2, 1.25]]<br>
	 * [1.2, 3.4]<br>
	 * </code>
	 *
	 * @param sparkSession
	 *            Spark Session
	 * @param inputDF
	 *            dataframe of comma-separated row strings to convert to
	 *            dataframe of ml.linalg.Vector rows
	 * @return dataframe of ml.linalg.Vector rows
	 * @throws DMLRuntimeException
	 *             if DMLRuntimeException occurs
	 */
	public static Dataset<Row> stringDataFrameToVectorDataFrame(SparkSession sparkSession, Dataset<Row> inputDF)
			throws DMLRuntimeException {
		StructField[] oldSchema = inputDF.schema().fields();
		StructField[] newSchema = new StructField[oldSchema.length];
		for (int i = 0; i < oldSchema.length; i++) {
			String colName = oldSchema[i].name();
			newSchema[i] = DataTypes.createStructField(colName, new VectorUDT(), true);
		}

		// converter
		class StringToVector implements Function<Tuple2<Row, Long>, Row> {
			private static final long serialVersionUID = -4733816995375745659L;

			@Override
			public Row call(Tuple2<Row, Long> arg0) throws Exception {
				Row oldRow = arg0._1;
				int oldNumCols = oldRow.length();
				if (oldNumCols > 1) {
					throw new DMLRuntimeException("The row must have at most one column");
				}

				// parse the various strings, i.e.
				// ((1.2, 4.3, 3.4)) or (1.2, 3.4, 2.2)
				// [[1.2, 34.3, 1.2, 1.2]] or [1.2, 3.4]
				ArrayList<Object> fieldsArr = new ArrayList<Object>();
				for (int i = 0; i < oldRow.length(); i++) {
					Object ci = oldRow.get(i);
					if (ci == null) {
						fieldsArr.add(null);
					} else if (ci instanceof String) {
						String cis = (String) ci;
						StringBuffer sb = new StringBuffer(cis.trim());
						// remove up to two levels of nesting, e.g. ((...)) or [[...]]
						for (int nid = 0; nid < 2; nid++) {
							if ((sb.charAt(0) == '(' && sb.charAt(sb.length() - 1) == ')')
									|| (sb.charAt(0) == '[' && sb.charAt(sb.length() - 1) == ']')) {
								sb.deleteCharAt(0);
								sb.setLength(sb.length() - 1);
							}
						}
						// normalize separators to plain commas
						String ncis = "[" + sb.toString().replaceAll(" *, *", ",") + "]";

						try {
							// ncis [ ] will always result in double array return type
							double[] doubles = (double[]) NumericParser.parse(ncis);
							Vector dense = Vectors.dense(doubles);
							fieldsArr.add(dense);
						} catch (Exception e) {
							// can't catch SparkException here in Java apparently
							throw new DMLRuntimeException("Error converting to double array. " + e.getMessage(), e);
						}
					} else {
						throw new DMLRuntimeException("Only String is supported");
					}
				}
				Row row = RowFactory.create(fieldsArr.toArray());
				return row;
			}
		}

		// output DF
		JavaRDD<Row> newRows = inputDF.rdd().toJavaRDD().zipWithIndex().map(new StringToVector());
		Dataset<Row> outDF = sparkSession.createDataFrame(newRows.rdd(), DataTypes.createStructType(newSchema));
		return outDF;
	}
}
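
For reference, the following is a minimal usage sketch of the two DataFrame helpers above, stringDataFrameToVectorDataFrame and addIDToDataFrame. The application name, the sample input strings, and the index column name "__INDEX" are illustrative assumptions, not part of the class itself.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt;

public class RDDConverterUtilsExtExample {
	public static void main(String[] args) throws Exception {
		SparkSession spark = SparkSession.builder().appName("RDDConverterUtilsExtExample").getOrCreate();

		// a single string column holding comma-separated values, optionally nested in () or []
		StructType schema = new StructType(new StructField[] {
				DataTypes.createStructField("C1", DataTypes.StringType, true) });
		List<Row> rows = Arrays.asList(
				RowFactory.create("((1.2, 4.3, 3.4))"),
				RowFactory.create("[1.2, 3.4]"));
		Dataset<Row> stringDF = spark.createDataFrame(rows, schema);

		// convert each string row to an ml.linalg.Vector row
		Dataset<Row> vectorDF = RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(spark, stringDF);

		// append a 1-based row index column (the column name is an arbitrary choice)
		Dataset<Row> indexedDF = RDDConverterUtilsExt.addIDToDataFrame(vectorDF, spark, "__INDEX");
		indexedDF.show(false);

		spark.stop();
	}
}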