com.ibm.bi.dml.runtime.util.MapReduceTool.java Source code

Introduction

Here is the source code for com.ibm.bi.dml.runtime.util.MapReduceTool.java, a SystemML utility class that wraps common HDFS file operations (existence checks, deletes, renames, merges, scalar and metadata I/O) together with helpers for parsing MapReduce task IDs.

Source

/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
*/

package com.ibm.bi.dml.runtime.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.parser.DataExpression;
import com.ibm.bi.dml.parser.Expression.ValueType;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.io.MatrixReader;
import com.ibm.bi.dml.runtime.io.MatrixReaderFactory;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.data.CSVFileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.FileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.NumItemsByEachReducerMetaData;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.matrix.sort.ReadWithZeros;

public class MapReduceTool {
    private static final Log LOG = LogFactory.getLog(MapReduceTool.class.getName());
    private static JobConf _rJob = null; //cached job conf for read-only operations

    static {
        _rJob = ConfigurationManager.getCachedJobConf();
    }

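    // Parses the task attempt ID from "mapred.task.id"
    // (e.g. attempt_200707121733_0003_m_000005_0) and returns the task number
    // without leading zeros ("5" in this example).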
    public static String getUniqueKeyPerTask(JobConf job, boolean inMapper) {
        //TODO: investigate ID pattern, required for parallel jobs
        /*String nodePrefix = job.get("mapred.task.id");
        return String.valueOf(IDHandler.extractLongID(nodePrefix));*/

        String nodePrefix = job.get("mapred.task.id");
        int i;
        if (inMapper)
            i = nodePrefix.indexOf("_m_");
        else
            i = nodePrefix.indexOf("_r_");
        int j = nodePrefix.lastIndexOf("_");
        nodePrefix = nodePrefix.substring(i + 3, j);
        // remove all the leading 0s
        return String.valueOf(Long.parseLong(nodePrefix));
    }

    @Deprecated
    public static String getUniqueKeyPerTaskWithLeadingZeros(JobConf job, boolean inMapper) {
        String nodePrefix = job.get("mapred.task.id");
        int i;
        if (inMapper)
            i = nodePrefix.indexOf("_m_");
        else
            i = nodePrefix.indexOf("_r_");
        int j = nodePrefix.lastIndexOf("_");
        nodePrefix = nodePrefix.substring(i + 3, j);
        return nodePrefix;
    }

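    // Returns the numeric task ID, i.e. the second-to-last "_"-separated field
    // of the task attempt ID (e.g. attempt_..._m_000005_0 yields 5).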
    public static int getUniqueTaskId(JobConf job) {
        //TODO: investigate ID pattern, required for parallel jobs
        /*String nodePrefix = job.get("mapred.task.id"); 
        return IDHandler.extractIntID(nodePrefix);*/

        String nodePrefix = job.get("mapred.task.id");
        int j = nodePrefix.lastIndexOf("_");
        int i = nodePrefix.lastIndexOf("_", j - 1);
        nodePrefix = nodePrefix.substring(i + 1, j);
        // System.out.println("nodePrefix = " + nodePrefix) ;
        return Integer.valueOf(nodePrefix);
    }

    public static String getGloballyUniqueName(JobConf job) {
        return job.get("mapred.task.id");
    }

    public static boolean existsFileOnHDFS(String fname) {
        boolean ret = true;
        try {
            Path outpath = new Path(fname);
            ret = FileSystem.get(_rJob).exists(outpath);
        } catch (Exception ex) {
            LOG.error("Exception caught in existsFileOnHDFS", ex);
            ret = false;
        }
        return ret;
    }

    public static void deleteFileIfExistOnHDFS(Path outpath, JobConf job) throws IOException {
        if (FileSystem.get(job).exists(outpath)) {
            FileSystem.get(job).delete(outpath, true);
        }
    }

    public static void deleteFileIfExistOnLFS(Path outpath, JobConf job) throws IOException {
        if (FileSystem.getLocal(job).exists(outpath)) {
            FileSystem.getLocal(job).delete(outpath, true);
        }
    }

    public static void deleteFileIfExistOnHDFS(String dir) throws IOException {
        Path outpath = new Path(dir);
        FileSystem fs = FileSystem.get(_rJob);
        if (fs.exists(outpath)) {
            //System.err.println("Deleting " + outpath + " ... ");
            fs.delete(outpath, true);
        }
    }

    public static boolean isHDFSDirectory(String dir) throws IOException {
        FileSystem fs = FileSystem.get(_rJob);
        Path pth = new Path(dir);
        FileStatus fstat = fs.getFileStatus(pth);
        return fstat.isDirectory();
    }

    public static boolean isHDFSFileEmpty(String dir) throws IOException {
        FileSystem fs = FileSystem.get(_rJob);
        return isFileEmpty(fs, dir);
    }

    public static boolean isFileEmpty(FileSystem fs, String dir) throws IOException {
        Path pth = new Path(dir);
        FileStatus fstat = fs.getFileStatus(pth);

        if (fstat.isDirectory()) {
            // directory: empty iff all contained files have length 0
            FileStatus[] stats = fs.listStatus(pth);
            if (stats != null) {
                for (FileStatus stat : stats) {
                    if (stat.getLen() > 0)
                        return false;
                }
            }
            return true;
        } else {
            // regular file: empty iff length 0
            return fstat.getLen() == 0;
        }
    }

    public static void renameFileOnHDFS(String originalDir, String newDir) throws IOException {
        Path originalpath = new Path(originalDir);

        deleteFileIfExistOnHDFS(newDir);
        Path newpath = new Path(newDir);

        FileSystem fs = FileSystem.get(_rJob);
        if (fs.exists(originalpath)) {
            fs.rename(originalpath, newpath);
        } else {
            throw new FileNotFoundException(originalDir);
        }
    }

    public static void mergeIntoSingleFile(String originalDir, String newFile) throws IOException {
        FileSystem fs = FileSystem.get(_rJob);
        FileUtil.copyMerge(fs, new Path(originalDir), fs, new Path(newFile), true, _rJob, null);
    }

    public static void copyFileOnHDFS(String originalDir, String newDir) throws IOException {
        Path originalPath = new Path(originalDir);
        Path newPath = new Path(newDir);
        boolean deleteSource = false;
        boolean overwrite = true;

        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        if (fs.exists(originalPath)) {
            FileUtil.copy(fs, originalPath, fs, newPath, deleteSource, overwrite, job);
        }
    }

    /**
     * Lists all entries directly under the given HDFS directory.
     * 
     * @param dir directory on HDFS
     * @return comma-separated list of the entry paths under dir
     * @throws IOException
     */
    public static String getSubDirs(String dir) throws IOException {
        FileSystem fs = FileSystem.get(_rJob);
        FileStatus[] files = fs.listStatus(new Path(dir));
        StringBuilder sb = new StringBuilder();
        for (FileStatus file : files) {
            if (sb.length() > 0)
                sb.append(",");
            sb.append(file.getPath().toString());
        }
        return sb.toString();
    }

    /**
     * Same as getSubDirs, but skips entries whose path contains "_logs".
     * 
     * @param dir directory on HDFS
     * @return comma-separated list of the entry paths under dir, excluding log directories
     * @throws IOException
     */
    public static String getSubDirsIgnoreLogs(String dir) throws IOException {
        FileSystem fs = FileSystem.get(_rJob);
        FileStatus[] files = fs.listStatus(new Path(dir));
        StringBuilder sb = new StringBuilder();
        for (FileStatus file : files) {
            String name = file.getPath().toString();
            if (name.contains("_logs"))
                continue;
            if (sb.length() > 0)
                sb.append(",");
            sb.append(name);
        }
        return sb.toString();
    }

    /**
     * Returns the size of a file or directory on HDFS in bytes.
     * 
     * @param path file or directory path on HDFS
     * @return size in bytes
     * @throws IOException
     */
    public static long getFilesizeOnHDFS(Path path) throws IOException {
        FileSystem fs = FileSystem.get(_rJob);
        long ret = 0; //in bytes
        if (fs.isDirectory(path))
            ret = fs.getContentSummary(path).getLength();
        else
            ret = fs.getFileStatus(path).getLen();
        //note: filestatus would return 0 on directories

        return ret;
    }

    private static BufferedReader setupInputFile(String filename) throws IOException {
        Path pt = new Path(filename);
        FileSystem fs = FileSystem.get(_rJob);
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(pt)));
        return br;
    }

    public static double readDoubleFromHDFSFile(String filename) throws IOException {
        BufferedReader br = setupInputFile(filename);
        String line = br.readLine();
        br.close();
        if (line == null)
            throw new IOException("Empty file on hdfs: " + filename);
        return Double.parseDouble(line);
    }

    public static long readIntegerFromHDFSFile(String filename) throws IOException {
        BufferedReader br = setupInputFile(filename);
        String line = br.readLine();
        br.close();
        if (line == null)
            throw new IOException("Empty file on hdfs: " + filename);
        return Long.parseLong(line);
    }

    public static boolean readBooleanFromHDFSFile(String filename) throws IOException {
        BufferedReader br = setupInputFile(filename);
        String line = br.readLine();
        br.close();
        if (line == null)
            throw new IOException("Empty file on hdfs: " + filename);
        return Boolean.parseBoolean(line);
    }

    public static String readStringFromHDFSFile(String filename) throws IOException {
        BufferedReader br = setupInputFile(filename);
        // handle multi-line strings in the HDFS file
        StringBuilder sb = new StringBuilder();
        String line = null;
        while ((line = br.readLine()) != null) {
            sb.append(line);
            sb.append("\n");
        }
        br.close();

        //return string without last character
        return sb.substring(0, sb.length() - 1);
    }

    private static BufferedWriter setupOutputFile(String filename) throws IOException {
        Path pt = new Path(filename);
        FileSystem fs = FileSystem.get(_rJob);
        BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
        return br;
    }

    public static void writeDoubleToHDFS(double d, String filename) throws IOException {
        BufferedWriter br = setupOutputFile(filename);
        br.write(String.valueOf(d));
        br.close();
    }

    public static void writeIntToHDFS(long i, String filename) throws IOException {
        BufferedWriter br = setupOutputFile(filename);
        br.write(String.valueOf(i));
        br.close();
    }

    public static void writeBooleanToHDFS(boolean b, String filename) throws IOException {
        BufferedWriter br = setupOutputFile(filename);
        br.write(String.valueOf(b));
        br.close();
    }

    public static void writeStringToHDFS(String s, String filename) throws IOException {
        BufferedWriter br = setupOutputFile(filename);
        br.write(String.valueOf(s));
        br.close();
    }

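    // Writes one line per result flagged as having unknown dimensions, in the
    // format "<result index> <max rows> <max cols>"; consumed by processDimsFiles.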
    public static void writeDimsFile(String filename, byte[] unknownFlags, long[] maxRows, long[] maxCols)
            throws IOException {
        BufferedWriter br = setupOutputFile(filename);
        StringBuilder line = new StringBuilder();
        for (int i = 0; i < unknownFlags.length; i++) {
            if (unknownFlags[i] != (byte) 0) {
                line.append(i);
                line.append(" " + maxRows[i]);
                line.append(" " + maxCols[i]);
                line.append("\n");
            }
        }
        br.write(line.toString());
        br.close();
        //System.out.println("Finished writing dimsFile: " + filename);
    }

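    // Reads all dims files in the given folder (as written by writeDimsFile) and
    // enlarges the dimensions in stats to the observed maxima.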
    public static MatrixCharacteristics[] processDimsFiles(String dir, MatrixCharacteristics[] stats)
            throws IOException {
        Path pt = new Path(dir);
        FileSystem fs = FileSystem.get(_rJob);

        if (!fs.exists(pt))
            return stats;

        FileStatus fstat = fs.getFileStatus(pt);

        if (fstat.isDirectory()) {
            FileStatus[] files = fs.listStatus(pt);
            for (int i = 0; i < files.length; i++) {
                Path filePath = files[i].getPath();
                //System.out.println("Processing dims file: " + filePath.toString());
                BufferedReader br = setupInputFile(filePath.toString());

                String line = "";
                while ((line = br.readLine()) != null) {
                    String[] parts = line.split(" ");
                    int resultIndex = Integer.parseInt(parts[0]);
                    long maxRows = Long.parseLong(parts[1]);
                    long maxCols = Long.parseLong(parts[2]);

                    stats[resultIndex].setDimension(
                            Math.max(stats[resultIndex].getRows(), maxRows),
                            Math.max(stats[resultIndex].getCols(), maxCols));
                }

                br.close();
            }
        } else {
            throw new IOException(dir + " is expected to be a folder!");
        }

        return stats;
    }

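    // Writes a JSON metadata (.mtd) file describing a matrix: value type,
    // dimensions, non-zeros, format, and (for CSV) header/delimiter properties.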
    public static void writeMetaDataFile(String mtdfile, ValueType v, MatrixCharacteristics mc, OutputInfo outinfo)
            throws IOException {
        writeMetaDataFile(mtdfile, v, mc, outinfo, null);
    }

    public static void writeMetaDataFile(String mtdfile, ValueType v, MatrixCharacteristics mc, OutputInfo outinfo,
            FileFormatProperties formatProperties) throws IOException {
        Path pt = new Path(mtdfile);
        FileSystem fs = FileSystem.get(_rJob);
        BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
        formatProperties = (formatProperties == null && outinfo == OutputInfo.CSVOutputInfo)
                ? new CSVFileFormatProperties()
                : formatProperties;

        String line = "";

        try {
            line += "{ \n" + "    \"" + DataExpression.DATATYPEPARAM + "\": \"matrix\"\n" + "    ,\""
                    + DataExpression.VALUETYPEPARAM + "\": ";

            switch (v) {
            case DOUBLE:
                line += "\"double\"\n";
                break;
            case INT:
                line += "\"int\"\n";
                break;
            case BOOLEAN:
                line += "\"boolean\"\n";
                break;
            case STRING:
                line += "\"string\"\n";
                break;
            case UNKNOWN:
                line += "\"unknown\"\n";
                break;
            case OBJECT:
                line += "\"object\"\n";
                break;
            }

            line += "    ,\"" + DataExpression.READROWPARAM + "\": " + mc.getRows() + "\n" + "    ,\""
                    + DataExpression.READCOLPARAM + "\": " + mc.getCols() + "\n";
            // only output rows_in_block and cols_in_block for binary format 
            if (outinfo == OutputInfo.BinaryBlockOutputInfo) {
                line += "    ,\"" + DataExpression.ROWBLOCKCOUNTPARAM + "\": " + mc.getRowsPerBlock() + "\n"
                        + "    ,\"" + DataExpression.COLUMNBLOCKCOUNTPARAM + "\": " + mc.getColsPerBlock() + "\n";
            }

            line += "    ,\"" + DataExpression.READNUMNONZEROPARAM + "\": " + mc.getNonZeros() + "\n" + "    ,\""
                    + DataExpression.FORMAT_TYPE + "\": ";

            if (outinfo == OutputInfo.TextCellOutputInfo) {
                line += "\"text\"\n";
            } else if (outinfo == OutputInfo.BinaryBlockOutputInfo || outinfo == OutputInfo.BinaryCellOutputInfo) {
                line += "\"binary\"\n"; // currently, there is no way to differentiate between them
            } else if (outinfo == OutputInfo.CSVOutputInfo) {
                line += "\"csv\"\n";
            } else {
                line += "\"specialized\"\n";
            }

            if (outinfo == OutputInfo.CSVOutputInfo) {
                CSVFileFormatProperties csvProperties = (CSVFileFormatProperties) formatProperties;
                line += "    ,\"" + DataExpression.DELIM_HAS_HEADER_ROW + "\": " + csvProperties.hasHeader() + "\n";
                line += "    ,\"" + DataExpression.DELIM_DELIMITER + "\": \"" + csvProperties.getDelim() + "\"\n";
            }

            line += "    ,\"description\": { \"author\": \"SystemML\" } \n" + "}";

            br.write(line);

            br.close();
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

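    // Writes a JSON metadata (.mtd) file for a scalar value in text format.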
    public static void writeScalarMetaDataFile(String mtdfile, ValueType v) throws IOException {

        Path pt = new Path(mtdfile);
        FileSystem fs = FileSystem.get(_rJob);
        BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));

        try {
            String line = "";
            line += "{ \n" + "    \"" + DataExpression.DATATYPEPARAM + "\": \"scalar\"\n" + "    ,\""
                    + DataExpression.VALUETYPEPARAM + "\": ";

            switch (v) {
            case DOUBLE:
                line += "\"double\"\n";
                break;
            case INT:
                line += "\"int\"\n";
                break;
            case BOOLEAN:
                line += "\"boolean\"\n";
                break;
            case STRING:
                line += "\"string\"\n";
                break;
            case UNKNOWN:
                line += "\"unknown\"\n";
                break;
            case OBJECT:
                throw new IOException("Write of generic object types not supported.");
            }

            line += "    ,\"" + DataExpression.FORMAT_TYPE + "\": \"text\"\n"
                    + "    ,\"description\": { \"author\": \"SystemML\" } \n" + " }";

            br.write(line);

            br.close();
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    public static double[][] readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen,
            int bclen) throws IOException, DMLRuntimeException {
        MatrixReader reader = MatrixReaderFactory.createMatrixReader(inputinfo);
        MatrixBlock mb = reader.readMatrixFromHDFS(dir, rlen, clen, brlen, bclen, rlen * clen);
        return DataConverter.convertToDoubleMatrix(mb);
    }

    public static double[] readColumnVectorFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen,
            int brlen, int bclen) throws IOException, DMLRuntimeException {
        MatrixReader reader = MatrixReaderFactory.createMatrixReader(inputinfo);
        MatrixBlock mb = reader.readMatrixFromHDFS(dir, rlen, clen, brlen, bclen, rlen * clen);
        return DataConverter.convertToDoubleVector(mb);
    }

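    // Computes the median over sorted, partitioned reducer output by picking the
    // 0.5-quantile, averaging the two middle values when the total count is even.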
    public static double median(String dir, NumItemsByEachReducerMetaData metadata) throws IOException {
        long[] counts = metadata.getNumItemsArray();
        long[] ranges = new long[counts.length];
        ranges[0] = counts[0];
        for (int i = 1; i < counts.length; i++)
            ranges[i] = ranges[i - 1] + counts[i];

        long total = ranges[ranges.length - 1];

        return pickValueWeight(dir, metadata, 0.5, total % 2 == 0)[0];
    }

    public static double pickValue(String dir, NumItemsByEachReducerMetaData metadata, double p)
            throws IOException {
        return pickValueWeight(dir, metadata, p, false)[0];
    }

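    // Picks the value at quantile p from sorted, partitioned reducer output.
    // Returns {value, weight, cumulative weight}; the latter two are -1 when
    // two middle values were averaged.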
    public static double[] pickValueWeight(String dir, NumItemsByEachReducerMetaData metadata, double p,
            boolean average) throws IOException {
        long[] counts = metadata.getNumItemsArray();
        long[] ranges = new long[counts.length];
        ranges[0] = counts[0];
        for (int i = 1; i < counts.length; i++)
            ranges[i] = ranges[i - 1] + counts[i];

        long total = ranges[ranges.length - 1];

        // do averaging only if it is asked for; and sum_wt is even
        average = average && (total % 2 == 0);

        int currentPart = 0;
        double cum_weight = 0;
        long pos = (long) Math.ceil(total * p);
        while (ranges[currentPart] < pos) {
            currentPart++;
            cum_weight += ranges[currentPart];
        }
        int offset;
        if (currentPart > 0)
            offset = (int) (pos - ranges[currentPart - 1] - 1);
        else
            offset = (int) pos - 1;

        FileSystem fs = FileSystem.get(_rJob);
        Path path = new Path(dir);
        FileStatus[] files = fs.listStatus(path);
        Path fileToRead = null;
        for (FileStatus file : files)
            if (file.getPath().toString().endsWith(Integer.toString(currentPart))) {
                fileToRead = file.getPath();
                break;
            }

        if (fileToRead == null)
            throw new RuntimeException("cannot read partition " + currentPart);

        FSDataInputStream currentStream = fs.open(fileToRead);
        DoubleWritable readKey = new DoubleWritable();
        IntWritable readValue = new IntWritable();

        boolean contain0s = false;
        long numZeros = 0;
        if (currentPart == metadata.getPartitionOfZero()) {
            contain0s = true;
            numZeros = metadata.getNumberOfZero();
        }
        ReadWithZeros reader = new ReadWithZeros(currentStream, contain0s, numZeros);

        int numRead = 0;
        while (numRead <= offset) {
            reader.readNextKeyValuePairs(readKey, readValue);
            numRead += readValue.get();
            cum_weight += readValue.get();
        }

        double ret = readKey.get();
        if (average) {
            if (numRead <= offset + 1) {
                reader.readNextKeyValuePairs(readKey, readValue);
                cum_weight += readValue.get();
                ret = (ret + readKey.get()) / 2;
            }
        }
        currentStream.close();
        return new double[] { ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight) };
    }

    /**
     * Extracts the numeric suffix of a MapReduce output file name,
     * e.g. "part-00002" yields 2.
     * 
     * @param name output file name containing a "part-" segment
     * @return the parsed part number
     */
    public static int extractNumberFromOutputFile(String name) {
        int i = name.indexOf("part-");
        assert (i >= 0);
        return Integer.parseInt(name.substring(i + 5));
    }

    /**
     * Creates the given directory on HDFS if it does not exist, applying the
     * given three-digit octal permission string (e.g. "755").
     * 
     * @param dir directory to create
     * @param permissions three-digit octal permission string
     * @throws IOException
     */
    public static void createDirIfNotExistOnHDFS(String dir, String permissions) throws IOException {
        Path path = new Path(dir);
        try {
            FileSystem fs = FileSystem.get(_rJob);
            if (!fs.exists(path)) {
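                // convert a three-digit octal permission string such as "755" into
                // a short mode: owner*64 + group*8 + other ('0' has ASCII code 48)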
                char[] c = permissions.toCharArray();
                short sU = (short) ((c[0] - 48) * 64);
                short sG = (short) ((c[1] - 48) * 8);
                short sO = (short) ((c[2] - 48));
                short mode = (short) (sU + sG + sO);
                FsPermission perm = new FsPermission(mode);
                fs.mkdirs(path, perm);
            }
        } catch (Exception ex) {
            throw new IOException("Failed in creating a non existing dir on HDFS", ex);
        }

        //NOTE: we depend on the configured umask; setting the umask in the job conf
        //or via FsPermission has no effect, and dfs.datanode.data.dir.perm has no effect either.
    }

    /**
     * Opens an output stream to the given HDFS file.
     * 
     * @param filename file on HDFS
     * @param overwrite whether to overwrite an existing file
     * @return output stream to the file
     * @throws IOException
     */
    public static FSDataOutputStream getHDFSDataOutputStream(String filename, boolean overwrite)
            throws IOException {
        FileSystem fs = FileSystem.get(_rJob);
        Path path = new Path(filename);
        return fs.create(path, overwrite);
    }
}
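
Example

A minimal usage sketch of a few of the helpers above. The method signatures come from the listing itself; the wrapper class and the paths are hypothetical, and a reachable Hadoop file system (HDFS or local) is assumed via the cached job configuration.

import com.ibm.bi.dml.runtime.util.MapReduceTool;

public class MapReduceToolExample {
    public static void main(String[] args) throws Exception {
        // hypothetical scratch path; adjust to your cluster
        String path = "/tmp/mrtool-example/value";

        MapReduceTool.writeDoubleToHDFS(3.14, path);           // write a scalar
        double d = MapReduceTool.readDoubleFromHDFSFile(path); // read it back
        System.out.println("round-trip value: " + d);

        System.out.println("exists: " + MapReduceTool.existsFileOnHDFS(path));
        System.out.println("bytes: "
                + MapReduceTool.getFilesizeOnHDFS(new org.apache.hadoop.fs.Path(path)));

        MapReduceTool.deleteFileIfExistOnHDFS(path);           // clean up
    }
}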