Example usage for org.apache.hadoop.fs FileSystem create

List of usage examples for org.apache.hadoop.fs FileSystem create

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem create.

Prototype

public FSDataOutputStream create(Path f, boolean overwrite) throws IOException 

Document

Create an FSDataOutputStream at the indicated Path.
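Before the project examples, a minimal, self-contained sketch of the call itself may help; the configuration, output path, and file contents below are illustrative only, not taken from the sources that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CreateExample {
    public static void main(String[] args) throws Exception {
        // Default configuration; picks up core-site.xml from the classpath.
        FileSystem fs = FileSystem.get(new Configuration());

        // Hypothetical output path; 'true' means overwrite an existing file.
        Path out = new Path("/tmp/create-example.txt");
        FSDataOutputStream os = fs.create(out, true);
        try {
            os.writeBytes("hello from FileSystem.create\n");
        } finally {
            os.close(); // flush and release the stream
        }
    }
}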

Usage

From source file: com.ibm.bi.dml.runtime.transform.DataTransform.java

License: Open Source License

/**
 * Convert input transformation specification file with column names into a
 * specification with corresponding column Ids. This file is sent to all the
 * relevant MR jobs.
 * 
 * @param fs
 * @param inputPath
 * @param smallestFile
 * @param colNames
 * @param prop
 * @param specFileWithNames
 * @return path to the generated specification file with column IDs
 * @throws IllegalArgumentException
 * @throws IOException
 * @throws JSONException 
 */
private static String processSpecFile(FileSystem fs, String inputPath, String smallestFile,
        HashMap<String, Integer> colNames, CSVFileFormatProperties prop, String specFileWithNames)
        throws IllegalArgumentException, IOException, JSONException {
    // load input spec file with Names
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(specFileWithNames))));
    JSONObject inputSpec = JSONHelper.parse(br);
    br.close();

    final String NAME = "name";
    final String ID = "id";
    final String METHOD = "method";
    final String VALUE = "value";
    final String MV_METHOD_MEAN = "global_mean";
    final String MV_METHOD_MODE = "global_mode";
    final String MV_METHOD_CONSTANT = "constant";
    final String BIN_METHOD_WIDTH = "equi-width";
    final String BIN_METHOD_HEIGHT = "equi-height";
    final String SCALE_METHOD_Z = "z-score";
    final String SCALE_METHOD_M = "mean-subtraction";
    final String JSON_BYPOS = "ids";

    String stmp = null;
    JSONObject entry = null;
    byte btmp = 0;

    final int[] mvList;
    int[] rcdList, dcdList, omitList;
    final int[] binList;
    final int[] scaleList;
    byte[] mvMethods = null, binMethods = null, scaleMethods = null;
    Object[] numBins = null;
    Object[] mvConstants = null;

    boolean byPositions = (inputSpec.containsKey(JSON_BYPOS)
            && ((Boolean) inputSpec.get(JSON_BYPOS)).booleanValue());

    // --------------------------------------------------------------------------
    // Omit
    if (inputSpec.containsKey(TX_METHOD.OMIT.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.OMIT.toString());
        omitList = new int[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            if (byPositions)
                omitList[i] = UtilFunctions.toInt(arrtmp.get(i));
            else {
                stmp = UtilFunctions.unquote((String) arrtmp.get(i));
                omitList[i] = colNames.get(stmp);
            }
        }
        Arrays.sort(omitList);
    } else
        omitList = null;
    // --------------------------------------------------------------------------
    // Missing value imputation
    if (inputSpec.containsKey(TX_METHOD.IMPUTE.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.IMPUTE.toString());

        mvList = new int[arrtmp.size()];
        mvMethods = new byte[arrtmp.size()];
        mvConstants = new Object[arrtmp.size()];

        for (int i = 0; i < arrtmp.size(); i++) {
            entry = (JSONObject) arrtmp.get(i);
            if (byPositions) {
                mvList[i] = UtilFunctions.toInt(entry.get(ID));
            } else {
                stmp = UtilFunctions.unquote((String) entry.get(NAME));
                mvList[i] = colNames.get(stmp);
            }

            stmp = UtilFunctions.unquote((String) entry.get(METHOD));
            if (stmp.equals(MV_METHOD_MEAN))
                btmp = (byte) 1;
            else if (stmp.equals(MV_METHOD_MODE))
                btmp = (byte) 2;
            else if (stmp.equals(MV_METHOD_CONSTANT))
                btmp = (byte) 3;
            else
                throw new IOException("Unknown missing value imputation method (" + stmp
                        + ") in transformation specification file: " + specFileWithNames);
            mvMethods[i] = btmp;

            mvConstants[i] = null;
            if (entry.containsKey(VALUE))
                mvConstants[i] = entry.get(VALUE);
        }

        Integer[] idx = new Integer[mvList.length];
        for (int i = 0; i < mvList.length; i++)
            idx[i] = i;
        Arrays.sort(idx, new Comparator<Integer>() {
            @Override
            public int compare(Integer o1, Integer o2) {
                return (mvList[o1] - mvList[o2]);
            }
        });

        // rearrange mvList, mvMethods, and mvConstants according to permutation idx
        inplacePermute(mvList, mvMethods, mvConstants, idx);
    } else
        mvList = null;
    // --------------------------------------------------------------------------
    // Recoding
    if (inputSpec.containsKey(TX_METHOD.RECODE.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.RECODE.toString());
        rcdList = new int[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            if (byPositions)
                rcdList[i] = UtilFunctions.toInt(arrtmp.get(i));
            else {
                stmp = UtilFunctions.unquote((String) arrtmp.get(i));
                rcdList[i] = colNames.get(stmp);
            }
        }
        Arrays.sort(rcdList);
    } else
        rcdList = null;
    // --------------------------------------------------------------------------
    // Binning
    if (inputSpec.containsKey(TX_METHOD.BIN.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.BIN.toString());

        binList = new int[arrtmp.size()];
        binMethods = new byte[arrtmp.size()];
        numBins = new Object[arrtmp.size()];

        for (int i = 0; i < arrtmp.size(); i++) {
            entry = (JSONObject) arrtmp.get(i);

            if (byPositions) {
                binList[i] = UtilFunctions.toInt(entry.get(ID));
            } else {
                stmp = UtilFunctions.unquote((String) entry.get(NAME));
                binList[i] = colNames.get(stmp);
            }
            stmp = UtilFunctions.unquote((String) entry.get(METHOD));
            if (stmp.equals(BIN_METHOD_WIDTH))
                btmp = (byte) 1;
            else if (stmp.equals(BIN_METHOD_HEIGHT))
                throw new IOException(
                        "Equi-height binning method is not yet supported, in transformation specification file: "
                                + specFileWithNames);
            else
                throw new IOException("Unknown missing value imputation method (" + stmp
                        + ") in transformation specification file: " + specFileWithNames);
            binMethods[i] = btmp;

            numBins[i] = entry.get(TransformationAgent.JSON_NBINS);
            if (((Integer) numBins[i]).intValue() <= 1)
                throw new IllegalArgumentException("Invalid transformation on column \""
                        + (String) entry.get(NAME) + "\". Number of bins must be greater than 1.");
        }

        Integer[] idx = new Integer[binList.length];
        for (int i = 0; i < binList.length; i++)
            idx[i] = i;
        Arrays.sort(idx, new Comparator<Integer>() {
            @Override
            public int compare(Integer o1, Integer o2) {
                return (binList[o1] - binList[o2]);
            }
        });

        // rearrange binList and binMethods according to permutation idx
        inplacePermute(binList, binMethods, numBins, idx);
    } else
        binList = null;
    // --------------------------------------------------------------------------
    // Dummycoding
    if (inputSpec.containsKey(TX_METHOD.DUMMYCODE.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.DUMMYCODE.toString());
        dcdList = new int[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            if (byPositions)
                dcdList[i] = UtilFunctions.toInt(arrtmp.get(i));
            else {
                stmp = UtilFunctions.unquote((String) arrtmp.get(i));
                dcdList[i] = colNames.get(stmp);
            }
        }
        Arrays.sort(dcdList);
    } else
        dcdList = null;
    // --------------------------------------------------------------------------
    // Scaling
    if (inputSpec.containsKey(TX_METHOD.SCALE.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.SCALE.toString());

        scaleList = new int[arrtmp.size()];
        scaleMethods = new byte[arrtmp.size()];

        for (int i = 0; i < arrtmp.size(); i++) {
            entry = (JSONObject) arrtmp.get(i);

            if (byPositions) {
                scaleList[i] = UtilFunctions.toInt(entry.get(ID));
            } else {
                stmp = UtilFunctions.unquote((String) entry.get(NAME));
                scaleList[i] = colNames.get(stmp);
            }
            stmp = UtilFunctions.unquote((String) entry.get(METHOD));
            if (stmp.equals(SCALE_METHOD_M))
                btmp = (byte) 1;
            else if (stmp.equals(SCALE_METHOD_Z))
                btmp = (byte) 2;
            else
                throw new IOException("Unknown missing value imputation method (" + stmp
                        + ") in transformation specification file: " + specFileWithNames);
            scaleMethods[i] = btmp;
        }

        Integer[] idx = new Integer[scaleList.length];
        for (int i = 0; i < scaleList.length; i++)
            idx[i] = i;
        Arrays.sort(idx, new Comparator<Integer>() {
            @Override
            public int compare(Integer o1, Integer o2) {
                return (scaleList[o1] - scaleList[o2]);
            }
        });

        // rearrange scaleList and scaleMethods according to permutation idx
        inplacePermute(scaleList, scaleMethods, null, idx);
    } else
        scaleList = null;
    // --------------------------------------------------------------------------

    // check for column IDs that are imputed with mode, but not recoded
    // These columns have to be handled separately, because the computation of mode 
    // requires the computation of distinct values (i.e., recode maps)
    ArrayList<Integer> tmpList = new ArrayList<Integer>();
    if (mvList != null)
        for (int i = 0; i < mvList.length; i++) {
            int colID = mvList[i];
            if (mvMethods[i] == 2 && (rcdList == null || Arrays.binarySearch(rcdList, colID) < 0))
                tmpList.add(colID);
        }

    int[] mvrcdList = null;
    if (tmpList.size() > 0) {
        mvrcdList = new int[tmpList.size()];
        for (int i = 0; i < tmpList.size(); i++)
            mvrcdList[i] = tmpList.get(i);
    }
    // Perform Validity Checks

    /*
              OMIT MVI RCD BIN DCD SCL
       OMIT    -   x   *   *   *   *
       MVI     x   -   *   *   *   *
       RCD     *   *   -   x   *   x
       BIN     *   *   x   -   *   x
       DCD     *   *   *   *   -   x
       SCL     *   *   x   x   x   -
     */

    if (mvList != null)
        for (int i = 0; i < mvList.length; i++) {
            int colID = mvList[i];

            if (omitList != null && Arrays.binarySearch(omitList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column cannot be both omitted and imputed.");

            if (mvMethods[i] == 1) {
                if (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0)
                    throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                            + ". A numeric column cannot be recoded.");

                if (dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0)
                    // throw an error only if the column is not binned
                    if (binList == null || Arrays.binarySearch(binList, colID) < 0)
                        throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                                + ". A numeric column cannot be dummycoded.");
            }
        }

    if (scaleList != null)
        for (int i = 0; i < scaleList.length; i++) {
            int colID = scaleList[i];
            if (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column cannot be recoded and scaled.");
            if (binList != null && Arrays.binarySearch(binList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column cannot be binned and scaled.");
            if (dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column cannot be dummycoded and scaled.");
        }

    if (rcdList != null)
        for (int i = 0; i < rcdList.length; i++) {
            int colID = rcdList[i];
            if (binList != null && Arrays.binarySearch(binList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column cannot be recoded and binned.");
        }

    // Check if dummycoded columns are either recoded or binned.
    // If not, add them to recode list.
    ArrayList<Integer> addToRcd = new ArrayList<Integer>();
    if (dcdList != null)
        for (int i = 0; i < dcdList.length; i++) {
            int colID = dcdList[i];
            boolean isRecoded = (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0);
            boolean isBinned = (binList != null && Arrays.binarySearch(binList, colID) >= 0);
            // If colID is neither recoded nor binned, then, add it to rcdList.
            if (!isRecoded && !isBinned)
                addToRcd.add(colID);
        }
    if (addToRcd.size() > 0) {
        int[] newRcdList = null;
        if (rcdList != null)
            newRcdList = Arrays.copyOf(rcdList, rcdList.length + addToRcd.size());
        else
            newRcdList = new int[addToRcd.size()];

        int i = (rcdList != null ? rcdList.length : 0);
        for (int idx = 0; i < newRcdList.length; i++, idx++)
            newRcdList[i] = addToRcd.get(idx);
        Arrays.sort(newRcdList);
        rcdList = newRcdList;
    }
    // -----------------------------------------------------------------------------

    // Prepare output spec
    JSONObject outputSpec = new JSONObject();

    if (omitList != null) {
        JSONObject rcdSpec = new JSONObject();
        rcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(omitList));
        outputSpec.put(TX_METHOD.OMIT.toString(), rcdSpec);
    }

    if (mvList != null) {
        JSONObject mvSpec = new JSONObject();
        mvSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(mvList));
        mvSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(mvMethods));
        mvSpec.put(TransformationAgent.JSON_CONSTS, toJSONArray(mvConstants));
        outputSpec.put(TX_METHOD.IMPUTE.toString(), mvSpec);
    }

    if (rcdList != null) {
        JSONObject rcdSpec = new JSONObject();
        rcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(rcdList));
        outputSpec.put(TX_METHOD.RECODE.toString(), rcdSpec);
    }

    if (binList != null) {
        JSONObject binSpec = new JSONObject();
        binSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(binList));
        binSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(binMethods));
        binSpec.put(TransformationAgent.JSON_NBINS, toJSONArray(numBins));
        outputSpec.put(TX_METHOD.BIN.toString(), binSpec);
    }

    if (dcdList != null) {
        JSONObject dcdSpec = new JSONObject();
        dcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(dcdList));
        outputSpec.put(TX_METHOD.DUMMYCODE.toString(), dcdSpec);
    }

    if (scaleList != null) {
        JSONObject scaleSpec = new JSONObject();
        scaleSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(scaleList));
        scaleSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(scaleMethods));
        outputSpec.put(TX_METHOD.SCALE.toString(), scaleSpec);
    }

    if (mvrcdList != null) {
        JSONObject mvrcd = new JSONObject();
        mvrcd.put(TransformationAgent.JSON_ATTRS, toJSONArray(mvrcdList));
        outputSpec.put(TX_METHOD.MVRCD.toString(), mvrcd);
    }

    // write out the spec with IDs
    String specFileWithIDs = MRJobConfiguration.constructTempOutputFilename();
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(specFileWithIDs), true)));
    out.write(outputSpec.toString());
    out.close();

    return specFileWithIDs;
}
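Note the recurring idiom in these examples: fs.create(new Path(...), true) creates the file with overwrite enabled, and the returned FSDataOutputStream is wrapped in an OutputStreamWriter and BufferedWriter before being closed explicitly. A minimal sketch of the same idiom with try-with-resources (a modernization on our part, not taken from the SystemML sources) would be:

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SpecWriter {
    // Write a string to the file system, overwriting any existing file at 'file'.
    static void writeText(FileSystem fs, String file, String text) throws IOException {
        try (BufferedWriter out = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(file), true)))) {
            out.write(text);
        } // the writer (and the underlying FSDataOutputStream) is closed here
    }
}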

From source file: com.ibm.bi.dml.runtime.transform.DataTransform.java

License: Open Source License

/**
 * Main method to create and/or apply transformation metadata in-memory, on a single node.
 * 
 * @param job
 * @param fs
 * @param inputPath
 * @param ncols
 * @param prop
 * @param specFileWithIDs
 * @param tfMtdPath
 * @param applyTxPath
 * @param isApply
 * @param outputPath
 * @param headerLine
 * @throws IOException
 * @throws DMLRuntimeException 
 * @throws JSONException 
 * @throws IllegalArgumentException 
 */
private static JobReturn performTransform(JobConf job, FileSystem fs, String inputPath, int ncols,
        CSVFileFormatProperties prop, String specFileWithIDs, String tfMtdPath, boolean isApply,
        MatrixObject result, String headerLine, boolean isBB, boolean isCSV)
        throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {

    String[] na = TfUtils.parseNAStrings(prop.getNAStrings());

    JSONObject spec = TfUtils.readSpec(fs, specFileWithIDs);
    TfUtils agents = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), na, spec, ncols, tfMtdPath,
            null, null);

    MVImputeAgent _mia = agents.getMVImputeAgent();
    RecodeAgent _ra = agents.getRecodeAgent();
    BinAgent _ba = agents.getBinAgent();
    DummycodeAgent _da = agents.getDummycodeAgent();

    // List of files to read
    ArrayList<Path> files = collectInputFiles(inputPath, fs);

    // ---------------------------------
    // Construct transformation metadata
    // ---------------------------------

    String line = null;
    String[] words = null;

    int numColumnsTf = 0;
    BufferedReader br = null;

    if (!isApply) {
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
            if (fileNo == 0 && prop.hasHeader())
                br.readLine(); //ignore header

            line = null;
            while ((line = br.readLine()) != null) {
                agents.prepareTfMtd(line);
            }
            br.close();
        }

        if (agents.getValid() == 0)
            throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);

        _mia.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ba.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ra.outputTransformationMetadata(tfMtdPath, fs, agents);

        // prepare agents for the subsequent phase of applying transformation metadata

        // No need to loadTxMtd for _ra, since the maps are already present in memory
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);

        _da.setRecodeMapsCP(_ra.getCPRecodeMaps());
        _da.setNumBins(_ba.getBinList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    } else {
        // Count the number of rows
        int[] rows = countNumRows(files, prop, fs, agents);
        agents.setTotal(rows[0]);
        agents.setValid(rows[1]);

        if (agents.getValid() == 0)
            throw new DMLRuntimeException(
                    "Number of rows in the transformed output (potentially, after ommitting the ones with missing values) is zero. Cannot proceed.");

        // Load transformation metadata
        // prepare agents for the subsequent phase of applying transformation metadata
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ra.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);

        _da.setRecodeMaps(_ra.getRecodeMaps());
        _da.setNumBins(_ba.getBinList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    }

    // -----------------------------
    // Apply transformation metadata
    // -----------------------------

    numColumnsTf = getNumColumnsTf(fs, headerLine, prop.getDelim(), tfMtdPath);

    MapReduceTool.deleteFileIfExistOnHDFS(result.getFileName());
    BufferedWriter out = new BufferedWriter(
            new OutputStreamWriter(fs.create(new Path(result.getFileName()), true)));
    StringBuilder sb = new StringBuilder();

    MatrixBlock mb = null;
    if (isBB) {
        int estNNZ = (int) agents.getValid() * ncols;
        mb = new MatrixBlock((int) agents.getValid(), numColumnsTf, estNNZ);

        if (mb.isInSparseFormat())
            mb.allocateSparseRowsBlock();
        else
            mb.allocateDenseBlock();
    }

    int rowID = 0; // rowid to be used in filling the matrix block

    for (int fileNo = 0; fileNo < files.size(); fileNo++) {
        br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
        if (fileNo == 0) {
            if (prop.hasHeader())
                br.readLine(); // ignore the header line from the data file

            String header = headerLine;
            String dcdHeader = _da.constructDummycodedHeader(header, agents.getDelim());
            numColumnsTf = _da.genDcdMapsAndColTypes(fs, tfMtdPath, ncols, agents);
            DataTransform.generateHeaderFiles(fs, tfMtdPath, header, dcdHeader);
        }

        line = null;
        while ((line = br.readLine()) != null) {
            words = agents.getWords(line);

            if (!agents.omit(words)) {
                words = agents.apply(words, !isApply);

                if (isCSV) {
                    out.write(agents.checkAndPrepOutputString(words, sb));
                    out.write("\n");
                }

                if (isBB) {
                    agents.check(words);
                    for (int c = 0; c < words.length; c++) {
                        if (words[c] != null && !words[c].isEmpty())
                            mb.appendValue(rowID, c, UtilFunctions.parseToDouble(words[c]));
                    }
                }
                rowID++;
            }
        }
        br.close();
    }
    out.close();

    if (mb != null) {
        mb.recomputeNonZeros();
        mb.examSparsity();

        result.acquireModify(mb);
        result.release();
        result.exportData();
    }

    MatrixCharacteristics mc = new MatrixCharacteristics(agents.getValid(), numColumnsTf,
            (int) result.getNumRowsPerBlock(), (int) result.getNumColumnsPerBlock());
    JobReturn ret = new JobReturn(new MatrixCharacteristics[] { mc }, true);

    return ret;
}

From source file: com.ibm.bi.dml.runtime.transform.DataTransform.java

License: Open Source License

public static void generateHeaderFiles(FileSystem fs, String txMtdDir, String origHeader, String newHeader)
        throws IOException {
    // write out given header line
    Path pt = new Path(txMtdDir + "/" + TransformationAgent.OUT_HEADER);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
    br.write(origHeader + "\n");
    br.close();

    // write out the new header line (after all transformations)
    pt = new Path(txMtdDir + "/" + TransformationAgent.OUT_DCD_HEADER);
    br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
    br.write(newHeader + "\n");
    br.close();
}
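Because generateHeaderFiles is public and static, it can also be invoked directly. A hypothetical invocation (the metadata directory and header strings are illustrative, not from the SystemML code) might look like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import com.ibm.bi.dml.runtime.transform.DataTransform;

public class HeaderFilesDemo {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // Illustrative arguments: metadata directory, original header,
        // and the header after dummycoding expands "city" into two columns.
        DataTransform.generateHeaderFiles(fs, "/tmp/txMtd",
                "id,age,city",
                "id,age,city_NY,city_SF");
    }
}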

From source file: com.ibm.bi.dml.runtime.transform.DummycodeAgent.java

License: Open Source License

/**
 * Method to generate dummyCodedMaps.csv, with the range of column IDs for each variable in the original data.
 * 
 * Each line in dummyCodedMaps.csv file is of the form: [ColID, 1/0, st, end]
 *       1/0 indicates if ColID is dummycoded or not
 *       [st,end] is the range of dummycoded column numbers for the given ColID
 * 
 * It also generates coltypes.csv, with the type (scale, nominal, etc.) of columns in the output.
 * Recoded columns are of type nominal, binned columns are of type ordinal, dummycoded columns are of type 
 * dummycoded, and the remaining are of type scale.
 * 
 * @param fs
 * @param txMtdDir
 * @param numCols
 * @param agents
 * @return Number of columns in the transformed data
 * @throws IOException
 */
public int genDcdMapsAndColTypes(FileSystem fs, String txMtdDir, int numCols, TfUtils agents)
        throws IOException {

    // initialize all column types in the transformed data to SCALE
    ColumnTypes[] ctypes = new ColumnTypes[(int) _dummycodedLength];
    for (int i = 0; i < _dummycodedLength; i++)
        ctypes[i] = ColumnTypes.SCALE;

    _dcdColumnMap = new int[numCols];

    Path pt = new Path(txMtdDir + "/Dummycode/" + DCD_FILE_NAME);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));

    int sum = 1;
    int idx = 0;
    for (int colID = 1; colID <= numCols; colID++) {
        if (_dcdList != null && idx < _dcdList.length && _dcdList[idx] == colID) {
            br.write(colID + "," + "1" + "," + sum + "," + (sum + _domainSizes[idx] - 1) + "\n");
            _dcdColumnMap[colID - 1] = (sum + _domainSizes[idx] - 1) - 1;

            for (int i = sum; i <= (sum + _domainSizes[idx] - 1); i++)
                ctypes[i - 1] = ColumnTypes.DUMMYCODED;

            sum += _domainSizes[idx];
            idx++;
        } else {
            br.write(colID + "," + "0" + "," + sum + "," + sum + "\n");
            _dcdColumnMap[colID - 1] = sum - 1;

            if (agents.getBinAgent().isBinned(colID) != -1)
                ctypes[sum - 1] = ColumnTypes.ORDINAL; // binned variable results in an ordinal column

            if (agents.getRecodeAgent().isRecoded(colID) != -1)
                ctypes[sum - 1] = ColumnTypes.NOMINAL;

            sum += 1;
        }
    }
    br.close();

    // Write coltypes.csv
    pt = new Path(txMtdDir + "/" + COLTYPES_FILE_NAME);
    br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));

    br.write(columnTypeToID(ctypes[0]) + "");
    for (int i = 1; i < _dummycodedLength; i++)
        br.write("," + columnTypeToID(ctypes[i]));
    br.close();

    return sum - 1;
}
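To make the loop concrete, consider a hypothetical run with numCols = 3 in which only column 2 is dummycoded, into 4 categories: dummyCodedMaps.csv would then contain the lines below, and the method would return 6 as the number of columns in the transformed data (3 - 1 + 4).

1,0,1,1
2,1,2,5
3,0,6,6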

From source file: com.ibm.bi.dml.runtime.transform.MVImputeAgent.java

License: Open Source License

private void writeTfMtd(int colID, String mean, String tfMtdDir, FileSystem fs, TfUtils agents)
        throws IOException {
    Path pt = new Path(tfMtdDir + "/Impute/" + agents.getName(colID) + MV_FILE_SUFFIX);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
    br.write(colID + TXMTD_SEP + mean + "\n");
    br.close();
}

From source file: com.ibm.bi.dml.runtime.transform.MVImputeAgent.java

License: Open Source License

private void writeTfMtd(int colID, String mean, String sdev, String tfMtdDir, FileSystem fs, TfUtils agents)
        throws IOException {
    Path pt = new Path(tfMtdDir + "/Scale/" + agents.getName(colID) + SCALE_FILE_SUFFIX);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
    br.write(colID + TXMTD_SEP + mean + TXMTD_SEP + sdev + "\n");
    br.close();
}

From source file: com.ibm.bi.dml.runtime.transform.RecodeAgent.java

License: Open Source License

/**
 * Function to output transformation metadata, including: 
 * - recode maps, 
 * - number of distinct values, 
 * - mode, and 
 * - imputation value (in the case of global_mode)
 * 
 * The column for which this function is invoked can be one of the following:
 * - just recoded                  (write .map, .ndistinct, .mode)
 * - just mv imputed (w/ global_mode)   (write .impute)
 * - both recoded and mv imputed      (write .map, .ndistinct, .mode, .impute)
 * 
 * @param map
 * @param outputDir
 * @param colID
 * @param fs
 * @param agents
 * @param fromCP
 * @throws IOException
 */
private void writeMetadata(HashMap<String, Long> map, String outputDir, int colID, FileSystem fs,
        TfUtils agents, boolean fromCP) throws IOException {
    // output recode maps and mode

    MVImputeAgent mvagent = agents.getMVImputeAgent();
    String mode = null;
    Long count = null;
    int rcdIndex = 0, modeIndex = 0;
    long maxCount = Long.MIN_VALUE;

    boolean isRecoded = (isRecoded(colID) != -1);
    boolean isModeImputed = (mvagent.getMethod(colID) == MVMethod.GLOBAL_MODE);

    Path pt = new Path(outputDir + "/Recode/" + agents.getName(colID) + RCD_MAP_FILE_SUFFIX);
    BufferedWriter br = null;
    if (isRecoded)
        br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));

    // remove NA strings
    if (agents.getNAStrings() != null)
        for (String naword : agents.getNAStrings())
            map.remove(naword);

    if (fromCP)
        map = handleMVConstant(colID, agents, map);

    if (map.size() == 0)
        throw new RuntimeException("Cannot proceed since \"" + agents.getName(colID) + "\" (id=" + colID
                + ") contains only missing values and not a single valid value -- set imputation method to \"constant\".");

    // Order entries by category (string) value
    Ordering<String> valueComparator = Ordering.natural();
    List<String> newNames = valueComparator.sortedCopy(map.keySet());

    for (String w : newNames) {
        count = map.get(w);
        ++rcdIndex;

        // output (w, count, rcdIndex)
        if (br != null)
            br.write(UtilFunctions.quote(w) + TXMTD_SEP + rcdIndex + TXMTD_SEP + count + "\n");

        if (maxCount < count) {
            maxCount = count;
            mode = w;
            modeIndex = rcdIndex;
        }

        // Replace count with recode index (useful when invoked from CP)
        map.put(w, (long) rcdIndex);
    }

    if (br != null)
        br.close();

    if (mode == null) {
        mode = "";
        maxCount = 0;
    }

    if (isRecoded) {
        // output mode
        pt = new Path(outputDir + "/Recode/" + agents.getName(colID) + MODE_FILE_SUFFIX);
        br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
        br.write(UtilFunctions.quote(mode) + "," + modeIndex + "," + maxCount);
        br.close();

        // output number of distinct values
        pt = new Path(outputDir + "/Recode/" + agents.getName(colID) + NDISTINCT_FILE_SUFFIX);
        br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
        br.write("" + map.size());
        br.close();
    }

    if (isModeImputed) {
        pt = new Path(outputDir + "/Impute/" + agents.getName(colID) + MV_FILE_SUFFIX);
        br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
        br.write(colID + "," + UtilFunctions.quote(mode));
        br.close();
    }

}

From source file: com.ibm.bi.dml.runtime.util.MapReduceTool.java

License: Open Source License

private static BufferedWriter setupOutputFile(String filename) throws IOException {
    Path pt = new Path(filename);
    FileSystem fs = FileSystem.get(_rJob);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
    return br;
}
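The helper intentionally returns an open writer; the caller owns the stream and must close it. A sketch of the expected usage from within the same class (the filename is hypothetical):

BufferedWriter bw = setupOutputFile("/tmp/example.out");
try {
    bw.write("line written through FileSystem.create\n");
} finally {
    bw.close(); // the caller, not the helper, closes the stream
}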

From source file: com.ibm.bi.dml.runtime.util.MapReduceTool.java

License: Open Source License

public static void writeMetaDataFile(String mtdfile, ValueType v, MatrixCharacteristics mc, OutputInfo outinfo,
        FileFormatProperties formatProperties) throws IOException {
    Path pt = new Path(mtdfile);
    FileSystem fs = FileSystem.get(_rJob);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
    formatProperties = (formatProperties == null && outinfo == OutputInfo.CSVOutputInfo)
            ? new CSVFileFormatProperties()
            : formatProperties;

    String line = "";

    try {
        line += "{ \n" + "    \"" + DataExpression.DATATYPEPARAM + "\": \"matrix\"\n" + "    ,\""
                + DataExpression.VALUETYPEPARAM + "\": ";

        switch (v) {
        case DOUBLE:
            line += "\"double\"\n";
            break;
        case INT:
            line += "\"int\"\n";
            break;
        case BOOLEAN:
            line += "\"boolean\"\n";
            break;
        case STRING:
            line += "\"string\"\n";
            break;
        case UNKNOWN:
            line += "\"unknown\"\n";
            break;
        case OBJECT:
            line += "\"object\"\n";
            break;
        }

        line += "    ,\"" + DataExpression.READROWPARAM + "\": " + mc.getRows() + "\n" + "    ,\""
                + DataExpression.READCOLPARAM + "\": " + mc.getCols() + "\n";
        // only output rows_in_block and cols_in_block for binary format 
        if (outinfo == OutputInfo.BinaryBlockOutputInfo) {
            line += "    ,\"" + DataExpression.ROWBLOCKCOUNTPARAM + "\": " + mc.getRowsPerBlock() + "\n"
                    + "    ,\"" + DataExpression.COLUMNBLOCKCOUNTPARAM + "\": " + mc.getColsPerBlock() + "\n";
        }

        line += "    ,\"" + DataExpression.READNUMNONZEROPARAM + "\": " + mc.getNonZeros() + "\n" + "    ,\""
                + DataExpression.FORMAT_TYPE + "\": ";

        if (outinfo == OutputInfo.TextCellOutputInfo) {
            line += "\"text\"\n";
        } else if (outinfo == OutputInfo.BinaryBlockOutputInfo || outinfo == OutputInfo.BinaryCellOutputInfo) {
            line += "\"binary\"\n"; // currently, there is no way to differentiate between them
        } else if (outinfo == OutputInfo.CSVOutputInfo) {
            line += "\"csv\"\n";
        } else {
            line += "\"specialized\"\n";
        }

        if (outinfo == OutputInfo.CSVOutputInfo) {
            CSVFileFormatProperties csvProperties = (CSVFileFormatProperties) formatProperties;
            line += "    ,\"" + DataExpression.DELIM_HAS_HEADER_ROW + "\": " + csvProperties.hasHeader() + "\n";
            line += "    ,\"" + DataExpression.DELIM_DELIMITER + "\": \"" + csvProperties.getDelim() + "\"\n";
        }

        line += "    ,\"description\": { \"author\": \"SystemML\" } \n" + "}";

        br.write(line);

        br.close();
    } catch (Exception e) {
        throw new IOException(e);
    }
}

From source file: com.ibm.bi.dml.runtime.util.MapReduceTool.java

License: Open Source License

public static void writeScalarMetaDataFile(String mtdfile, ValueType v) throws IOException {

    Path pt = new Path(mtdfile);
    FileSystem fs = FileSystem.get(_rJob);
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));

    try {
        String line = "";
        line += "{ \n" + "    \"" + DataExpression.DATATYPEPARAM + "\": \"scalar\"\n" + "    ,\""
                + DataExpression.VALUETYPEPARAM + "\": ";

        switch (v) {
        case DOUBLE:
            line += "\"double\"\n";
            break;
        case INT:
            line += "\"int\"\n";
            break;
        case BOOLEAN:
            line += "\"boolean\"\n";
            break;
        case STRING:
            line += "\"string\"\n";
            break;
        case UNKNOWN:
            line += "\"unknown\"\n";
            break;
        case OBJECT:
            throw new IOException("Write of generic object types not supported.");
        }

        line += "    ,\"" + DataExpression.FORMAT_TYPE + "\": \"text\"\n"
                + "    ,\"description\": { \"author\": \"SystemML\" } \n" + " }";

        br.write(line);

        br.close();
    } catch (Exception e) {
        throw new IOException(e);
    }
}
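For completeness, here is a small, self-contained sketch that reads such a metadata file back via the complementary FileSystem.open call; the configuration and path are illustrative only.

import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadMtdDemo {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // Hypothetical path to a metadata file written as above.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(fs.open(new Path("/tmp/scalar.mtd"))))) {
            String line;
            while ((line = br.readLine()) != null)
                System.out.println(line); // echo the JSON metadata
        }
    }
}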