Example usage for org.apache.hadoop.fs Path toString

List of usage examples for org.apache.hadoop.fs Path toString

Introduction

On this page you can find usage examples for org.apache.hadoop.fs.Path.toString().

Prototype

@Override
public String toString()
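
A minimal, self-contained sketch of what Path.toString() returns (the path names below are illustrative, not taken from the examples that follow):

import org.apache.hadoop.fs.Path;

public class PathToStringDemo {
    public static void main(String[] args) {
        // toString() returns the string form of the path, including scheme and authority if present
        Path local = new Path("/user/biadmin/csv/file.csv");
        System.out.println(local.toString()); // /user/biadmin/csv/file.csv

        // the Path(parent, child) constructor composes paths with a separator
        Path sibling = new Path(local.getParent(), "other.csv");
        System.out.println(sibling.toString()); // /user/biadmin/csv/other.csv

        // a fully qualified path keeps its scheme in toString()
        Path qualified = new Path("hdfs://namenode:8020/tmp/out");
        System.out.println(qualified.toString()); // hdfs://namenode:8020/tmp/out
    }
}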

Usage

From source file:com.ibm.bi.dml.runtime.io.WriterTextCSV.java

License:Open Source License

/**
 * @param srcFileName
 * @param destFileName
 * @param rlen
 * @param clen
 * @throws IOException
 */
@SuppressWarnings("unchecked")
public void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
        // simply move srcFile to destFile

        /*
         * TODO: Remove this roundabout way! 
         * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv 
         *              & the only path that exists already on HDFS is /user/biadmin/csv/.
         * In this case: the directory structure /user/biadmin/csv/temp/out must be created. 
         * Simple hdfs.rename() does not seem to create this directory structure.
         */

        // delete the destination file, if exists already
        //boolean ret1 = 
        hdfs.delete(destFilePath, true);

        // Create /user/biadmin/csv/temp/out/file.csv so that ..../temp/out/ is created.
        //boolean ret2 = 
        hdfs.createNewFile(destFilePath);

        // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
        //boolean ret3 = 
        hdfs.delete(destFilePath, true);

        // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
        //boolean ret4 = 
        hdfs.rename(srcFilePath, destFilePath);

        //System.out.println("Return values = del:" + ret1 + ", createNew:" + ret2 + ", del:" + ret3 + ", rename:" + ret4);
        return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1)
            sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {

        // compute sorted order among part files
        ArrayList<Path> files = new ArrayList<Path>();
        for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);

        // first part file path
        Path firstpart = files.get(0);

        // create a temp file, and add header and contents of first part
        Path tmp = new Path(firstpart.toString() + ".tmp");
        OutputStream out = hdfs.create(tmp, true);
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy rest of the data from firstpart
        InputStream in = null;
        try {
            in = hdfs.open(firstpart);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }

        // rename tmp to firstpart
        hdfs.delete(firstpart, true);
        hdfs.rename(tmp, firstpart);

        // rename srcfile to destFile
        hdfs.delete(destFilePath, true);
        hdfs.createNewFile(destFilePath); // force the creation of directory structure
        hdfs.delete(destFilePath, true); // delete the file, but preserve the directory structure
        hdfs.rename(srcFilePath, destFilePath); // move the data 

    } else if (hdfs.isFile(srcFilePath)) {
        // create destination file
        OutputStream out = hdfs.create(destFilePath, true);

        // write header
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy the data from srcFile
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}
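
The TODO in the method above notes that the delete/createNewFile/delete sequence exists only to materialize the destination's parent directories before the rename. A hedged sketch of a more direct alternative (not from the original source; it assumes the same conf, hdfs, srcFilePath, and destFilePath as above):

// Hypothetical simplification: create the parent directories explicitly, then rename once.
hdfs.delete(destFilePath, true);          // remove any stale destination
Path parent = destFilePath.getParent();
if (parent != null && !hdfs.exists(parent))
    hdfs.mkdirs(parent);                  // materialize e.g. /user/biadmin/csv/temp/out
hdfs.rename(srcFilePath, destFilePath);   // move the data in one step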

From source file:com.ibm.bi.dml.runtime.io.WriterTextCSVParallel.java

License:Open Source License

/**
 * @param path
 * @param job
 * @param src
 * @param rlen
 * @param clen
 * @param nnz
 * @param props
 * @throws IOException
 */
@Override
protected void writeCSVMatrixToHDFS(Path path, JobConf job, MatrixBlock src, long rlen, long clen, long nnz,
        CSVFileFormatProperties props) throws IOException {
    //estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimateSizeTextOutput(src.getNumRows(), src.getNumColumns(),
            src.getNonZeros(), OutputInfo.CSVOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);

    //determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);

    //fall back to sequential write if dop is 1 (e.g., <128MB) in order to create single file
    if (numThreads <= 1) {
        super.writeCSVMatrixToHDFS(path, job, src, rlen, clen, nnz, props);
        return;
    }

    //create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path.toString(), DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);

    //create and execute tasks
    try {
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        ArrayList<WriteCSVTask> tasks = new ArrayList<WriteCSVTask>();
        int blklen = (int) Math.ceil((double) rlen / numThreads);
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            Path newPath = new Path(path, String.format("0-m-%05d", i));
            tasks.add(new WriteCSVTask(newPath, job, src, i * blklen, (int) Math.min((i + 1) * blklen, rlen),
                    props));
        }

        //wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        //check for exceptions 
        for (Future<Object> task : rt)
            task.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel write of csv output.", e);
    }
}
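
For reference, the part files above are named by combining the Path(parent, child) constructor with String.format. A standalone sketch of the resulting path strings (the output directory is illustrative):

import org.apache.hadoop.fs.Path;

public class PartFileNaming {
    public static void main(String[] args) {
        Path dir = new Path("/tmp/csv_out"); // stands in for the method's output path
        for (int i = 0; i < 3; i++) {
            Path part = new Path(dir, String.format("0-m-%05d", i));
            System.out.println(part.toString());
            // prints /tmp/csv_out/0-m-00000 through /tmp/csv_out/0-m-00002
        }
    }
}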

From source file:com.ibm.bi.dml.runtime.matrix.CleanupMR.java

License:Open Source License

/**
 * @param path
 * @param numTasks
 * @throws DMLRuntimeException
 * @throws IOException
 */
private static void writeCleanupTasksToFile(Path path, int numTasks) throws DMLRuntimeException, IOException {
    BufferedWriter br = null;
    try {
        FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
        br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        for (int i = 1; i <= numTasks; i++)
            br.write("CLEANUP TASK " + i + "\n");
    } catch (Exception ex) {
        throw new DMLRuntimeException("Error writing cleanup tasks to taskfile " + path.toString(), ex);
    } finally {
        if (br != null)
            br.close();
    }
}
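
On Java 7 and later, the explicit finally block above can be replaced by try-with-resources. A hedged sketch of that variant (same imports and behavior assumed):

// Hypothetical try-with-resources form of the method body above.
FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
try (BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)))) {
    for (int i = 1; i <= numTasks; i++)
        br.write("CLEANUP TASK " + i + "\n");
} catch (Exception ex) {
    throw new DMLRuntimeException("Error writing cleanup tasks to taskfile " + path.toString(), ex);
}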

From source file:com.ibm.bi.dml.runtime.matrix.CSVReblockMR.java

License:Open Source License

private static JobReturn runCSVReblockJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos,
        long[] rlens, long[] clens, int[] brlens, int[] bclens, String reblockInstructions,
        String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes,
        String[] outputs, OutputInfo[] outputInfos, Path counterFile, String[] smallestFiles) throws Exception {
    JobConf job;
    job = new JobConf(ReblockMR.class);
    job.setJobName("CSV-Reblock-MR");

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
            ConvertTarget.CELL);

    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);

    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    //set up the replication factor for the results
    job.setInt("dfs.replication", replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            reblockInstructions, null, otherInstructionsInReducer, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes,
            false);

    MatrixCharacteristics[] stats = ret.stats;

    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens,
            ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    // Print the complete instruction
    //if (LOG.isTraceEnabled())
    //   inst.printCompelteMRJobInstruction(stats);

    // Update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            resultDimsUnknown[i] = (byte) 1;
        } else {
            resultDimsUnknown[i] = (byte) 0;
        }
    }

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true,
            true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVReblockMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    Path cachefile = new Path(counterFile, "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    job.set(ROWID_FILE_NAME, cachefile.toString());

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(counterFile, job);

    /* Process different counters */

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        //   System.out.println("result #"+resultIndexes[i]+" ===>\n"+stats[i]);
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file:com.ibm.bi.dml.runtime.matrix.data.hadoopfix.MultipleInputs.java

License:Apache License

/**
 * Add a {@link Path} with a custom {@link InputFormat} to the list of
 * inputs for the map-reduce job.
 * 
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path, Class<? extends InputFormat> inputFormatClass) {

    String inputFormatMapping = path.toString() + ";" + inputFormatClass.getName();
    String inputFormats = conf.get("mapred.input.dir.formats");
    conf.set("mapred.input.dir.formats",
            inputFormats == null ? inputFormatMapping : inputFormats + "," + inputFormatMapping);

    conf.setInputFormat(DelegatingInputFormat.class);
}
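
A hedged usage sketch for the method above (the job, paths, and input formats are illustrative):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

// Hypothetical caller: register two inputs with different input formats.
JobConf conf = new JobConf();
MultipleInputs.addInputPath(conf, new Path("/data/a.csv"), TextInputFormat.class);
MultipleInputs.addInputPath(conf, new Path("/data/b.seq"), SequenceFileInputFormat.class);
// mapred.input.dir.formats now maps each path string (via Path.toString()) to its format class.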

From source file:com.ibm.bi.dml.runtime.matrix.data.hadoopfix.MultipleInputs.java

License:Apache License

/**
 * Add a {@link Path} with a custom {@link InputFormat} and
 * {@link Mapper} to the list of inputs for the map-reduce job.
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 * @param mapperClass {@link Mapper} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path, Class<? extends InputFormat> inputFormatClass,
        Class<? extends Mapper> mapperClass) {

    addInputPath(conf, path, inputFormatClass);

    String mapperMapping = path.toString() + ";" + mapperClass.getName();
    String mappers = conf.get("mapred.input.dir.mappers");
    conf.set("mapred.input.dir.mappers", mappers == null ? mapperMapping : mappers + "," + mapperMapping);

    conf.setMapperClass(DelegatingMapper.class);
}

From source file:com.ibm.bi.dml.runtime.matrix.data.MultipleOutputCommitter.java

License:Open Source License

@Override
public void setupJob(JobContext context) throws IOException {
    super.setupJob(context);
    // get output file directories and create directories
    JobConf conf = context.getJobConf();
    String[] loutputs = MRJobConfiguration.getOutputs(conf);
    for (String dir : loutputs) {
        Path path = new Path(dir);
        FileSystem fs = path.getFileSystem(conf);
        if (!fs.mkdirs(path))
            LOG.error("Mkdirs failed to create " + path.toString());
    }
}

From source file:com.ibm.bi.dml.runtime.matrix.data.MultipleOutputCommitter.java

License:Open Source License

@Override
public void commitTask(TaskAttemptContext context) throws IOException {
    JobConf conf = context.getJobConf();
    TaskAttemptID attemptId = context.getTaskAttemptID();

    // get the mapping between index to output filename
    outputs = MRJobConfiguration.getOutputs(conf);

    //get temp task output path (compatible with hadoop1 and hadoop2)
    Path taskOutPath = FileOutputFormat.getWorkOutputPath(conf);
    FileSystem fs = taskOutPath.getFileSystem(conf);
    if (!fs.exists(taskOutPath))
        throw new IOException("Task output path " + taskOutPath.toString() + "does not exist.");

    // Move the task outputs to their final places
    context.getProgressible().progress();
    moveFinalTaskOutputs(context, fs, taskOutPath);

    // Delete the temporary task-specific output directory
    if (!fs.delete(taskOutPath, true))
        LOG.debug("Failed to delete the temporary output directory of task: " + attemptId + " - " + taskOutPath);
}

From source file:com.ibm.bi.dml.runtime.matrix.mapred.CSVAssignRowIDMapper.java

License:Open Source License

@Override
@SuppressWarnings("deprecation")
public void configure(JobConf job) {
    byte thisIndex;
    try {
        //it doesn't make sense to have repeated file names in the input, since this is for reblock
        thisIndex = MRJobConfiguration.getInputMatrixIndexesInMapper(job).get(0);
        outKey.set(thisIndex);
        FileSystem fs = FileSystem.get(job);
        Path thisPath = new Path(job.get("map.input.file")).makeQualified(fs);
        filename = thisPath.toString();
        String[] strs = job.getStrings(CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT);
        Path headerPath = new Path(strs[thisIndex]).makeQualified(fs);
        if (headerPath.toString().equals(filename))
            headerFile = true;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    try {
        CSVReblockInstruction[] reblockInstructions = MRJobConfiguration.getCSVReblockInstructions(job);
        for (CSVReblockInstruction ins : reblockInstructions) {
            if (ins.input == thisIndex) {
                delim = Pattern.quote(ins.delim);
                ignoreFirstLine = ins.hasHeader;
                break;
            }
        }
    } catch (DMLUnsupportedOperationException e) {
        throw new RuntimeException(e);
    } catch (DMLRuntimeException e) {
        throw new RuntimeException(e);
    }

    // load properties relevant to transform
    try {
        boolean omit = job.getBoolean(MRJobConfiguration.TF_TRANSFORM, false);
        if (omit)
            _agents = new TfUtils(job, true);
    } catch (IOException e) {
        throw new RuntimeException(e);
    } catch (JSONException e) {
        throw new RuntimeException(e);
    }
}
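
The header-file check above compares fully qualified path strings. A minimal standalone sketch of that pattern (file system and paths are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class QualifiedCompare {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // makeQualified() resolves scheme, authority, and working directory, so two
        // references to the same file yield identical toString() results
        Path a = new Path("data/part-00000").makeQualified(fs);
        Path b = new Path(fs.getWorkingDirectory(), "data/part-00000");
        System.out.println(a.toString().equals(b.toString())); // true on the same file system
    }
}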

From source file:com.ibm.bi.dml.runtime.matrix.mapred.CSVReblockMapper.java

License:Open Source License

@Override
@SuppressWarnings("deprecation")
public void configure(JobConf job) {
    super.configure(job);
    //get the number of columns per block

    //load the offset mapping
    byte matrixIndex = representativeMatrixes.get(0);
    try {
        FileSystem fs = FileSystem.get(job);
        Path thisPath = new Path(job.get("map.input.file")).makeQualified(fs);
        String filename = thisPath.toString();
        Path headerPath = new Path(job.getStrings(CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT)[matrixIndex])
                .makeQualified(fs);
        if (headerPath.toString().equals(filename))
            headerFile = true;

        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        Path p = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, job);
        while (reader.next(key, value)) {
            if (key.get() == matrixIndex && filename.equals(value.filename))
                offsetMap.put(value.fileOffset, value.count);
        }
        reader.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    CSVReblockInstruction ins = csv_reblock_instructions.get(0).get(0);
    _delim = ins.delim;
    ignoreFirstLine = ins.hasHeader;

    idxRow = new IndexedBlockRow();
    int maxBclen = 0;

    for (ArrayList<CSVReblockInstruction> insv : csv_reblock_instructions)
        for (CSVReblockInstruction in : insv) {
            if (maxBclen < in.bclen)
                maxBclen = in.bclen;
        }

    //always dense since common csv usecase
    idxRow.getRow().data.reset(1, maxBclen, false);
}