Usage examples for org.apache.hadoop.fs.Path#toString()
@Override
public String toString()
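Before the project-specific examples below, here is a minimal, self-contained sketch of the common pattern they all share: build a Path, qualify it against a FileSystem, and pass the result of toString() to an API that expects a String location. The file name and the printed URI are illustrative only, not taken from any of the projects below.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class PathToStringExample {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(conf);

            // A relative path; the literal file name is just an example.
            Path p = new Path("data/example.seq");

            // Qualify the path against the file system so toString()
            // yields a full URI (scheme, authority, absolute path).
            Path qualified = p.makeQualified(fs.getUri(), fs.getWorkingDirectory());

            // Many Hadoop and utility APIs take String locations rather than Path objects.
            String location = qualified.toString();
            System.out.println(location);
        }
    }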
From source file:com.ibm.bi.dml.runtime.matrix.mapred.MMCJMRCache.java
License:Open Source License
/**
 * Loads the block buffer for the current file cursor from the local file system.
 *
 * @throws IOException
 */
protected void loadBuffer() throws IOException {
    _bufferSize = 0;
    if (_bufferMap != null)
        _bufferMap.clear();

    Path path = getFilePath(_fileCursor);
    if (_fs.exists(path)) {
        //System.out.println( "load buffer: "+path.toString() );
        _bufferSize = LocalFileUtils.readBlockSequenceFromLocal(path.toString(), _buffer, _bufferMap);
    }
}
From source file:com.ibm.bi.dml.runtime.matrix.mapred.MMCJMRCache.java
License:Open Source License
/**
 * Writes the in-memory block buffer to the local file at the current file cursor,
 * overwriting any existing file.
 *
 * @throws IOException
 */
protected void writeBuffer() throws IOException {
    if (_fileCursor < 0 || _bufferSize <= 0)
        return;

    //the old file will be overwritten
    Path path = getFilePath(_fileCursor);
    //System.out.println( "write buffer: "+path.toString() );
    LocalFileUtils.writeBlockSequenceToLocal(path.toString(), _buffer, _bufferSize);
}
From source file:com.ibm.bi.dml.runtime.matrix.mapred.PartialAggregator.java
License:Open Source License
/**
 * Copies the content of the given intermediate file to the job outputs
 * and deletes the file afterwards.
 *
 * @param path     file whose blocks are copied and then deleted
 * @param outputs  multiple-output collector to write the blocks to
 * @param j        output index
 * @param reporter MR progress reporter
 * @return number of non-zero values copied
 * @throws IOException
 */
private long copyFileContentAndDelete(Path path, CollectMultipleConvertedOutputs outputs, int j, Reporter reporter)
        throws IOException {
    long nonZeros = 0;
    if (_fs.exists(path)) {
        _bufferSize = LocalFileUtils.readBlockSequenceFromLocal(path.toString(), _buffer, _bufferMap);
        for (int i = 0; i < _bufferSize; i++) {
            outputs.collectOutput(_buffer[i].getKey(), _buffer[i].getValue(), j, reporter);
            nonZeros += _buffer[i].getValue().getNonZeros();
        }
        MapReduceTool.deleteFileIfExistOnHDFS(path, _job);
    }
    return nonZeros;
}
From source file:com.ibm.bi.dml.runtime.matrix.SortMR.java
License:Open Source License
@SuppressWarnings({ "unchecked", "rawtypes" }) public static JobReturn runJob(MRJobInstruction inst, String input, InputInfo inputInfo, long rlen, long clen, int brlen, int bclen, String combineInst, String sortInst, int numReducers, int replication, String output, OutputInfo outputInfo, boolean valueIsWeight) throws Exception { boolean sortIndexes = getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes; String tmpOutput = sortIndexes ? MRJobConfiguration.constructTempOutputFilename() : output; JobConf job = new JobConf(SortMR.class); job.setJobName("SortMR"); //setup partition file String pfname = MRJobConfiguration.setUpSortPartitionFilename(job); Path partitionFile = new Path(pfname); URI partitionUri = new URI(partitionFile.toString()); //setup input/output paths Path inputDir = new Path(input); inputDir = inputDir.makeQualified(inputDir.getFileSystem(job)); SamplingSortMRInputFormat.setInputPaths(job, inputDir); Path outpath = new Path(tmpOutput); FileOutputFormat.setOutputPath(job, outpath); MapReduceTool.deleteFileIfExistOnHDFS(outpath, job); //set number of reducers (1 if local mode) if (InfrastructureAnalyzer.isLocalMode(job)) job.setNumReduceTasks(1);/* www . j av a2 s .c om*/ else MRJobConfiguration.setNumReducers(job, numReducers, numReducers); //setup input/output format job.setInputFormat(SamplingSortMRInputFormat.class); SamplingSortMRInputFormat.setTargetKeyValueClasses(job, (Class<? extends WritableComparable>) outputInfo.outputKeyClass, outputInfo.outputValueClass); //setup instructions and meta information if (combineInst != null && !combineInst.trim().isEmpty()) job.set(COMBINE_INSTRUCTION, combineInst); job.set(SORT_INSTRUCTION, sortInst); job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight); boolean desc = getSortInstructionDescending(sortInst); job.setBoolean(SORT_DECREASING, desc); MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen); MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); int partitionWith0 = SamplingSortMRInputFormat.writePartitionFile(job, partitionFile); //setup mapper/reducer/partitioner/output classes if (getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes) { MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass); job.setMapperClass(IndexSortMapper.class); job.setReducerClass(IndexSortReducer.class); job.setMapOutputKeyClass(!desc ? 
IndexSortComparable.class : IndexSortComparableDesc.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(MatrixIndexes.class); job.setOutputValueClass(MatrixBlock.class); } else { //default case: SORT w/wo weights MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); job.setOutputFormat(CompactOutputFormat.class); job.setMapperClass(ValueSortMapper.class); job.setReducerClass(ValueSortReducer.class); job.setOutputKeyClass(outputInfo.outputKeyClass); //double job.setOutputValueClass(outputInfo.outputValueClass); //int } job.setPartitionerClass(TotalOrderPartitioner.class); //setup distributed cache DistributedCache.addCacheFile(partitionUri, job); DistributedCache.createSymlink(job); //setup replication factor job.setInt("dfs.replication", replication); MatrixCharacteristics[] s = new MatrixCharacteristics[1]; s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen); // Print the complete instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(s); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); //run mr job RunningJob runjob = JobClient.runJob(job); Group group = runjob.getCounters().getGroup(NUM_VALUES_PREFIX); numReducers = job.getNumReduceTasks(); //process final meta data long[] counts = new long[numReducers]; long total = 0; for (int i = 0; i < numReducers; i++) { counts[i] = group.getCounter(Integer.toString(i)); total += counts[i]; } //add missing 0s back to the results long missing0s = 0; if (total < rlen * clen) { if (partitionWith0 < 0) throw new RuntimeException("no partition contains 0, which is wrong!"); missing0s = rlen * clen - total; counts[partitionWith0] += missing0s; } else partitionWith0 = -1; if (sortIndexes) { //run builtin job for shifting partially sorted blocks according to global offsets //we do this in this custom form since it would not fit into the current structure //of systemml to output two intermediates (partially sorted data, offsets) out of a //single SortKeys lop boolean success = runjob.isSuccessful(); if (success) { success = runStitchupJob(tmpOutput, rlen, clen, brlen, bclen, counts, numReducers, replication, output); } MapReduceTool.deleteFileIfExistOnHDFS(tmpOutput); MapReduceTool.deleteFileIfExistOnHDFS(pfname); return new JobReturn(s[0], OutputInfo.BinaryBlockOutputInfo, success); } else { MapReduceTool.deleteFileIfExistOnHDFS(pfname); return new JobReturn(s[0], counts, partitionWith0, missing0s, runjob.isSuccessful()); } }
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfBBMapper.java
License:Open Source License
@Override
public void configure(JobConf job) {
    super.configure(job);
    try {
        _partFileWithHeader = TfUtils.isPartFileWithHeader(job);
        tfmapper = new TfUtils(job);
        tfmapper.loadTfMetadata(job, true);

        // Load relevant information for CSV Reblock
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        Path p = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));

        FileSystem fs = FileSystem.get(job);
        Path thisPath = new Path(job.get("map.input.file")).makeQualified(fs);
        String thisfile = thisPath.toString();

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, job);
        while (reader.next(key, value)) {
            // "key" needn't be checked since the offset file has information
            // about a single CSV input (the raw data file)
            if (thisfile.equals(value.filename))
                offsetMap.put(value.fileOffset, value.count);
        }
        reader.close();

        idxRow = new CSVReblockMapper.IndexedBlockRow();
        int maxBclen = 0;
        for (ArrayList<CSVReblockInstruction> insv : csv_reblock_instructions)
            for (CSVReblockInstruction in : insv) {
                if (maxBclen < in.bclen)
                    maxBclen = in.bclen;
            }

        //always dense since common csv usecase
        idxRow.getRow().data.reset(1, maxBclen, false);

    } catch (IOException e) {
        throw new RuntimeException(e);
    } catch (JSONException e) {
        throw new RuntimeException(e);
    }
}
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfBBMR.java
License:Open Source License
public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String specPath,
        String mapsPath, String tmpPath, String outputPath, String partOffsetsFile,
        CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter,
        int replication, String headerLine) throws Exception {
    CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst);

    long[] rlens = new long[] { numRows };
    long[] clens = new long[] { numColsAfter };
    int[] brlens = new int[] { rblk.brlen };
    int[] bclens = new int[] { rblk.bclen };
    byte[] realIndexes = new byte[] { rblk.input };
    byte[] resultIndexes = new byte[] { rblk.output };

    JobConf job = new JobConf(ApplyTfBBMR.class);
    job.setJobName("ApplyTfBB");

    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfBBMR.class);

    // set relevant classes
    job.setMapperClass(ApplyTfBBMapper.class);

    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath },
            new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL);
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
    MRJobConfiguration.setCSVReblockInstructions(job, rblkInst);

    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInst);

    job.setInt("dfs.replication", replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            rblkInst, null, otherInst, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false);

    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens,
            ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 },
            new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(ApplyTfBBMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);

    Path cachefile = new Path(new Path(partOffsetsFile), "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC_FILE, specPath);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job);

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    return new JobReturn(ret.stats, runjob.isSuccessful());
}
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java
License:Open Source License
public static JobReturn runJob(String inputPath, String specPath, String mapsPath, String tmpPath,
        String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols,
        int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf job = new JobConf(ApplyTfCSVMR.class);
    job.setJobName("ApplyTfCSV");

    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfCSVMR.class);

    // set relevant classes
    job.setMapperClass(ApplyTfCSVMapper.class);
    job.setNumReduceTasks(0);

    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);

    Path cachefile = new Path(partOffsetsFile);
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);

    // set input and output properties
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInt("dfs.replication", replication);

    FileInputFormat.addInputPath(job, new Path(inputPath));

    // delete outputPath, if it exists already
    Path outPath = new Path(outputPath);
    FileSystem fs = FileSystem.get(job);
    fs.delete(outPath, true);
    FileOutputFormat.setOutputPath(job, outPath);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC_FILE, specPath);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    // Run the job
    RunningJob runjob = JobClient.runJob(job);

    // Since transform CSV produces part files w/ prefix transform-part-*,
    // delete all the "default" part-..... files
    deletePartFiles(fs, outPath);

    MatrixCharacteristics mc = new MatrixCharacteristics();
    return new JobReturn(new MatrixCharacteristics[] { mc }, runjob.isSuccessful());
}
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfHelper.java
License:Open Source License
public void loadTfMetadata(JSONObject spec) throws IOException {
    Path txMtdDir = (DistributedCache.getLocalCacheFiles(_rJob))[0];
    FileSystem localFS = FileSystem.getLocal(_rJob);

    // load transformation metadata
    _mia.loadTxMtd(_rJob, localFS, txMtdDir);
    _ra.loadTxMtd(_rJob, localFS, txMtdDir);
    _ba.loadTxMtd(_rJob, localFS, txMtdDir);

    // associate recode maps and bin definitions with dummycoding agent,
    // as recoded and binned columns are typically dummycoded
    _da.setRecodeMaps(_ra.getRecodeMaps());
    _da.setNumBins(_ba.getBinList(), _ba.getNumBins());
    _da.loadTxMtd(_rJob, localFS, txMtdDir);

    FileSystem fs = FileSystem.get(_rJob);
    Path thisPath = new Path(_rJob.get("map.input.file")).makeQualified(fs);
    String thisfile = thisPath.toString();

    Path smallestFilePath = new Path(_rJob.get(MRJobConfiguration.TF_SMALLEST_FILE)).makeQualified(fs);
    if (thisfile.equals(smallestFilePath.toString()))
        _partFileWithHeader = true;
    else
        _partFileWithHeader = false;
}
From source file:com.ibm.bi.dml.runtime.transform.TfUtils.java
License:Open Source License
protected static boolean checkValidInputFile(FileSystem fs, Path path, boolean err) throws IOException {
    // check non-existing file
    if (!fs.exists(path))
        if (err)
            throw new IOException("File " + path.toString() + " does not exist on HDFS/LFS.");
        else
            return false;

    // check for empty file
    if (MapReduceTool.isFileEmpty(fs, path.toString()))
        if (err)
            throw new EOFException("Empty input file " + path.toString() + ".");
        else
            return false;

    return true;
}
From source file:com.ibm.bi.dml.runtime.transform.TfUtils.java
License:Open Source License
public static String getPartFileName(JobConf job) throws IOException {
    FileSystem fs = FileSystem.get(job);
    Path thisPath = new Path(job.get("map.input.file")).makeQualified(fs);
    return thisPath.toString();
}