List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
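Before the examples from real projects below, here is a minimal standalone sketch (not taken from any of the listed source files; the host, port, and file names are made up) showing what toUri() returns for a relative path versus a fully qualified one:

import java.net.URI;
import org.apache.hadoop.fs.Path;

public class PathToUriExample {
    public static void main(String[] args) {
        // A relative Path yields a URI with no scheme or authority.
        Path relative = new Path("data/input.csv");
        URI relUri = relative.toUri();
        System.out.println(relUri);                 // data/input.csv
        System.out.println(relUri.getScheme());     // null

        // A fully qualified Path keeps its scheme and authority in the URI.
        Path qualified = new Path("hdfs://namenode:8020/user/alice/input.csv");
        URI qualUri = qualified.toUri();
        System.out.println(qualUri.getScheme());    // hdfs
        System.out.println(qualUri.getAuthority()); // namenode:8020
        System.out.println(qualUri.getPath());      // /user/alice/input.csv
    }
}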
From source file:com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration.java
License:Open Source License
public static ArrayList<Byte> getInputMatrixIndexesInMapper(JobConf job) throws IOException {
    String[] matrices = job.getStrings(INPUT_MATRICIES_DIRS_CONFIG);
    String str = job.get(MAPFUNC_INPUT_MATRICIES_INDEXES_CONFIG);
    byte[] indexes;
    if (str == null || str.isEmpty()) {
        indexes = new byte[matrices.length];
        for (int i = 0; i < indexes.length; i++)
            indexes[i] = (byte) i;
    } else {
        String[] strs = str.split(Instruction.INSTRUCTION_DELIM);
        indexes = new byte[strs.length];
        for (int i = 0; i < strs.length; i++)
            indexes[i] = Byte.parseByte(strs[i]);
    }

    int numMatrices = matrices.length;
    if (numMatrices > Byte.MAX_VALUE)
        throw new RuntimeException("number of matrices is too large > " + Byte.MAX_VALUE);

    for (int i = 0; i < matrices.length; i++)
        matrices[i] = new Path(matrices[i]).toString();

    FileSystem fs = FileSystem.get(job);
    Path thisFile = new Path(job.get("map.input.file")).makeQualified(fs);
    Path thisDir = thisFile.getParent().makeQualified(fs);
    ArrayList<Byte> representativeMatrixes = new ArrayList<Byte>();
    for (int i = 0; i < matrices.length; i++) {
        Path p = new Path(matrices[i]).makeQualified(fs);
        if (thisFile.toUri().compareTo(p.toUri()) == 0 || thisDir.toUri().compareTo(p.toUri()) == 0)
            representativeMatrixes.add(indexes[i]);
    }
    return representativeMatrixes;
}
From source file:com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration.java
License:Open Source License
public static void setupDistCacheInputs(JobConf job, String indices, String pathsString, ArrayList<String> paths) {
    job.set(DISTCACHE_INPUT_INDICES, indices);
    job.set(DISTCACHE_INPUT_PATHS, pathsString);
    Path p = null;
    for (String spath : paths) {
        p = new Path(spath);
        DistributedCache.addCacheFile(p.toUri(), job);
        DistributedCache.createSymlink(job);
    }
}
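The two examples above register Path URIs in the distributed cache. For context, a rough sketch of the consumer side (not taken from MRJobConfiguration; the class and method names below are illustrative only), where a mapper looks up one of the localized cache files by name:

import java.io.IOException;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class DistCacheLookupExample {
    // Returns the localized cache file whose name matches fileName, or null if it is not cached.
    public static Path findCachedFile(JobConf job, String fileName) throws IOException {
        Path[] localFiles = DistributedCache.getLocalCacheFiles(job);
        if (localFiles == null)
            return null;
        for (Path local : localFiles) {
            // Compare on the last URI path component of each localized file.
            if (local.toUri().getPath().endsWith("/" + fileName))
                return local;
        }
        return null;
    }
}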
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfBBMR.java
License:Open Source License
public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String specPath,
        String mapsPath, String tmpPath, String outputPath, String partOffsetsFile,
        CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter,
        int replication, String headerLine) throws Exception {
    CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst);

    long[] rlens = new long[] { numRows };
    long[] clens = new long[] { numColsAfter };
    int[] brlens = new int[] { rblk.brlen };
    int[] bclens = new int[] { rblk.bclen };
    byte[] realIndexes = new byte[] { rblk.input };
    byte[] resultIndexes = new byte[] { rblk.output };

    JobConf job = new JobConf(ApplyTfBBMR.class);
    job.setJobName("ApplyTfBB");

    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfBBMR.class);

    // set relevant classes
    job.setMapperClass(ApplyTfBBMapper.class);

    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath },
            new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL);

    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    MRJobConfiguration.setCSVReblockInstructions(job, rblkInst);

    // set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInst);

    job.setInt("dfs.replication", replication);

    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    // set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            rblkInst, null, otherInst, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false);

    // set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens,
            ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    // set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 },
            new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(ApplyTfBBMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    // configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    // turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    // set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);

    Path cachefile = new Path(new Path(partOffsetsFile), "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC_FILE, specPath);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job);

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    return new JobReturn(ret.stats, runjob.isSuccessful());
}
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java
License:Open Source License
public static JobReturn runJob(String inputPath, String specPath, String mapsPath, String tmpPath,
        String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols,
        int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf job = new JobConf(ApplyTfCSVMR.class);
    job.setJobName("ApplyTfCSV");

    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfCSVMR.class);

    // set relevant classes
    job.setMapperClass(ApplyTfCSVMapper.class);
    job.setNumReduceTasks(0);

    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);

    Path cachefile = new Path(partOffsetsFile);
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);

    // set input and output properties
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInt("dfs.replication", replication);

    FileInputFormat.addInputPath(job, new Path(inputPath));

    // delete outputPath, if it exists already.
    Path outPath = new Path(outputPath);
    FileSystem fs = FileSystem.get(job);
    fs.delete(outPath, true);
    FileOutputFormat.setOutputPath(job, outPath);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC_FILE, specPath);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);

    // turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    // Run the job
    RunningJob runjob = JobClient.runJob(job);

    // Since transform CSV produces part files w/ prefix transform-part-*,
    // delete all the "default" part-..... files
    deletePartFiles(fs, outPath);

    MatrixCharacteristics mc = new MatrixCharacteristics();
    return new JobReturn(new MatrixCharacteristics[] { mc }, runjob.isSuccessful());
}
From source file:com.ibm.crail.hdfs.CrailHadoopFileSystem.java
License:Apache License
public FSDataInputStream open(Path path, int bufferSize) throws IOException {
    CrailFile fileInfo = null;
    try {
        fileInfo = dfs.lookup(path.toUri().getRawPath()).get().asFile();
        CrailBufferedInputStream inputStream = fileInfo.getBufferedInputStream(fileInfo.getCapacity());
        return new CrailHDFSInputStream(inputStream);
    } catch (Exception e) {
        throw new IOException(e);
    }
}
From source file:com.ibm.crail.hdfs.CrailHadoopFileSystem.java
License:Apache License
@Override
public FSDataOutputStream create(Path path, FsPermission permission, boolean overwrite, int bufferSize,
        short replication, long blockSize, Progressable progress) throws IOException {
    CrailFile fileInfo = null;
    try {
        fileInfo = dfs.create(path.toUri().getRawPath(), CrailNodeType.DATAFILE, CrailStorageClass.PARENT,
                CrailLocationClass.PARENT).get().asFile();
    } catch (Exception e) {
        if (e.getMessage().contains(RpcErrors.messages[RpcErrors.ERR_PARENT_MISSING])) {
            fileInfo = null;
        } else {
            throw new IOException(e);
        }
    }

    if (fileInfo == null) {
        Path parent = path.getParent();
        this.mkdirs(parent, FsPermission.getDirDefault());
        try {
            fileInfo = dfs.create(path.toUri().getRawPath(), CrailNodeType.DATAFILE, CrailStorageClass.PARENT,
                    CrailLocationClass.PARENT).get().asFile();
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    CrailBufferedOutputStream outputStream = null;
    if (fileInfo != null) {
        try {
            fileInfo.syncDir();
            outputStream = fileInfo.getBufferedOutputStream(Integer.MAX_VALUE);
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    if (outputStream != null) {
        return new CrailHDFSOutputStream(outputStream, statistics);
    } else {
        throw new IOException("Failed to create file, path " + path.toString());
    }
}
From source file:com.ibm.crail.hdfs.CrailHadoopFileSystem.java
License:Apache License
@Override
public boolean rename(Path src, Path dst) throws IOException {
    try {
        CrailNode file = dfs.rename(src.toUri().getRawPath(), dst.toUri().getRawPath()).get();
        if (file != null) {
            file.syncDir();
        }
        return file != null;
    } catch (Exception e) {
        throw new IOException(e);
    }
}
From source file:com.ibm.crail.hdfs.CrailHadoopFileSystem.java
License:Apache License
@Override
public boolean delete(Path path, boolean recursive) throws IOException {
    try {
        CrailNode file = dfs.delete(path.toUri().getRawPath(), recursive).get();
        if (file != null) {
            file.syncDir();
        }
        return file != null;
    } catch (Exception e) {
        throw new IOException(e);
    }
}
From source file:com.ibm.crail.hdfs.CrailHadoopFileSystem.java
License:Apache License
@Override
public FileStatus[] listStatus(Path path) throws FileNotFoundException, IOException {
    try {
        CrailNode node = dfs.lookup(path.toUri().getRawPath()).get();
        Iterator<String> iter = node.getType() == CrailNodeType.DIRECTORY ? node.asDirectory().listEntries()
                : node.asMultiFile().listEntries();
        ArrayList<FileStatus> statusList = new ArrayList<FileStatus>();
        while (iter.hasNext()) {
            String filepath = iter.next();
            CrailNode directFile = dfs.lookup(filepath).get();
            if (directFile != null) {
                FsPermission permission = FsPermission.getFileDefault();
                if (directFile.getType().isDirectory()) {
                    permission = FsPermission.getDirDefault();
                }
                FileStatus status = new FileStatus(directFile.getCapacity(), directFile.getType().isContainer(),
                        CrailConstants.SHADOW_REPLICATION, CrailConstants.BLOCK_SIZE,
                        directFile.getModificationTime(), directFile.getModificationTime(), permission,
                        CrailConstants.USER, CrailConstants.USER,
                        new Path(filepath).makeQualified(this.getUri(), this.workingDir));
                statusList.add(status);
            }
        }
        FileStatus[] list = new FileStatus[statusList.size()];
        statusList.toArray(list);
        return list;
    } catch (Exception e) {
        throw new FileNotFoundException(path.toUri().getRawPath());
    }
}
From source file:com.ibm.crail.hdfs.CrailHadoopFileSystem.java
License:Apache License
@Override
public boolean mkdirs(Path path, FsPermission permission) throws IOException {
    try {
        CrailDirectory file = dfs.create(path.toUri().getRawPath(), CrailNodeType.DIRECTORY,
                CrailStorageClass.PARENT, CrailLocationClass.DEFAULT).get().asDirectory();
        file.syncDir();
        return true;
    } catch (Exception e) {
        if (e.getMessage().contains(RpcErrors.messages[RpcErrors.ERR_PARENT_MISSING])) {
            Path parent = path.getParent();
            mkdirs(parent);
            return mkdirs(path);
        } else if (e.getMessage().contains(RpcErrors.messages[RpcErrors.ERR_FILE_EXISTS])) {
            return true;
        } else {
            throw new IOException(e);
        }
    }
}