List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
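Before the examples from real projects below, here is a minimal standalone sketch (not taken from any of the listed source files; the host, port, and file names are made up) showing what toUri() returns for a relative path versus a fully qualified one:

import java.net.URI;
import org.apache.hadoop.fs.Path;

public class PathToUriExample {
    public static void main(String[] args) {
        // A relative Path yields a URI with no scheme or authority.
        Path relative = new Path("data/input.csv");
        URI relUri = relative.toUri();
        System.out.println(relUri);                 // data/input.csv
        System.out.println(relUri.getScheme());     // null

        // A fully qualified Path keeps its scheme and authority in the URI.
        Path qualified = new Path("hdfs://namenode:8020/user/alice/input.csv");
        URI qualUri = qualified.toUri();
        System.out.println(qualUri.getScheme());    // hdfs
        System.out.println(qualUri.getAuthority()); // namenode:8020
        System.out.println(qualUri.getPath());      // /user/alice/input.csv
    }
}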
From source file:com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration.java
License:Open Source License
public static ArrayList<Byte> getInputMatrixIndexesInMapper(JobConf job) throws IOException {
    String[] matrices = job.getStrings(INPUT_MATRICIES_DIRS_CONFIG);
    String str = job.get(MAPFUNC_INPUT_MATRICIES_INDEXES_CONFIG);
    byte[] indexes;
    if (str == null || str.isEmpty()) {
        indexes = new byte[matrices.length];
        for (int i = 0; i < indexes.length; i++)
            indexes[i] = (byte) i;
    } else {
        String[] strs = str.split(Instruction.INSTRUCTION_DELIM);
        indexes = new byte[strs.length];
        for (int i = 0; i < strs.length; i++)
            indexes[i] = Byte.parseByte(strs[i]);
    }

    int numMatrices = matrices.length;
    if (numMatrices > Byte.MAX_VALUE)
        throw new RuntimeException("number of matrices is too large > " + Byte.MAX_VALUE);

    for (int i = 0; i < matrices.length; i++)
        matrices[i] = new Path(matrices[i]).toString();

    FileSystem fs = FileSystem.get(job);
    Path thisFile = new Path(job.get("map.input.file")).makeQualified(fs);
    Path thisDir = thisFile.getParent().makeQualified(fs);
    ArrayList<Byte> representativeMatrixes = new ArrayList<Byte>();
    for (int i = 0; i < matrices.length; i++) {
        Path p = new Path(matrices[i]).makeQualified(fs);
        if (thisFile.toUri().compareTo(p.toUri()) == 0 || thisDir.toUri().compareTo(p.toUri()) == 0)
            representativeMatrixes.add(indexes[i]);
    }
    return representativeMatrixes;
}
From source file:com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration.java
License:Open Source License
public static void setupDistCacheInputs(JobConf job, String indices, String pathsString, ArrayList<String> paths) {
    job.set(DISTCACHE_INPUT_INDICES, indices);
    job.set(DISTCACHE_INPUT_PATHS, pathsString);
    Path p = null;
    for (String spath : paths) {
        p = new Path(spath);
        DistributedCache.addCacheFile(p.toUri(), job);
        DistributedCache.createSymlink(job);
    }
}
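The two examples above register Path URIs in the distributed cache. For context, a rough sketch of the consumer side (not taken from MRJobConfiguration; the class and method names below are illustrative only), where a mapper looks up one of the localized cache files by name:

import java.io.IOException;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class DistCacheLookupExample {
    // Returns the localized cache file whose name matches fileName, or null if it is not cached.
    public static Path findCachedFile(JobConf job, String fileName) throws IOException {
        Path[] localFiles = DistributedCache.getLocalCacheFiles(job);
        if (localFiles == null)
            return null;
        for (Path local : localFiles) {
            // Compare on the last URI path component of each localized file.
            if (local.toUri().getPath().endsWith("/" + fileName))
                return local;
        }
        return null;
    }
}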
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfBBMR.java
License:Open Source License
public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String specPath,
        String mapsPath, String tmpPath, String outputPath, String partOffsetsFile,
        CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter,
        int replication, String headerLine) throws Exception {
    CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst);

    long[] rlens = new long[] { numRows };
    long[] clens = new long[] { numColsAfter };
    int[] brlens = new int[] { rblk.brlen };
    int[] bclens = new int[] { rblk.bclen };
    byte[] realIndexes = new byte[] { rblk.input };
    byte[] resultIndexes = new byte[] { rblk.output };

    JobConf job = new JobConf(ApplyTfBBMR.class);
    job.setJobName("ApplyTfBB");

    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfBBMR.class);

    // set relevant classes
    job.setMapperClass(ApplyTfBBMapper.class);

    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath },
            new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL);

    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    MRJobConfiguration.setCSVReblockInstructions(job, rblkInst);

    // set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInst);

    job.setInt("dfs.replication", replication);

    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    // set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            rblkInst, null, otherInst, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false);

    // set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens,
            ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    // set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 },
            new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(ApplyTfBBMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    // configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    // turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    // set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);

    Path cachefile = new Path(new Path(partOffsetsFile), "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC_FILE, specPath);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job);

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    return new JobReturn(ret.stats, runjob.isSuccessful());
}
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java
License:Open Source License
public static JobReturn runJob(String inputPath, String specPath, String mapsPath, String tmpPath,
        String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols,
        int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf job = new JobConf(ApplyTfCSVMR.class);
    job.setJobName("ApplyTfCSV");

    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfCSVMR.class);

    // set relevant classes
    job.setMapperClass(ApplyTfCSVMapper.class);
    job.setNumReduceTasks(0);

    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);

    Path cachefile = new Path(partOffsetsFile);
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);

    // set input and output properties
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInt("dfs.replication", replication);

    FileInputFormat.addInputPath(job, new Path(inputPath));

    // delete outputPath, if it exists already.
    Path outPath = new Path(outputPath);
    FileSystem fs = FileSystem.get(job);
    fs.delete(outPath, true);
    FileOutputFormat.setOutputPath(job, outPath);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC_FILE, specPath);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);

    // turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    // Run the job
    RunningJob runjob = JobClient.runJob(job);

    // Since transform CSV produces part files w/ prefix transform-part-*,
    // delete all the "default" part-..... files
    deletePartFiles(fs, outPath);

    MatrixCharacteristics mc = new MatrixCharacteristics();
    return new JobReturn(new MatrixCharacteristics[] { mc }, runjob.isSuccessful());
}
From source file:com.ibm.crail.hdfs.CrailHadoopFileSystem.java
License:Apache License
public FSDataInputStream open(Path path, int bufferSize) throws IOException {
    CrailFile fileInfo = null;
    try {
        fileInfo = dfs.lookup(path.toUri().getRawPath()).get().asFile();
        CrailBufferedInputStream inputStream = fileInfo.getBufferedInputStream(fileInfo.getCapacity());
        return new CrailHDFSInputStream(inputStream);
    } catch (Exception e) {
        throw new IOException(e);
    }
}
From source file:com.ibm.crail.hdfs.CrailHadoopFileSystem.java
License:Apache License
@Override
public FSDataOutputStream create(Path path, FsPermission permission, boolean overwrite, int bufferSize,
        short replication, long blockSize, Progressable progress) throws IOException {
    CrailFile fileInfo = null;
    try {
        fileInfo = dfs.create(path.toUri().getRawPath(), CrailNodeType.DATAFILE, CrailStorageClass.PARENT,
                CrailLocationClass.PARENT).get().asFile();
    } catch (Exception e) {
        if (e.getMessage().contains(RpcErrors.messages[RpcErrors.ERR_PARENT_MISSING])) {
            fileInfo = null;
        } else {
            throw new IOException(e);
        }
    }

    if (fileInfo == null) {
        Path parent = path.getParent();
        this.mkdirs(parent, FsPermission.getDirDefault());
        try {
            fileInfo = dfs.create(path.toUri().getRawPath(), CrailNodeType.DATAFILE, CrailStorageClass.PARENT,
                    CrailLocationClass.PARENT).get().asFile();
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    CrailBufferedOutputStream outputStream = null;
    if (fileInfo != null) {
        try {
            fileInfo.syncDir();
            outputStream = fileInfo.getBufferedOutputStream(Integer.MAX_VALUE);
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    if (outputStream != null) {
        return new CrailHDFSOutputStream(outputStream, statistics);
    } else {
        throw new IOException("Failed to create file, path " + path.toString());
    }
}
From source file:com.ibm.crail.hdfs.CrailHadoopFileSystem.java
License:Apache License
@Override
public boolean rename(Path src, Path dst) throws IOException {
    try {
        CrailNode file = dfs.rename(src.toUri().getRawPath(), dst.toUri().getRawPath()).get();
        if (file != null) {
            file.syncDir();
        }
        return file != null;
    } catch (Exception e) {
        throw new IOException(e);
    }
}
From source file:com.ibm.crail.hdfs.CrailHadoopFileSystem.java
License:Apache License
@Override
public boolean delete(Path path, boolean recursive) throws IOException {
    try {
        CrailNode file = dfs.delete(path.toUri().getRawPath(), recursive).get();
        if (file != null) {
            file.syncDir();
        }
        return file != null;
    } catch (Exception e) {
        throw new IOException(e);
    }
}
From source file:com.ibm.crail.hdfs.CrailHadoopFileSystem.java
License:Apache License
@Override
public FileStatus[] listStatus(Path path) throws FileNotFoundException, IOException {
    try {
        CrailNode node = dfs.lookup(path.toUri().getRawPath()).get();
        Iterator<String> iter = node.getType() == CrailNodeType.DIRECTORY ? node.asDirectory().listEntries()
                : node.asMultiFile().listEntries();
        ArrayList<FileStatus> statusList = new ArrayList<FileStatus>();
        while (iter.hasNext()) {
            String filepath = iter.next();
            CrailNode directFile = dfs.lookup(filepath).get();
            if (directFile != null) {
                FsPermission permission = FsPermission.getFileDefault();
                if (directFile.getType().isDirectory()) {
                    permission = FsPermission.getDirDefault();
                }
                FileStatus status = new FileStatus(directFile.getCapacity(), directFile.getType().isContainer(),
                        CrailConstants.SHADOW_REPLICATION, CrailConstants.BLOCK_SIZE,
                        directFile.getModificationTime(), directFile.getModificationTime(), permission,
                        CrailConstants.USER, CrailConstants.USER,
                        new Path(filepath).makeQualified(this.getUri(), this.workingDir));
                statusList.add(status);
            }
        }
        FileStatus[] list = new FileStatus[statusList.size()];
        statusList.toArray(list);
        return list;
    } catch (Exception e) {
        throw new FileNotFoundException(path.toUri().getRawPath());
    }
}
From source file:com.ibm.crail.hdfs.CrailHadoopFileSystem.java
License:Apache License
@Override
public boolean mkdirs(Path path, FsPermission permission) throws IOException {
    try {
        CrailDirectory file = dfs.create(path.toUri().getRawPath(), CrailNodeType.DIRECTORY,
                CrailStorageClass.PARENT, CrailLocationClass.DEFAULT).get().asDirectory();
        file.syncDir();
        return true;
    } catch (Exception e) {
        if (e.getMessage().contains(RpcErrors.messages[RpcErrors.ERR_PARENT_MISSING])) {
            Path parent = path.getParent();
            mkdirs(parent);
            return mkdirs(path);
        } else if (e.getMessage().contains(RpcErrors.messages[RpcErrors.ERR_FILE_EXISTS])) {
            return true;
        } else {
            throw new IOException(e);
        }
    }
}