List of usage examples for org.apache.hadoop.fs.Path#makeQualified(FileSystem)
@Deprecated
public Path makeQualified(FileSystem fs)
From source file:com.datasalt.utils.commons.TestRepoTool.java
License:Apache License
/**
 * Walks a {@link RepoTool} through creating two packages in a scratch repository on the
 * local file system and checks which package is reported as newest for a given status.
 *
 * @throws IOException if any file system operation fails.
 */
@Test
public void test() throws IOException {
    FileSystem localFs = FileSystem.getLocal(getConf());
    Path repoDir = new Path("repoTest87463829");
    // Start from a clean slate in case an earlier run left the directory behind.
    HadoopUtils.deleteIfExists(localFs, repoDir);

    RepoTool repoTool = new RepoTool(repoDir, "pkg", localFs);
    assertNull(repoTool.getNewestPackageWithStatus(PackageStatus.NOT_DEFINED));

    // First package: its name starts with the configured prefix and it is the newest one.
    Path firstPkg = repoTool.newPackage();
    String namePrefix = firstPkg.getName().substring(0, 3);
    assertEquals("pkg", namePrefix);
    assertEquals(firstPkg.makeQualified(localFs),
            repoTool.getNewestPackageWithStatus(PackageStatus.NOT_DEFINED));

    // Second package replaces the first one as the newest NOT_DEFINED package.
    Path secondPkg = repoTool.newPackage();
    assertEquals(secondPkg.makeQualified(localFs),
            repoTool.getNewestPackageWithStatus(PackageStatus.NOT_DEFINED));
    assertEquals(2, repoTool.getPackages().length);

    // After flagging the second package as FINISHED it must be found under that status.
    RepoTool.setStatus(localFs, secondPkg, PackageStatus.FINISHED);
    assertEquals(secondPkg.makeQualified(localFs),
            repoTool.getNewestPackageWithStatus(PackageStatus.FINISHED));

    HadoopUtils.deleteIfExists(localFs, repoDir);
}
From source file:com.ebay.erl.mobius.core.criterion.TupleRestrictions.java
License:Apache License
/** * Create a tuple criterion that only accepts tuples when the value * of the <code>column</code> are presented in the given <code>file</code> * <p>//from w ww . j av a 2s . c o m * * The assumption of the file is that, it's single column and one to many * line text file. Each line is read into a case insensitive set, and * using the set to check the value of the <code>column</code> within * the set or not. * * * @param column the name of a column to be tested that whether its value is in * the given <code>file</code> or not * * @param file a single column and multiple lines of file that contains strings/numbers, * each line is treated as a single unit. * * @return an instance of {@link TupleCriterion} that extracts only the records * when the value of its <code>column</code> are presented in the given * <code>file</code>. * * @throws FileNotFoundException if the given file cannot be found. */ public static TupleCriterion within(final String column, File file) throws FileNotFoundException { final File f = TupleRestrictions.checkFileExist(file); return new TupleCriterion() { private static final long serialVersionUID = -1121221619118915652L; private Set<String> set; @Override public void setConf(Configuration conf) { try { if (conf.get("tmpfiles") == null || conf.get("tmpfiles").trim().length() == 0) { conf.set("tmpfiles", validateFiles(f.getAbsolutePath(), conf)); } else { conf.set("tmpfiles", validateFiles(f.getAbsolutePath(), conf) + "," + conf.get("tmpfiles")); } } catch (IOException e) { throw new IllegalArgumentException(e); } } /** * COPIED FROM org.apache.hadoop.util.GenericOptionsParser */ private String validateFiles(String files, Configuration conf) throws IOException { if (files == null) return null; String[] fileArr = files.split(","); String[] finalArr = new String[fileArr.length]; for (int i = 0; i < fileArr.length; i++) { String tmp = fileArr[i]; String finalPath; Path path = new Path(tmp); URI pathURI = path.toUri(); FileSystem localFs = 
FileSystem.getLocal(conf); if (pathURI.getScheme() == null) { // default to the local file system // check if the file exists or not first if (!localFs.exists(path)) { throw new FileNotFoundException("File " + tmp + " does not exist."); } finalPath = path.makeQualified(localFs).toString(); } else { // check if the file exists in this file system // we need to recreate this filesystem object to copy // these files to the file system jobtracker is running // on. FileSystem fs = path.getFileSystem(conf); if (!fs.exists(path)) { throw new FileNotFoundException("File " + tmp + " does not exist."); } finalPath = path.makeQualified(fs).toString(); try { fs.close(); } catch (IOException e) { } ; } finalArr[i] = finalPath; } return StringUtils.arrayToString(finalArr); } @Override protected boolean evaluate(Tuple tuple, Configuration configuration) { if (set == null) { set = new CaseInsensitiveTreeSet(); BufferedReader br = null; try { br = new BufferedReader(new FileReader(new File(f.getName()))); String newLine = null; while ((newLine = br.readLine()) != null) { this.set.add(newLine); } } catch (IOException e) { throw new RuntimeException(e); } finally { try { br.close(); } catch (Throwable e) { } } } String value = tuple.getString(column); if (value != null) { return this.set.contains(value); } else { return false; } } @Override public String[] getInvolvedColumns() { return new String[] { column }; } }; }
From source file:com.github.gaoyangthu.demo.mapred.terasort.TeraSort.java
License:Apache License
public int run(String[] args) throws Exception { LOG.info("starting"); JobConf job = (JobConf) getConf();//from w ww . jav a2 s . c o m Path inputDir = new Path(args[0]); inputDir = inputDir.makeQualified(inputDir.getFileSystem(job)); Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME); URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME); TeraInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setJobName("TeraSort"); job.setJarByClass(TeraSort.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormat(TeraInputFormat.class); job.setOutputFormat(TeraOutputFormat.class); job.setPartitionerClass(TotalOrderPartitioner.class); TeraInputFormat.writePartitionFile(job, partitionFile); DistributedCache.addCacheFile(partitionUri, job); DistributedCache.createSymlink(job); job.setInt("dfs.replication", 1); TeraOutputFormat.setFinalSync(job, true); JobClient.runJob(job); LOG.info("done"); return 0; }
From source file:com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.java
License:Open Source License
/** * Renames src to dst. Src must not be equal to the filesystem root. * * @param src Source path.//ww w . j a v a2 s. c o m * @param dst Destination path. * @return true if rename succeeds. * @throws FileNotFoundException if src does not exist. * @throws IOException if an error occurs. */ @Override public boolean rename(Path src, Path dst) throws IOException { // Even though the underlying GCSFS will also throw an IAE if src is root, since our filesystem // root happens to equal the global root, we want to explicitly check it here since derived // classes may not have filesystem roots equal to the global root. if (src.makeQualified(this).equals(getFileSystemRoot())) { LOG.debug("GHFS.rename: src is root: '{}'", src); return false; } long startTime = System.nanoTime(); Preconditions.checkArgument(src != null, "src must not be null"); Preconditions.checkArgument(dst != null, "dst must not be null"); checkOpen(); try { LOG.debug("GHFS.rename: {} -> {}", src, dst); URI srcPath = getGcsPath(src); URI dstPath = getGcsPath(dst); gcsfs.rename(srcPath, dstPath); } catch (IOException e) { LOG.debug("GHFS.rename", e); return false; } long duration = System.nanoTime() - startTime; increment(Counter.RENAME); increment(Counter.RENAME_TIME, duration); return true; }
From source file:com.google.cloud.hadoop.fs.gcs.GoogleHadoopGlobalRootedFileSystem.java
License:Open Source License
@Override public URI getGcsPath(Path hadoopPath) { LOG.debug("GHFS.getGcsPath: {}", hadoopPath); // Convert to fully qualified absolute path; the Path object will callback to get our current // workingDirectory as part of fully resolving the path. Path resolvedPath = hadoopPath.makeQualified(this); // Handle root. if (resolvedPath.equals(getFileSystemRoot())) { return GoogleCloudStorageFileSystem.GCS_ROOT; }/* ww w.jav a 2 s . c o m*/ // Need to convert scheme to GCS scheme and possibly move bucket into authority String authorityString = null; if (!Strings.isNullOrEmpty(resolvedPath.toUri().getAuthority())) { authorityString = "/" + resolvedPath.toUri().getAuthority(); } else { authorityString = ""; } // Construct GCS path uri. String path = GoogleCloudStorageFileSystem.SCHEME + ":/" + authorityString + resolvedPath.toUri().getPath(); URI gcsPath = null; try { gcsPath = new URI(path); } catch (URISyntaxException e) { String msg = String.format("Invalid path: %s", hadoopPath); throw new IllegalArgumentException(msg, e); } LOG.debug("GHFS.getGcsPath: {} -> {}", hadoopPath, gcsPath); return gcsPath; }
From source file:com.ibm.bi.dml.runtime.matrix.CSVReblockMR.java
License:Open Source License
/** * Method to find the first (part)file in the order given by <code>fs.listStatus()</code> among all (part)files in <code>inpathPath</code>. * /*w w w. ja v a2 s .c o m*/ * @param job * @param inputPath * @return * @throws IOException * @throws FileNotFoundException */ public static String findSmallestFile(JobConf job, String inputPath) throws FileNotFoundException, IOException { String smallestFile = null; Path p = new Path(inputPath); FileSystem fs = p.getFileSystem(job); if (!fs.isDirectory(p)) smallestFile = p.makeQualified(fs).toString(); else { FileStatus[] stats = fs.listStatus(p, hiddenFileFilter); if (stats.length == 0) smallestFile = ""; else { smallestFile = stats[0].getPath().toString(); for (int j = 1; j < stats.length; j++) { String f = stats[j].getPath().toString(); if (f.compareTo(smallestFile) < 0) smallestFile = f; } } } return smallestFile; }
From source file:com.ibm.bi.dml.runtime.matrix.CSVReblockMR.java
License:Open Source License
/**
 * Runs the CSV reblock as two consecutive jobs: first the row-ID assignment job,
 * then the actual CSV reblock job, which is fed the dimensions and counter file
 * produced by the first one.
 *
 * @param inst job instruction (not referenced by this method)
 * @param inputs input paths, one per input matrix
 * @param inputInfos input format descriptors, one per input
 * @param rlens expected row counts; values &lt;= 0 are not validated
 * @param clens expected column counts; values &lt;= 0 are not validated
 * @param brlens rows per block, one per input
 * @param bclens columns per block, one per input
 * @param reblockInstructions reblock instructions passed to both jobs
 * @param otherInstructionsInReducer additional reducer instructions for the reblock job
 * @param numReducers number of reducers for the reblock job
 * @param replication replication factor passed to both jobs
 * @param resultIndexes result indexes for the reblock job
 * @param outputs output paths for the reblock job
 * @param outputInfos output format descriptors for the reblock job
 * @return the result of the CSV reblock job
 * @throws RuntimeException if a positive expected dimension differs from the one
 *         computed by the row-ID assignment job
 * @throws Exception if one of the jobs or a file system lookup fails
 */
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, String reblockInstructions,
        String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes,
        String[] outputs, OutputInfo[] outputInfos) throws Exception {
    // Determine the smallest (part)file of every input once. The original code
    // recomputed the very same values inline in a second loop, doubling the
    // file system listings without changing the result.
    String[] smallestFiles = new String[inputs.length];
    JobConf job = new JobConf();
    for (int i = 0; i < inputs.length; i++) {
        smallestFiles[i] = findSmallestFile(job, inputs[i]);
    }

    AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(inputs, inputInfos, brlens, bclens,
            reblockInstructions, replication, smallestFiles);

    // Cross-check caller-provided dimensions (where given) against the computed ones.
    for (int i = 0; i < rlens.length; i++) {
        if ((rlens[i] > 0 && rlens[i] != ret1.rlens[i]) || (clens[i] > 0 && clens[i] != ret1.clens[i])) {
            throw new RuntimeException("Dimension doesn't mach for input matrix " + i + ", expected ("
                    + rlens[i] + ", " + clens[i] + ") but real (" + ret1.rlens[i] + ", " + ret1.clens[i]
                    + ")");
        }
    }

    JobReturn ret = CSVReblockMR.runCSVReblockJob(null, inputs, inputInfos, ret1.rlens, ret1.clens, brlens,
            bclens, reblockInstructions, otherInstructionsInReducer, numReducers, replication,
            resultIndexes, outputs, outputInfos, ret1.counterFile, smallestFiles);
    return ret;
}
From source file:com.ibm.bi.dml.runtime.matrix.SortMR.java
License:Open Source License
@SuppressWarnings({ "unchecked", "rawtypes" }) public static JobReturn runJob(MRJobInstruction inst, String input, InputInfo inputInfo, long rlen, long clen, int brlen, int bclen, String combineInst, String sortInst, int numReducers, int replication, String output, OutputInfo outputInfo, boolean valueIsWeight) throws Exception { boolean sortIndexes = getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes; String tmpOutput = sortIndexes ? MRJobConfiguration.constructTempOutputFilename() : output; JobConf job = new JobConf(SortMR.class); job.setJobName("SortMR"); //setup partition file String pfname = MRJobConfiguration.setUpSortPartitionFilename(job); Path partitionFile = new Path(pfname); URI partitionUri = new URI(partitionFile.toString()); //setup input/output paths Path inputDir = new Path(input); inputDir = inputDir.makeQualified(inputDir.getFileSystem(job)); SamplingSortMRInputFormat.setInputPaths(job, inputDir); Path outpath = new Path(tmpOutput); FileOutputFormat.setOutputPath(job, outpath); MapReduceTool.deleteFileIfExistOnHDFS(outpath, job); //set number of reducers (1 if local mode) if (InfrastructureAnalyzer.isLocalMode(job)) job.setNumReduceTasks(1);/*from www . ja v a2 s. c o m*/ else MRJobConfiguration.setNumReducers(job, numReducers, numReducers); //setup input/output format job.setInputFormat(SamplingSortMRInputFormat.class); SamplingSortMRInputFormat.setTargetKeyValueClasses(job, (Class<? 
extends WritableComparable>) outputInfo.outputKeyClass, outputInfo.outputValueClass); //setup instructions and meta information if (combineInst != null && !combineInst.trim().isEmpty()) job.set(COMBINE_INSTRUCTION, combineInst); job.set(SORT_INSTRUCTION, sortInst); job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight); boolean desc = getSortInstructionDescending(sortInst); job.setBoolean(SORT_DECREASING, desc); MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen); MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); int partitionWith0 = SamplingSortMRInputFormat.writePartitionFile(job, partitionFile); //setup mapper/reducer/partitioner/output classes if (getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes) { MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass); job.setMapperClass(IndexSortMapper.class); job.setReducerClass(IndexSortReducer.class); job.setMapOutputKeyClass(!desc ? 
IndexSortComparable.class : IndexSortComparableDesc.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(MatrixIndexes.class); job.setOutputValueClass(MatrixBlock.class); } else { //default case: SORT w/wo weights MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); job.setOutputFormat(CompactOutputFormat.class); job.setMapperClass(ValueSortMapper.class); job.setReducerClass(ValueSortReducer.class); job.setOutputKeyClass(outputInfo.outputKeyClass); //double job.setOutputValueClass(outputInfo.outputValueClass); //int } job.setPartitionerClass(TotalOrderPartitioner.class); //setup distributed cache DistributedCache.addCacheFile(partitionUri, job); DistributedCache.createSymlink(job); //setup replication factor job.setInt("dfs.replication", replication); MatrixCharacteristics[] s = new MatrixCharacteristics[1]; s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen); // Print the complete instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(s); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); //run mr job RunningJob runjob = JobClient.runJob(job); Group group = runjob.getCounters().getGroup(NUM_VALUES_PREFIX); numReducers = job.getNumReduceTasks(); //process final meta data long[] counts = new long[numReducers]; long total = 0; for (int i = 0; i < numReducers; i++) { counts[i] = group.getCounter(Integer.toString(i)); total += counts[i]; } //add missing 0s back to the results long missing0s = 0; if (total < rlen * clen) { if (partitionWith0 < 0) throw new RuntimeException("no partition contains 0, which is wrong!"); missing0s = rlen * clen - total; counts[partitionWith0] += missing0s; } else partitionWith0 = -1; if (sortIndexes) { //run builtin job for shifting partially sorted blocks according to global offsets //we do this in this custom form since it would not fit into the current structure //of systemml to output two intermediates (partially sorted data, 
offsets) out of a //single SortKeys lop boolean success = runjob.isSuccessful(); if (success) { success = runStitchupJob(tmpOutput, rlen, clen, brlen, bclen, counts, numReducers, replication, output); } MapReduceTool.deleteFileIfExistOnHDFS(tmpOutput); MapReduceTool.deleteFileIfExistOnHDFS(pfname); return new JobReturn(s[0], OutputInfo.BinaryBlockOutputInfo, success); } else { MapReduceTool.deleteFileIfExistOnHDFS(pfname); return new JobReturn(s[0], counts, partitionWith0, missing0s, runjob.isSuccessful()); } }
From source file:com.ibm.bi.dml.yarn.DMLYarnClient.java
License:Open Source License
/** * //from w w w . java 2 s . c o m * @param appId * @throws ParseException * @throws IOException * @throws DMLRuntimeException * @throws InterruptedException */ @SuppressWarnings("deprecation") private void copyResourcesToHdfsWorkingDir(YarnConfiguration yconf, String hdfsWD) throws ParseException, IOException, DMLRuntimeException, InterruptedException { FileSystem fs = FileSystem.get(yconf); //create working directory MapReduceTool.createDirIfNotExistOnHDFS(hdfsWD, DMLConfig.DEFAULT_SHARED_DIR_PERMISSION); //serialize the dml config to HDFS file //NOTE: we do not modify and ship the absolute scratch space path of the current user //because this might result in permission issues if the app master is run with a different user //(runtime plan migration during resource reoptimizations now needs to use qualified names //for shipping/reading intermediates) TODO modify resource reoptimizer on prototype integration. Path confPath = new Path(hdfsWD, DML_CONFIG_NAME); FSDataOutputStream fout = fs.create(confPath, true); //_dmlConfig.makeQualifiedScratchSpacePath(); fout.writeBytes(_dmlConfig.serializeDMLConfig() + "\n"); fout.close(); _hdfsDMLConfig = confPath.makeQualified(fs).toString(); LOG.debug("DML config written to HDFS file: " + _hdfsDMLConfig + ""); //serialize the dml script to HDFS file Path scriptPath = new Path(hdfsWD, DML_SCRIPT_NAME); FSDataOutputStream fout2 = fs.create(scriptPath, true); fout2.writeBytes(_dmlScript); fout2.close(); _hdfsDMLScript = scriptPath.makeQualified(fs).toString(); LOG.debug("DML script written to HDFS file: " + _hdfsDMLScript + ""); // copy local jar file to HDFS (try to get the original jar filename) String fname = getLocalJarFileNameFromEnvConst(); if (fname == null) { //get location of unpacked jar classes and repackage (if required) String lclassFile = DMLYarnClient.class.getProtectionDomain().getCodeSource().getLocation().getPath() .toString(); File flclassFile = new File(lclassFile); if (!flclassFile.isDirectory()) //called 
w/ jar fname = lclassFile; else //called w/ unpacked jar (need to be repackaged) fname = createJar(lclassFile); } Path srcPath = new Path(fname); Path dstPath = new Path(hdfsWD, srcPath.getName()); FileUtil.copy(FileSystem.getLocal(yconf), srcPath, fs, dstPath, false, true, yconf); _hdfsJarFile = dstPath.makeQualified(fs).toString(); LOG.debug( "Jar file copied from local file: " + srcPath.toString() + " to HDFS file: " + dstPath.toString()); }
From source file:com.ibm.jaql.io.hadoop.FileOutputConfigurator.java
License:Apache License
public void setSequential(JobConf conf) throws Exception { registerSerializers(conf);/* w ww. j av a2 s . co m*/ // For an expression, the location is the final file name Path outPath = new Path(location); FileSystem fs = outPath.getFileSystem(conf); outPath = outPath.makeQualified(fs); if (fs.exists(outPath)) { // TODO: Jaql currently has overwrite semantics; add flag to control this if (fs.isFile(outPath)) { fs.delete(outPath, false); } else { // Look for a map-reduce output directory FileStatus[] nonMR = fs.listStatus(outPath, new PathFilter() { boolean onlyOne = true; public boolean accept(Path path) { String name = path.getName(); if (name.matches("([.][.]?)|([.]part-[0-9]+.crc)|(part-[0-9]+)")) { return false; } if (onlyOne) { onlyOne = false; return true; } return false; } }); if (nonMR.length > 0) { throw new IOException( "directory exists and is not a map-reduce output directory: " + nonMR[0].getPath()); } fs.delete(outPath, true); } } // In sequential mode, we will write directly to the output file // and bypass the _temporary directory and rename of the standard // FileOutputCommitter by using our own DirectFileOutputCommitter. FileOutputFormat.setOutputPath(conf, outPath.getParent()); conf.setClass("mapred.output.committer.class", DirectFileOutputCommiter.class, OutputCommitter.class); }