List of usage examples for org.apache.hadoop.fs FileSystem rename
public abstract boolean rename(Path src, Path dst) throws IOException;
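Before the project examples, here is a minimal sketch of a typical call, assuming a generic Configuration and hypothetical paths that do not come from any of the projects listed below. Note that rename generally reports failure by returning false rather than throwing, so callers usually check the return value, and some FileSystem implementations require the destination's parent directory to already exist:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RenameExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        Path src = new Path("/tmp/job-output/part-r-00000"); // hypothetical source path
        Path dst = new Path("/data/final/part-r-00000");     // hypothetical destination path

        fs.mkdirs(dst.getParent());    // some implementations expect the parent directory to exist
        if (!fs.rename(src, dst)) {    // rename typically signals failure via its boolean return value
            throw new IOException("Failed to rename " + src + " to " + dst);
        }
    }
}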
From source file:org.apache.tez.mapreduce.output.LocalOnFileSorterOutput.java
License:Apache License
@Override public List<Event> close() throws IOException { LOG.debug("Closing LocalOnFileSorterOutput"); super.close(); TezTaskOutput mapOutputFile = sorter.getMapOutput(); FileSystem localFs = FileSystem.getLocal(conf); Path src = mapOutputFile.getOutputFile(); Path dst = mapOutputFile.getInputFileForWrite(getContext().getTaskIndex(), localFs.getFileStatus(src).getLen()); LOG.info("Renaming src = " + src + ", dst = " + dst); if (LOG.isDebugEnabled()) { LOG.debug("Renaming src = " + src + ", dst = " + dst); }/*from w w w . j av a 2s.co m*/ localFs.rename(src, dst); return null; }
From source file:org.apache.tez.runtime.library.output.LocalOnFileSorterOutput.java
License:Apache License
@Override public List<Event> close() throws IOException { LOG.debug("Closing LocalOnFileSorterOutput"); super.close(); TezTaskOutput mapOutputFile = sorter.getMapOutput(); FileSystem localFs = FileSystem.getLocal(conf); Path src = mapOutputFile.getOutputFile(); Path dst = mapOutputFile.getInputFileForWrite(outputContext.getTaskIndex(), localFs.getFileStatus(src).getLen()); LOG.info("Renaming src = " + src + ", dst = " + dst); if (LOG.isDebugEnabled()) { LOG.debug("Renaming src = " + src + ", dst = " + dst); }/*from w ww .j a v a 2 s. c o m*/ localFs.rename(src, dst); return null; }
From source file:org.archive.access.nutch.jobs.NutchwaxLinkDbMerger.java
License:Open Source License
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
    JobConf job = NutchwaxLinkDb.createMergeJob(getConf(), output, normalize, filter);
    for (int i = 0; i < dbs.length; i++) {
        job.addInputPath(new Path(dbs[i], LinkDb.CURRENT_NAME));
    }
    JobClient.runJob(job);

    FileSystem fs = FileSystem.get(getConf());
    fs.mkdirs(output);
    fs.rename(job.getOutputPath(), new Path(output, LinkDb.CURRENT_NAME));
}
From source file:org.archive.nutchwax.PageRankDb.java
License:Apache License
public static void install(JobConf job, Path pageRankDb) throws IOException {
    Path newPageRankDb = FileOutputFormat.getOutputPath(job);
    FileSystem fs = new JobClient(job).getFs();
    Path old = new Path(pageRankDb, "old");
    Path current = new Path(pageRankDb, CURRENT_NAME);

    if (fs.exists(current)) {
        if (fs.exists(old)) {
            fs.delete(old, true);
        }
        fs.rename(current, old);
    }
    fs.mkdirs(pageRankDb);
    fs.rename(newPageRankDb, current);
    if (fs.exists(old)) {
        fs.delete(old, true);
    }
    LockUtil.removeLockFile(fs, new Path(pageRankDb, LOCK_NAME));
}
From source file:org.archive.nutchwax.PageRankDbMerger.java
License:Apache License
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
    JobConf job = createMergeJob(getConf(), output, normalize, filter);
    for (int i = 0; i < dbs.length; i++) {
        FileInputFormat.addInputPath(job, new Path(dbs[i], PageRankDb.CURRENT_NAME));
    }
    JobClient.runJob(job);

    FileSystem fs = FileSystem.get(getConf());
    fs.mkdirs(output);
    fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, PageRankDb.CURRENT_NAME));
}
From source file:org.avenir.tree.DataPartitioner.java
License:Apache License
/**
 * @param outPath
 * @param segmentCount
 * @param conf
 * @throws IOException
 */
private void moveOutputToSegmentDir(String outPath, int segmentCount, Configuration conf) throws IOException {
    FileSystem fileSystem = FileSystem.get(conf);
    for (int i = 0; i < segmentCount; ++i) {
        // create segment dir
        String dir = outPath + "/segment=" + i + "/data";
        Path segmentPath = new Path(dir);
        fileSystem.mkdirs(segmentPath);

        // move output to segment dir
        Path srcFile = new Path(outPath + "/part-r-0000" + i);
        Path dstFile = new Path(outPath + "/segment=" + i + "/data/partition.txt");
        fileSystem.rename(srcFile, dstFile);
    }
    fileSystem.close();
}
From source file:org.bgi.flexlab.gaea.data.mapreduce.output.cram.GaeaKeyIgnoringCramRecordWriter.java
License:Open Source License
@Override
public void close(TaskAttemptContext ctx) throws IOException {
    cramContainerStream.finish(true);
    origOutput.close();

    if (rename) {
        final FileSystem srcFS = outputPath.getFileSystem(ctx.getConfiguration());
        if (this.sample != null) {
            Path newName = new Path(outputPath.getParent() + "/" + sample + ".sorted.cram");
            srcFS.rename(outputPath, newName);
        }
    }
}
From source file:org.broadinstitute.sting.gatk.hadoop.GATKJobClient.java
License:Open Source License
static void parseCommandLineArgs(String[] argv, Configuration conf) {
    CommandLineParser parser = new PosixParser();
    Options options = new Options();

    Option gatkdLocOpt = OptionBuilder.withArgName("depjar_loc").hasArg()
            .withDescription("Complete HDFS path of gatk dependency jar").create("djarloc");
    options.addOption(gatkdLocOpt);
    Option bwaLocOpt = OptionBuilder.withArgName("bwa_loc").hasArg()
            .withDescription("Complete HDFS path of bwa binary or bwa.exe file").create("bwaloc");
    options.addOption(bwaLocOpt);
    Option fq1Opt = OptionBuilder.withArgName("fastq_file1").hasArg()
            .withDescription("Complete HDFS path or path relative to user directory for 1st fastq file")
            .create("r1");
    options.addOption(fq1Opt);
    Option fq2Opt = OptionBuilder.withArgName("fastq_file2").hasArg()
            .withDescription("Complete HDFS path or path relative to user directory for 2nd fastq file")
            .create("r2");
    options.addOption(fq2Opt);
    Option bamOpt = OptionBuilder.withArgName("bam_directory").hasArg()
            .withDescription("Complete HDFS directory path or path relative to user directory for input BAM file")
            .create("b");
    options.addOption(bamOpt);
    Option outOpt = OptionBuilder.withArgName("output_directory").hasArg()
            .withDescription("Complete HDFS path or path relative to user directory for output directory")
            .create("o");
    options.addOption(outOpt);
    Option rSizeOpt = OptionBuilder.withArgName("fastq_read_size").hasArg()
            .withDescription("Number of bytes of a read sequence in input FastQ file").create("rsize");
    options.addOption(rSizeOpt);
    Option rPSplitOpt = OptionBuilder.withArgName("reads_per_map_split").hasArg()
            .withDescription("Optional number of reads to be processed by a mapper").create("reads_per_split");
    options.addOption(rPSplitOpt);
    Option nRedOpt = OptionBuilder.withArgName("number_of_reducers").hasArg()
            .withDescription("Optional number of reducers").create("nred");
    options.addOption(nRedOpt);
    Option nThreadOpt = OptionBuilder.withArgName("number_of_threads").hasArg()
            .withDescription("Optional number of threads").create("nthreads");
    options.addOption(nThreadOpt);
    Option refFileOpt = OptionBuilder.withArgName("path_to_reference_dir").hasArg()
            .withDescription("Complete HDFS path of reference directory").create("ref");
    options.addOption(refFileOpt);
    Option kSiteFileOpt = OptionBuilder.withArgName("path_to_knownsites_dir").hasArg()
            .withDescription("Complete HDFS path of known-sites db directory").create("dbfile");
    options.addOption(kSiteFileOpt);
    Option platformOpt = OptionBuilder.withArgName("Linux/Windows").hasArg()
            .withDescription("Platform to run on").create("p");
    options.addOption(platformOpt);

    Option noAlignOpt = new Option("na", "noalign", false, "Don't run Alignment stage");
    options.addOption(noAlignOpt);
    Option noReAlignOpt = new Option("nra", "norealign", false, "Do not run Local Realignment stage");
    options.addOption(noReAlignOpt);
    Option noMarkDupOpt = new Option("nmd", "nomarkdup", false, "Do not run Mark Duplicates stage");
    options.addOption(noMarkDupOpt);
    Option noQRecabOpt = new Option("nqr", "noqrecab", false, "Do not run Quality Recalibration stage");
    options.addOption(noQRecabOpt);
    Option noVarOpt = new Option("nv", "novariant", false, "Do not run Structural Variant stage");
    options.addOption(noVarOpt);
    Option noFVarOpt = new Option("nfv", "nofvariant", false, "Do not run Filter Variant stage");
    options.addOption(noFVarOpt);
    Option noMerOpt = new Option("nm", "nomresults", false, "Do not Merge Results");
    options.addOption(noMerOpt);
    Option isXVariantOpt = new Option("xv", "xvariant", false,
            "enable flag, if variant calling should be done independently for INDELs and SNPs");
    options.addOption(isXVariantOpt);

    try {
        // parse the command line arguments
        String[] args = new GenericOptionsParser(conf, options, argv).getRemainingArgs();
        CommandLine line = parser.parse(options, args);

        if (line.hasOption(noAlignOpt.getOpt()))
            noalign = true;
        if (line.hasOption(noReAlignOpt.getOpt()))
            norealign = true;
        if (line.hasOption(noMarkDupOpt.getOpt()))
            nomarkdup = true;
        if (line.hasOption(noQRecabOpt.getOpt()))
            noqrecab = true;
        if (line.hasOption(noVarOpt.getOpt()))
            novariant = true;
        if (line.hasOption(noFVarOpt.getOpt()))
            nofvariant = true;
        if (line.hasOption(noMerOpt.getOpt()))
            nomresults = true;

        if (line.hasOption(fq1Opt.getOpt()) && line.hasOption(bamOpt.getOpt())) {
            throw new ParseException("Invalid Usage: fastq file and BAM file cannot be given together as input");
        }
        if (line.hasOption(fq2Opt.getOpt()) && !line.hasOption(fq1Opt.getOpt())) {
            throw new ParseException("Invalid Usage: fastq file2 is invalid without fastq file1");
        }
        if (!line.hasOption(fq2Opt.getOpt()) && !line.hasOption(fq1Opt.getOpt())
                && !line.hasOption(bamOpt.getOpt())) {
            throw new ParseException("Invalid Usage: Either the fastq file or BAM file has to be provided as input");
        }

        if (line.hasOption(gatkdLocOpt.getOpt())) {
            gatk_binary_loc = line.getOptionValue(gatkdLocOpt.getOpt());
            validatePath(gatk_binary_loc, conf);
        } else {
            throw new ParseException(
                    "Invalid Usage: GATK dependency jar location (-djarloc) is mandatory for running the pipeline");
        }

        if (!noalign) {
            if (line.hasOption(fq1Opt.getOpt())) {
                readFile1 = line.getOptionValue(fq1Opt.getOpt());
                validatePath(readFile1, conf);
                fqInput = (new Path(readFile1).getParent()).toString();
            }
            if (line.hasOption(fq2Opt.getOpt())) {
                readFile2 = line.getOptionValue(fq2Opt.getOpt());
                conf.setBoolean("gatk.hadoop.pairedend", true);
                validatePath(readFile2, conf);
                conf.set("gatk.hadoop.readfile2", readFile2);
            }
            if (line.hasOption(rSizeOpt.getOpt())) {
                fq_read_size = Integer.parseInt(line.getOptionValue(rSizeOpt.getOpt()));
            } else {
                throw new ParseException("Invalid Usage: read size (-rsize) is mandatory for Alignment");
            }
            if (line.hasOption(bwaLocOpt.getOpt())) {
                bwa_binary_loc = line.getOptionValue(bwaLocOpt.getOpt());
                validatePath(bwa_binary_loc, conf);
            } else {
                throw new ParseException(
                        "Invalid Usage: bwa binary/exe location (-bwaloc) is mandatory for Alignment");
            }
            if (line.hasOption(rPSplitOpt.getOpt())) {
                reads_per_split = Integer.parseInt(line.getOptionValue(rPSplitOpt.getOpt()));
            }
        }

        if (line.hasOption(nRedOpt.getOpt())) {
            nReducers = Integer.parseInt(line.getOptionValue(nRedOpt.getOpt()));
        }
        if (line.hasOption(nThreadOpt.getOpt())) {
            nThreads = Integer.parseInt(line.getOptionValue(nThreadOpt.getOpt()));
            conf.setInt("gatk.hadoop.nthreads", nThreads);
        }

        if (line.hasOption(bamOpt.getOpt())) {
            int rcount = 0;
            BAMInputPath = line.getOptionValue(bamOpt.getOpt());
            validatePath(BAMInputPath, conf);
            Path BAMPath = new Path(BAMInputPath);
            FileSystem fs = BAMPath.getFileSystem(conf);
            FileStatus[] content = fs.listStatus(BAMPath);
            for (int i = 0; i < content.length; i++) {
                String filename = content[i].getPath().getName();
                if (filename.endsWith(".bam")) {
                    String prefix = filename.substring(0, 6);
                    try {
                        Long value = Long.valueOf(prefix);
                    } catch (NumberFormatException e) {
                        String tmpFile = BAMInputPath + Path.SEPARATOR + String.format("%06d", rcount) + "-"
                                + filename;
                        boolean rename = fs.rename(content[i].getPath(), new Path(tmpFile));
                    }
                    rcount++;
                }
            }
        }

        if (line.hasOption(outOpt.getOpt())) {
            outputDir = line.getOptionValue(outOpt.getOpt());
            if (!(new Path(outputDir).getFileSystem(conf).mkdirs(new Path(outputDir)))) {
                throw new Exception("MKDIR failure");
            }
            if (!noalign) {
                BWAOutPath = outputDir + Path.SEPARATOR + "AlignerOut";
                SortBWAOutPath = outputDir + Path.SEPARATOR + "SortedAlignerOut";
                BAMInputPath = outputDir + Path.SEPARATOR + "BAMInput";
            }
            IndelOutPath = outputDir + Path.SEPARATOR + "IndelRealignOut";
            RmdupOutPath = outputDir + Path.SEPARATOR + "DedupOut";
            RecalOutPath = outputDir + Path.SEPARATOR + "RecalibrationOut";
            FinalBAMPath = outputDir + Path.SEPARATOR + "FinalBAMOut";
        } else {
            throw new ParseException("Invalid Usage: output directory is mandatory");
        }

        if (line.hasOption(refFileOpt.getOpt())) {
            Path refFileDir = new Path(line.getOptionValue(refFileOpt.getOpt()));
            FileSystem fs = refFileDir.getFileSystem(conf);
            FileStatus[] content = fs.listStatus(refFileDir);
            for (int i = 0; i < content.length; i++) {
                if ((content[i].getPath().getName()).endsWith(".fa")
                        || (content[i].getPath().getName()).endsWith(".fasta")) {
                    refFileLoc = content[i].getPath().toString();
                }
            }
            validatePath(refFileLoc, conf);
            refFileName = refFileLoc.substring(0, refFileLoc.lastIndexOf("."));
        } else {
            throw new ParseException("Invalid Usage: reference fasta file is mandatory");
        }

        if (line.hasOption(kSiteFileOpt.getOpt())) {
            Path knownSitesDir = new Path(line.getOptionValue(kSiteFileOpt.getOpt()));
            FileSystem fs = knownSitesDir.getFileSystem(conf);
            FileStatus[] content = fs.listStatus(knownSitesDir);
            for (int i = 0; i < content.length; i++) {
                if ((content[i].getPath().getName()).endsWith(".vcf")) {
                    knownSitesLoc = content[i].getPath().toString();
                }
            }
            validatePath(knownSitesLoc, conf);
        }

        if (line.hasOption(platformOpt.getOpt())) {
            platform = line.getOptionValue(platformOpt.getOpt());
            if (platform.equalsIgnoreCase("Linux")) {
                is_azure = false;
                conf.setBoolean("gatk.hadoop.isazure", false);
            }
        }
        if (line.hasOption(isXVariantOpt.getOpt())) {
            xVariantCall = true;
        }
    } catch (ParseException exp) {
        System.out.println(exp.getMessage());
        if (printUsage) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("hadoop jar {/local/path/to/SeqInCloud.jar} {options}", options);
        }
        System.exit(-1);
    } catch (Exception exp) {
        System.out.println("Command line parsing error: " + exp.getMessage());
        System.exit(-1);
    }
}
From source file:org.commoncrawl.hadoop.io.S3GetMetdataJob.java
License:Open Source License
public static void main(String[] args) {
    String accessKey = args[0];
    String secretKey = args[1];

    String paths[] = {
        // "2008/06",
        // "2008/07",
        // "2008/08",
        // "2008/09",
        // "2008/10",
        // "2008/11",
        "2009"
    };

    for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {
        LOG.info("Processing Path:" + paths[pathIndex]);

        JobConf job = new JobConf(S3GetMetdataJob.class);

        Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
        System.out.println("Output Path is:" + tempDir);

        job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

        // setup s3 properties
        JetS3tARCSource.setMaxRetries(job, 1);
        // set up S3 credentials ...
        JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
        JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
        ARCSplitCalculator.setFilesPerSplit(job, 25);
        // set up arc reader properties
        ArcFileReader.setIOTimeoutValue(30000);
        // set input prefixes ...
        JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
        // and S3 bucket name ...
        JetS3tARCSource.setBucketName(job, "commoncrawl");
        // and setup arc source for ArcInputFormat
        ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
        // and set up input format ...
        job.setInputFormat(ARCInputFormat.class);
        // set mapper ...
        job.setMapRunnerClass(S3GetMetdataJob.class);
        // setup reducer (identity in this case ... )
        job.setReducerClass(IdentityReducer.class);
        // standard output format ...
        job.setOutputFormat(SequenceFileOutputFormat.class);
        // set output path
        job.setOutputPath(tempDir);
        // map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CrawlURLMetadata.class);
        // reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlURLMetadata.class);
        // double the number of reducers ...
        // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

        // run the job ...
        try {
            LOG.info("Starting Job:" + job.getJobName());
            JobClient.runJob(job);
            LOG.info("Finished Job:" + job.getJobName());

            Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
            LOG.info("Copying Job Output to:" + finalPath);
            FileSystem fs = FileSystem.get(job);
            try {
                fs.mkdirs(finalPath.getParent());
                fs.rename(tempDir, finalPath);
                LOG.info("Copied Job Output to:" + finalPath);
            } finally {
                // fs.close();
            }
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            e.printStackTrace();
        }
    }
}
From source file:org.commoncrawl.mapred.ec2.parser.OutputCommitter.java
License:Open Source License
private void moveTaskOutputs(TaskAttemptContext context, FileSystem fs, Path jobOutputDir, Path taskOutput)
        throws IOException {
    TaskAttemptID attemptId = context.getTaskAttemptID();
    context.getProgressible().progress();

    if (fs.isFile(taskOutput)) {
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTempTaskOutputPath(context));
        LOG.info("Renaming:" + taskOutput + " to:" + finalOutputPath);
        if (!fs.rename(taskOutput, finalOutputPath)) {
            LOG.info("Rename Failed for:" + taskOutput + " to:" + finalOutputPath
                    + " Trying Delete and then Rename");
            if (!fs.delete(finalOutputPath, true)) {
                throw new IOException("Failed to delete earlier output of task: " + attemptId);
            }
            LOG.info("Renaming:" + taskOutput + " to: " + finalOutputPath);
            if (!fs.rename(taskOutput, finalOutputPath)) {
                throw new IOException("Failed to save output of task: " + attemptId);
            }
        }
        LOG.info("Moved " + taskOutput + " to " + finalOutputPath);
    } else if (fs.getFileStatus(taskOutput).isDir()) {
        FileStatus[] paths = fs.listStatus(taskOutput);
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTempTaskOutputPath(context));
        LOG.info("Moving " + taskOutput + " to " + finalOutputPath);
        fs.mkdirs(finalOutputPath);
        if (paths != null) {
            for (FileStatus path : paths) {
                LOG.info("Moving " + path.getPath());
                moveTaskOutputs(context, fs, jobOutputDir, path.getPath());
            }
        }
    }
}
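Several of the examples above (the LinkDb and PageRankDb mergers, the CommonCrawl jobs, and this OutputCommitter) share the same commit idiom: write job output into a temporary location, then rename it into its final place, clearing any stale destination first. The following condensed sketch of that idiom uses hypothetical paths and a hypothetical helper name; it is not taken from any of the projects listed on this page:

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class CommitViaRename {
    /**
     * Moves a completed temp directory to its final location, replacing any
     * previous output. Hypothetical helper for illustration only.
     */
    static void commit(FileSystem fs, Path tempDir, Path finalDir) throws IOException {
        // remove stale output so the rename does not fail on an existing destination
        if (fs.exists(finalDir) && !fs.delete(finalDir, true)) {
            throw new IOException("Could not remove stale output at " + finalDir);
        }
        fs.mkdirs(finalDir.getParent());   // ensure the destination's parent exists
        if (!fs.rename(tempDir, finalDir)) {
            throw new IOException("Could not rename " + tempDir + " to " + finalDir);
        }
    }
}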