List of usage examples for org.apache.hadoop.fs FileSystem rename
public abstract boolean rename(Path src, Path dst) throws IOException;
From source file:com.linkedin.mr_kluj.StagedOutputJob.java
License:Apache License
@Override public boolean waitForCompletion(boolean verbose) throws IOException, InterruptedException, ClassNotFoundException { final Path actualOutputPath = FileOutputFormat.getOutputPath(this); final Path stagedPath = new Path(String.format("%s/%s/staged", stagingPrefix, System.currentTimeMillis())); FileOutputFormat.setOutputPath(this, stagedPath); final Thread hook = new Thread(new Runnable() { public void run() { try { killJob();/*from w w w . j a va 2s . c o m*/ } catch (IOException e) { e.printStackTrace(); } } }); Runtime.getRuntime().addShutdownHook(hook); final boolean retVal = super.waitForCompletion(verbose); Runtime.getRuntime().removeShutdownHook(hook); if (retVal) { FileSystem fs = actualOutputPath.getFileSystem(getConfiguration()); fs.mkdirs(actualOutputPath); if (getConfiguration().getBoolean("com.linkedin.mr_kluj.delete.output.path", true)) { log.info(String.format("Deleting data at old path[%s]", actualOutputPath)); fs.delete(actualOutputPath, true); } for (FileStatus fileStatus : FSUtils.spiderPath(fs, stagedPath)) { Path thisStagedPath = fileStatus.getPath(); Path thisActualOutputPath = new Path(fileStatus.getPath().toString().replace(stagedPath.toString(), actualOutputPath.toString())); log.info(String.format("Moving from staged path[%s] to final resting place[%s]", thisStagedPath, thisActualOutputPath)); fs.mkdirs(thisActualOutputPath.getParent()); if (!fs.rename(thisStagedPath, thisActualOutputPath)) { log.info("Rename failed!"); return false; } } return true; } log.warn("retVal was false for some reason..."); return retVal; }
From source file:com.linkedin.pinot.hadoop.job.SegmentCreationJob.java
License:Apache License
public void run() throws Exception { LOGGER.info("Starting {}", getClass().getSimpleName()); FileSystem fs = FileSystem.get(getConf()); Path inputPathPattern = new Path(_inputSegmentDir); if (fs.exists(new Path(_stagingDir))) { LOGGER.warn("Found the temp folder, deleting it"); fs.delete(new Path(_stagingDir), true); }/*ww w . ja va 2s .co m*/ fs.mkdirs(new Path(_stagingDir)); fs.mkdirs(new Path(_stagingDir + "/input/")); if (fs.exists(new Path(_outputDir))) { LOGGER.warn("Found the output folder, deleting it"); fs.delete(new Path(_outputDir), true); } fs.mkdirs(new Path(_outputDir)); List<FileStatus> inputDataFiles = new ArrayList<FileStatus>(); FileStatus[] fileStatusArr = fs.globStatus(inputPathPattern); for (FileStatus fileStatus : fileStatusArr) { inputDataFiles.addAll(getDataFilesFromPath(fs, fileStatus.getPath())); } for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) { FileStatus file = inputDataFiles.get(seqId); String completeFilePath = " " + file.getPath().toString() + " " + seqId; Path newOutPutFile = new Path((_stagingDir + "/input/" + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt")); FSDataOutputStream stream = fs.create(newOutPutFile); stream.writeUTF(completeFilePath); stream.flush(); stream.close(); } Job job = Job.getInstance(getConf()); job.setJarByClass(SegmentCreationJob.class); job.setJobName(_jobName); job.setMapperClass(HadoopSegmentCreationMapper.class); if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { job.getConfiguration().set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION")); } job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(_stagingDir + "/input/")); FileOutputFormat.setOutputPath(job, new Path(_stagingDir + "/output/")); job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size()); job.getConfiguration().set("data.schema", new ObjectMapper().writeValueAsString(_dataSchema)); job.setMaxReduceAttempts(1); job.setMaxMapAttempts(0); job.setNumReduceTasks(0); for (Object key : _properties.keySet()) { job.getConfiguration().set(key.toString(), _properties.getProperty(key.toString())); } if (_depsJarPath != null && _depsJarPath.length() > 0) { addDepsJarToDistributedCache(new Path(_depsJarPath), job); } // Submit the job for execution. job.waitForCompletion(true); if (!job.isSuccessful()) { throw new RuntimeException("Job failed : " + job); } LOGGER.info("Moving Segment Tar files from {} to: {}", _stagingDir + "/output/segmentTar", _outputDir); FileStatus[] segmentArr = fs.listStatus(new Path(_stagingDir + "/output/segmentTar")); for (FileStatus segment : segmentArr) { fs.rename(segment.getPath(), new Path(_outputDir, segment.getPath().getName())); } // Delete temporary directory. LOGGER.info("Cleanup the working directory."); LOGGER.info("Deleting the dir: {}", _stagingDir); fs.delete(new Path(_stagingDir), true); }
From source file:com.linkedin.thirdeye.bootstrap.segment.create.SegmentCreationPhaseJob.java
License:Apache License
public Job run() throws Exception { Job job = Job.getInstance(getConf()); job.setJarByClass(SegmentCreationPhaseJob.class); job.setJobName(name);//w ww .j a v a 2 s . com FileSystem fs = FileSystem.get(getConf()); Configuration configuration = job.getConfiguration(); String schemaPath = getAndSetConfiguration(configuration, SEGMENT_CREATION_SCHEMA_PATH); LOGGER.info("Schema path : {}", schemaPath); String configPath = getAndSetConfiguration(configuration, SEGMENT_CREATION_CONFIG_PATH); LOGGER.info("Config path : {}", configPath); Schema dataSchema = createSchema(configPath); LOGGER.info("Data schema : {}", dataSchema); String inputSegmentDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_INPUT_PATH); LOGGER.info("Input path : {}", inputSegmentDir); String outputDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_OUTPUT_PATH); LOGGER.info("Output path : {}", outputDir); String stagingDir = new File(outputDir, TEMP).getAbsolutePath(); LOGGER.info("Staging dir : {}", stagingDir); String tableName = getAndSetConfiguration(configuration, SEGMENT_CREATION_SEGMENT_TABLE_NAME); LOGGER.info("Segment table name : {}", tableName); // Create temporary directory if (fs.exists(new Path(stagingDir))) { LOGGER.warn("Found the temp folder, deleting it"); fs.delete(new Path(stagingDir), true); } fs.mkdirs(new Path(stagingDir)); fs.mkdirs(new Path(stagingDir + "/input/")); if (fs.exists(new Path(outputDir))) { LOGGER.warn("Found the output folder deleting it"); fs.delete(new Path(outputDir), true); } fs.mkdirs(new Path(outputDir)); Path inputPathPattern = new Path(inputSegmentDir); List<FileStatus> inputDataFiles = Arrays.asList(fs.listStatus(inputPathPattern)); LOGGER.info("size {}", inputDataFiles.size()); try { for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) { FileStatus file = inputDataFiles.get(seqId); String completeFilePath = " " + file.getPath().toString() + " " + seqId; Path newOutPutFile = new Path((stagingDir + "/input/" + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt")); FSDataOutputStream stream = fs.create(newOutPutFile); LOGGER.info("wrote {}", completeFilePath); stream.writeUTF(completeFilePath); stream.flush(); stream.close(); } } catch (Exception e) { LOGGER.error("Exception while reading input files ", e); } job.setMapperClass(SegmentCreationPhaseMapReduceJob.SegmentCreationMapper.class); if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { job.getConfiguration().set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION")); } job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(stagingDir + "/input/")); FileOutputFormat.setOutputPath(job, new Path(stagingDir + "/output/")); job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size()); job.getConfiguration().set("data.schema", OBJECT_MAPPER.writeValueAsString(dataSchema)); if (!fs.exists(new Path(schemaPath))) { OBJECT_MAPPER.writerWithDefaultPrettyPrinter().writeValue(fs.create(new Path(schemaPath), false), dataSchema); } job.setMaxReduceAttempts(1); job.setMaxMapAttempts(0); job.setNumReduceTasks(0); for (Object key : props.keySet()) { job.getConfiguration().set(key.toString(), props.getProperty(key.toString())); } job.waitForCompletion(true); if (!job.isSuccessful()) { throw new RuntimeException("Job failed : " + job); } LOGGER.info("Moving Segment Tar files from {} to: {}", stagingDir + "/output/segmentTar", outputDir); FileStatus[] segmentArr = fs.listStatus(new Path(stagingDir + "/output/segmentTar")); for (FileStatus segment : segmentArr) { fs.rename(segment.getPath(), new Path(outputDir, segment.getPath().getName())); } // Delete temporary directory. LOGGER.info("Cleanup the working directory."); LOGGER.info("Deleting the dir: {}", stagingDir); fs.delete(new Path(stagingDir), true); return job; }
From source file:com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseJob.java
License:Apache License
public Job run() throws Exception { Job job = Job.getInstance(getConf()); job.setJarByClass(SegmentCreationPhaseJob.class); job.setJobName(name);/*from www .j a v a 2s . c o m*/ FileSystem fs = FileSystem.get(getConf()); Configuration configuration = job.getConfiguration(); String inputSegmentDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_INPUT_PATH); LOGGER.info("Input path : {}", inputSegmentDir); Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputSegmentDir); LOGGER.info("Schema : {}", avroSchema); String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()), props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty); ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props); LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode()); String outputDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_OUTPUT_PATH); LOGGER.info("Output path : {}", outputDir); Path stagingDir = new Path(outputDir, TEMP); LOGGER.info("Staging dir : {}", stagingDir); String segmentWallClockStart = getAndSetConfiguration(configuration, SEGMENT_CREATION_WALLCLOCK_START_TIME); LOGGER.info("Segment wallclock start time : {}", segmentWallClockStart); String segmentWallClockEnd = getAndSetConfiguration(configuration, SEGMENT_CREATION_WALLCLOCK_END_TIME); LOGGER.info("Segment wallclock end time : {}", segmentWallClockEnd); String schedule = getAndSetConfiguration(configuration, SEGMENT_CREATION_SCHEDULE); LOGGER.info("Segment schedule : {}", schedule); String isBackfill = props.getProperty(SEGMENT_CREATION_BACKFILL.toString(), DEFAULT_BACKFILL); configuration.set(SEGMENT_CREATION_BACKFILL.toString(), isBackfill); LOGGER.info("Is Backfill : {}", configuration.get(SEGMENT_CREATION_BACKFILL.toString())); // Create temporary directory if (fs.exists(stagingDir)) { LOGGER.warn("Found the temp folder, deleting it"); fs.delete(stagingDir, true); } fs.mkdirs(stagingDir); fs.mkdirs(new Path(stagingDir + "/input/")); // Create output directory if (fs.exists(new Path(outputDir))) { LOGGER.warn("Found the output folder deleting it"); fs.delete(new Path(outputDir), true); } fs.mkdirs(new Path(outputDir)); // Read input files List<FileStatus> inputDataFiles = new ArrayList<>(); for (String input : inputSegmentDir.split(",")) { Path inputPathPattern = new Path(input); inputDataFiles.addAll(Arrays.asList(fs.listStatus(inputPathPattern))); } LOGGER.info("size {}", inputDataFiles.size()); try { for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) { FileStatus file = inputDataFiles.get(seqId); String completeFilePath = " " + file.getPath().toString() + " " + seqId; Path newOutPutFile = new Path((stagingDir + "/input/" + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt")); FSDataOutputStream stream = fs.create(newOutPutFile); LOGGER.info("wrote {}", completeFilePath); stream.writeUTF(completeFilePath); stream.flush(); stream.close(); } } catch (Exception e) { LOGGER.error("Exception while reading input files ", e); } job.setMapperClass(SegmentCreationPhaseMapReduceJob.SegmentCreationMapper.class); if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { job.getConfiguration().set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION")); } job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(stagingDir + "/input/")); FileOutputFormat.setOutputPath(job, new Path(stagingDir + "/output/")); job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size()); job.getConfiguration().set(SEGMENT_CREATION_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig)); job.setMaxReduceAttempts(1); job.setMaxMapAttempts(0); job.setNumReduceTasks(0); for (Object key : props.keySet()) { job.getConfiguration().set(key.toString(), props.getProperty(key.toString())); } job.waitForCompletion(true); if (!job.isSuccessful()) { throw new RuntimeException("Job failed : " + job); } LOGGER.info("Moving Segment Tar files from {} to: {}", stagingDir + "/output/segmentTar", outputDir); FileStatus[] segmentArr = fs.listStatus(new Path(stagingDir + "/output/segmentTar")); for (FileStatus segment : segmentArr) { fs.rename(segment.getPath(), new Path(outputDir, segment.getPath().getName())); } // Delete temporary directory. LOGGER.info("Cleanup the working directory."); LOGGER.info("Deleting the dir: {}", stagingDir); fs.delete(stagingDir, true); return job; }
From source file:com.linkedin.whiteelephant.mapreduce.lib.job.StagedOutputJob.java
License:Apache License
@Override public boolean waitForCompletion(boolean verbose) throws IOException, InterruptedException, ClassNotFoundException { final Path actualOutputPath = FileOutputFormat.getOutputPath(this); final Path stagedPath = new Path(String.format("%s/%s/staged", stagingPrefix, System.currentTimeMillis())); FileOutputFormat.setOutputPath(this, stagedPath); final Thread hook = new Thread(new Runnable() { @Override//from w w w .ja v a 2s . c om public void run() { try { killJob(); } catch (IOException e) { e.printStackTrace(); } } }); Runtime.getRuntime().addShutdownHook(hook); final boolean retVal = super.waitForCompletion(verbose); Runtime.getRuntime().removeShutdownHook(hook); if (retVal) { FileSystem fs = actualOutputPath.getFileSystem(getConfiguration()); fs.mkdirs(actualOutputPath); log.info(String.format("Deleting data at old path[%s]", actualOutputPath)); fs.delete(actualOutputPath, true); log.info(String.format("Moving from staged path[%s] to final resting place[%s]", stagedPath, actualOutputPath)); return fs.rename(stagedPath, actualOutputPath); } log.warn("retVal was false for some reason..."); return retVal; }
From source file:com.liveramp.cascading_ext.FileSystemHelper.java
License:Apache License
/** * Safely renames a path by retrying the operation <code>numTries</code> times * and sleeping <code>delayBetweenTries</code> seconds between each try. If it * still fails, it throws an IOException. * * @param fs the filesystem object * @param src the directory to be renamed * @param dst the new name of the directory * @param numTries number of tries to attempt the operation * @param delayBetweenTries the sleep delta between tries in millis * @throws IOException/*from w w w. j a va2s . co m*/ */ public static void safeRename(FileSystem fs, Path src, Path dst, int numTries, long delayBetweenTries) throws IOException { while (numTries-- > 0) { if (fs.rename(src, dst)) { return; } else { try { Thread.sleep(delayBetweenTries); } catch (InterruptedException ie) { throw new RuntimeException(ie); } } } throw new IOException("Could not rename the file from \"" + src + "\" to \"" + dst + "\"!"); }
From source file:com.liveramp.hank.hadoop.DomainBuilderAbstractOutputFormat.java
License:Apache License
public static void moveContentsAndDelete(Path srcDir, Path dstDir, FileSystem fs, Logger logger) throws IOException { if (!fs.exists(srcDir)) { return;/* w w w . ja v a 2 s . c om*/ } if (fs.exists(srcDir) && !fs.isDirectory(srcDir)) { throw new IllegalArgumentException(srcDir + " is not a directory"); } if (fs.exists(dstDir) && !fs.isDirectory(dstDir)) { throw new IllegalArgumentException(dstDir + " is not a directory"); } if (logger.isDebugEnabled()) { logger.debug("Moving contents of: " + srcDir + " to: " + dstDir); } FileStatus[] files = fs.listStatus(srcDir); for (FileStatus file : files) { Path sourcePath = file.getPath(); Path targetPath = new Path(dstDir, file.getPath().getName()); if (logger.isDebugEnabled()) { logger.debug("Moving: " + sourcePath + " to: " + targetPath); } if (!fs.mkdirs(targetPath.getParent())) { throw new IOException("Failed at creating directory " + targetPath.getParent()); } if (!fs.rename(sourcePath, targetPath)) { throw new IOException("Failed at renaming " + sourcePath + " to " + targetPath); } } fs.delete(srcDir); }
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
/** * Closes the current part file.// w w w.ja v a 2 s . c o m * * <p> * This moves the current in-progress part file to a pending file and adds it to the list * of pending files in our bucket state. */ private void closeCurrentPartFile(BucketState<T> bucketState) throws Exception { if (bucketState.isWriterOpen) { bucketState.writer.close(); bucketState.isWriterOpen = false; } if (bucketState.currentFile != null) { Path currentPartPath = new Path(bucketState.currentFile); Path inProgressPath = new Path(currentPartPath.getParent(), inProgressPrefix + currentPartPath.getName()).suffix(inProgressSuffix); Path pendingPath = new Path(currentPartPath.getParent(), pendingPrefix + currentPartPath.getName()) .suffix(pendingSuffix); FileSystem fs = inProgressPath.getFileSystem(hadoopConf); fs.rename(inProgressPath, pendingPath); LOG.debug("Moving in-progress bucket {} to pending file {}", inProgressPath, pendingPath); bucketState.pendingFiles.add(currentPartPath.toString()); bucketState.currentFile = null; } }
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
@Override public void notifyCheckpointComplete(long checkpointId) throws Exception { synchronized (state.bucketStates) { Iterator<Map.Entry<String, BucketState<T>>> it = state.bucketStates.entrySet().iterator(); while (it.hasNext()) { BucketState<T> bucketState = it.next().getValue(); synchronized (bucketState.pendingFilesPerCheckpoint) { Set<Long> pastCheckpointIds = bucketState.pendingFilesPerCheckpoint.keySet(); Set<Long> checkpointsToRemove = new HashSet<>(); for (Long pastCheckpointId : pastCheckpointIds) { if (pastCheckpointId <= checkpointId) { LOG.debug("Moving pending files to final location for checkpoint {}", pastCheckpointId); // All the pending files are buckets that have been completed but are waiting to be renamed // to their final name for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) { Path finalPath = new Path(filename); Path pendingPath = new Path(finalPath.getParent(), pendingPrefix + finalPath.getName()).suffix(pendingSuffix); FileSystem fs = pendingPath.getFileSystem(hadoopConf); fs.rename(pendingPath, finalPath); LOG.debug(/* w ww . j av a2s . c o m*/ "Moving pending file {} to final location having completed checkpoint {}.", pendingPath, pastCheckpointId); } checkpointsToRemove.add(pastCheckpointId); } } if (!bucketState.isWriterOpen && bucketState.pendingFiles.isEmpty()) { // We've dealt with all the pending files and the writer for this bucket is not currently open. // Therefore this bucket is currently inactive and we can remove it from our state. it.remove(); } else { for (Long toRemove : checkpointsToRemove) { bucketState.pendingFilesPerCheckpoint.remove(toRemove); } } } } } }
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
@Override public void restoreState(State<T> state) { this.state = state; FileSystem fs; try {/*from w w w .j a v a 2 s .c om*/ fs = new Path(basePath).getFileSystem(HadoopFileSystem.getHadoopConfiguration()); } catch (IOException e) { LOG.error("Error while creating FileSystem in checkpoint restore.", e); throw new RuntimeException("Error while creating FileSystem in checkpoint restore.", e); } for (BucketState<T> bucketState : state.bucketStates.values()) { // we can clean all the pending files since they where renamed to final files // after this checkpoint was successful bucketState.pendingFiles.clear(); if (bucketState.currentFile != null) { // We were writing to a file when the last checkpoint occured. This file can either // be still in-progress or became a pending file at some point after the checkpoint. // Either way, we have to truncate it back to a valid state (or write a .valid-length) // file that specifies up to which length it is valid and rename it to the final name // before starting a new bucket file. Path partPath = new Path(bucketState.currentFile); try { Path partPendingPath = new Path(partPath.getParent(), pendingPrefix + partPath.getName()) .suffix(pendingSuffix); Path partInProgressPath = new Path(partPath.getParent(), inProgressPrefix + partPath.getName()) .suffix(inProgressSuffix); if (fs.exists(partPendingPath)) { LOG.debug( "In-progress file {} has been moved to pending after checkpoint, moving to final location.", partPath); // has been moved to pending in the mean time, rename to final location fs.rename(partPendingPath, partPath); } else if (fs.exists(partInProgressPath)) { LOG.debug("In-progress file {} is still in-progress, moving to final location.", partPath); // it was still in progress, rename to final path fs.rename(partInProgressPath, partPath); } else if (fs.exists(partPath)) { LOG.debug("In-Progress file {} was already moved to final location {}.", bucketState.currentFile, partPath); } else { LOG.debug( "In-Progress file {} was neither moved to pending nor is still in progress. Possibly, " + "it was moved to final location by a previous snapshot restore", bucketState.currentFile); } refTruncate = reflectTruncate(fs); // truncate it or write a ".valid-length" file to specify up to which point it is valid if (refTruncate != null) { LOG.debug("Truncating {} to valid length {}", partPath, bucketState.currentFileValidLength); // some-one else might still hold the lease from a previous try, we are // recovering, after all ... if (fs instanceof DistributedFileSystem) { DistributedFileSystem dfs = (DistributedFileSystem) fs; LOG.debug("Trying to recover file lease {}", partPath); dfs.recoverLease(partPath); boolean isclosed = dfs.isFileClosed(partPath); StopWatch sw = new StopWatch(); sw.start(); while (!isclosed) { if (sw.getTime() > asyncTimeout) { break; } try { Thread.sleep(500); } catch (InterruptedException e1) { // ignore it } isclosed = dfs.isFileClosed(partPath); } } Boolean truncated = (Boolean) refTruncate.invoke(fs, partPath, bucketState.currentFileValidLength); if (!truncated) { LOG.debug("Truncate did not immediately complete for {}, waiting...", partPath); // we must wait for the asynchronous truncate operation to complete StopWatch sw = new StopWatch(); sw.start(); long newLen = fs.getFileStatus(partPath).getLen(); while (newLen != bucketState.currentFileValidLength) { if (sw.getTime() > asyncTimeout) { break; } try { Thread.sleep(500); } catch (InterruptedException e1) { // ignore it } newLen = fs.getFileStatus(partPath).getLen(); } if (newLen != bucketState.currentFileValidLength) { throw new RuntimeException("Truncate did not truncate to right length. Should be " + bucketState.currentFileValidLength + " is " + newLen + "."); } } } else { LOG.debug("Writing valid-length file for {} to specify valid length {}", partPath, bucketState.currentFileValidLength); Path validLengthFilePath = new Path(partPath.getParent(), validLengthPrefix + partPath.getName()).suffix(validLengthSuffix); if (!fs.exists(validLengthFilePath)) { FSDataOutputStream lengthFileOut = fs.create(validLengthFilePath); lengthFileOut.writeUTF(Long.toString(bucketState.currentFileValidLength)); lengthFileOut.close(); } } // Now that we've restored the bucket to a valid state, reset the current file info bucketState.currentFile = null; bucketState.currentFileValidLength = -1; } catch (IOException e) { LOG.error("Error while restoring BucketingSink state.", e); throw new RuntimeException("Error while restoring BucketingSink state.", e); } catch (InvocationTargetException | IllegalAccessException e) { LOG.error("Cound not invoke truncate.", e); throw new RuntimeException("Could not invoke truncate.", e); } } LOG.debug("Clearing pending/in-progress files."); // Move files that are confirmed by a checkpoint but did not get moved to final location // because the checkpoint notification did not happen before a failure Set<Long> pastCheckpointIds = bucketState.pendingFilesPerCheckpoint.keySet(); LOG.debug("Moving pending files to final location on restore."); for (Long pastCheckpointId : pastCheckpointIds) { // All the pending files are buckets that have been completed but are waiting to be renamed // to their final name for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) { Path finalPath = new Path(filename); Path pendingPath = new Path(finalPath.getParent(), pendingPrefix + finalPath.getName()) .suffix(pendingSuffix); try { if (fs.exists(pendingPath)) { LOG.debug( "(RESTORE) Moving pending file {} to final location after complete checkpoint {}.", pendingPath, pastCheckpointId); fs.rename(pendingPath, finalPath); } } catch (IOException e) { LOG.error("(RESTORE) Error while renaming pending file {} to final path {}: {}", pendingPath, finalPath, e); throw new RuntimeException( "Error while renaming pending file " + pendingPath + " to final path " + finalPath, e); } } } synchronized (bucketState.pendingFilesPerCheckpoint) { bucketState.pendingFilesPerCheckpoint.clear(); } } // we need to get this here since open() has not yet been called int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask(); // delete pending files try { RemoteIterator<LocatedFileStatus> bucketFiles = fs.listFiles(new Path(basePath), true); while (bucketFiles.hasNext()) { LocatedFileStatus file = bucketFiles.next(); if (file.getPath().toString().endsWith(pendingSuffix)) { // only delete files that contain our subtask index if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) { LOG.debug("(RESTORE) Deleting pending file {}", file.getPath().toString()); fs.delete(file.getPath(), true); } } if (file.getPath().toString().endsWith(inProgressSuffix)) { // only delete files that contain our subtask index if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) { LOG.debug("(RESTORE) Deleting in-progress file {}", file.getPath().toString()); fs.delete(file.getPath(), true); } } } } catch (IOException e) { LOG.error("Error while deleting old pending files: {}", e); throw new RuntimeException("Error while deleting old pending files.", e); } }