List of usage examples for org.apache.hadoop.fs.FileStatus#getModificationTime
public long getModificationTime()
From source file:gobblin.compaction.mapreduce.MRCompactorTimeBasedJobPropCreator.java
License:Open Source License
/**
 * Collects the files under {@code inputFolder} that were modified after the timestamp
 * recorded in the compaction-complete file of {@code outputFolder}.
 *
 * <p>The timestamp is read as a single {@code long} from the file named
 * {@link ConfigurationKeys#COMPACTION_COMPLETE_FILE_NAME} inside {@code outputFolder}.
 * An {@link IOException} raised while reading it or while listing {@code inputFolder} is
 * logged and not propagated; whatever was collected up to that point is returned.
 *
 * @param inputFolder folder that is listed recursively for candidate files
 * @param outputFolder folder holding the compaction-complete file
 * @return paths of the files modified after the recorded time; empty when there are none
 * @throws IOException if closing the completion file stream fails
 */
private List<Path> getNewDataInFolder(Path inputFolder, Path outputFolder) throws IOException {
    List<Path> recentlyModified = Lists.newArrayList();
    Path completionFile = new Path(outputFolder, ConfigurationKeys.COMPACTION_COMPLETE_FILE_NAME);

    Closer closer = Closer.create();
    try {
        FSDataInputStream in = closer.register(this.fs.open(completionFile));
        DateTime lastCompactionTime = new DateTime(in.readLong(), this.timeZone);

        for (FileStatus candidate : HadoopUtils.listStatusRecursive(this.fs, inputFolder)) {
            DateTime modifiedAt = new DateTime(candidate.getModificationTime(), this.timeZone);
            if (lastCompactionTime.isBefore(modifiedAt)) {
                recentlyModified.add(candidate.getPath());
            }
        }

        if (!recentlyModified.isEmpty()) {
            String message = String.format(
                    "Found %d new files within folder %s which are more recent than the previous "
                            + "compaction start time of %s.",
                    recentlyModified.size(), inputFolder, lastCompactionTime);
            LOG.info(message);
        }
    } catch (IOException e) {
        // Logged and swallowed: the caller receives the files gathered so far.
        LOG.error("Failed to check for new data within folder: " + inputFolder, e);
    } catch (Throwable e) {
        throw closer.rethrow(e);
    } finally {
        closer.close();
    }
    return recentlyModified;
}
From source file:gobblin.data.management.copy.CopyableFileTest.java
License:Apache License
@Test public void testCopyableFileBuilderMinimumConfiguration() throws IOException { // Source//from w ww . ja va2s . com String datasetRootDir = "/data/databases/source"; Path datasetRoot = new Path(datasetRootDir); FileSystem originFS = FileSystem.getLocal(new Configuration()); Path originFile = new Path(datasetRootDir, "copyableFile"); FileStatus origin = new FileStatus(0l, false, 0, 0l, System.currentTimeMillis(), originFile); PreserveAttributes preserveAttributes = PreserveAttributes.fromMnemonicString("ugp"); // Target String targetRoot = "/data/databases/destination"; Path relativePath = PathUtils.relativizePath(originFile, datasetRoot); Path targetPath = new Path(targetRoot, relativePath); Properties properties = new Properties(); properties.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/publisher"); CopyConfiguration copyConfiguration = CopyConfiguration .builder(FileSystem.getLocal(new Configuration()), properties).preserve(preserveAttributes).build(); CopyableFile copyableFile = CopyableFile.builder(originFS, origin, datasetRoot, copyConfiguration) .destination(targetPath).ancestorsOwnerAndPermission(Lists.<OwnerAndPermission>newArrayList()) // not testing ancestors .build(); // Making sure all fields are populated correctly via CopyableFile builder // Verify preserve attribute options Assert.assertEquals(copyableFile.getPreserve().toMnemonicString(), preserveAttributes.toMnemonicString()); // Verify origin Assert.assertEquals(copyableFile.getFileSet(), ""); Assert.assertEquals(copyableFile.getOrigin(), origin); // Verify destination target, permissions and other attributes Assert.assertEquals(copyableFile.getChecksum().length, 0); Assert.assertEquals(copyableFile.getDestination().toString(), targetPath.toString()); Assert.assertEquals(copyableFile.getDestinationOwnerAndPermission().getGroup(), origin.getGroup()); Assert.assertEquals(copyableFile.getDestinationOwnerAndPermission().getOwner(), origin.getOwner()); 
Assert.assertEquals(copyableFile.getDestinationOwnerAndPermission().getFsPermission(), origin.getPermission()); // Verify auto determined timestamp Assert.assertEquals(copyableFile.getOriginTimestamp(), origin.getModificationTime()); Assert.assertEquals(copyableFile.getUpstreamTimestamp(), origin.getModificationTime()); }
From source file:gobblin.data.management.copy.hive.HiveCopyEntityHelper.java
License:Apache License
/**
 * Decides whether {@code replacementFile} should take the place of {@code referencePath}.
 *
 * @param referencePath status of the file currently in place
 * @param replacementFile status of the candidate replacement
 * @return {@code true} when the two lengths differ, or when the reference file's
 *         modification time is strictly earlier than the replacement's
 */
private static boolean shouldReplaceFile(FileStatus referencePath, FileStatus replacementFile) {
    if (replacementFile.getLen() != referencePath.getLen()) {
        return true;
    }
    return referencePath.getModificationTime() < replacementFile.getModificationTime();
}
From source file:gobblin.data.management.copy.recovery.RecoveryHelper.java
License:Apache License
/**
 * Removes every entry directly under the persist directory whose modification time is more
 * than {@code retentionHours} hours in the past (see {@link #PERSIST_RETENTION_KEY}).
 *
 * <p>Does nothing except log when no persist directory is configured or it does not exist.
 * Entries are deleted recursively; a delete that reports failure is logged as a warning.
 *
 * @throws IOException if checking, listing or deleting on the file system fails
 */
public void purgeOldPersistedFile() throws IOException {
    boolean persistDirAvailable = this.persistDir.isPresent() && this.fs.exists(this.persistDir.get());
    if (!persistDirAvailable) {
        log.info("No persist directory to clean.");
        return;
    }

    long retentionMillis = TimeUnit.HOURS.toMillis(this.retentionHours);
    long now = System.currentTimeMillis();

    for (FileStatus entry : this.fs.listStatus(this.persistDir.get())) {
        long ageMillis = now - entry.getModificationTime();
        if (ageMillis <= retentionMillis) {
            // Still inside the retention window.
            continue;
        }
        boolean deleted = this.fs.delete(entry.getPath(), true);
        if (!deleted) {
            log.warn("Failed to delete path " + entry.getPath());
        }
    }
}
From source file:gobblin.data.management.copy.RecursiveCopyableDataset.java
License:Apache License
/**
 * Tells whether the target copy can be treated as the same file as the source.
 *
 * @param fileInSource status of the file on the source side
 * @param fileInTarget status of the file on the target side
 * @return {@code true} when both lengths are equal and the source's modification time is
 *         not later than the target's
 */
private static boolean sameFile(FileStatus fileInSource, FileStatus fileInTarget) {
    if (fileInTarget.getLen() != fileInSource.getLen()) {
        return false;
    }
    return fileInSource.getModificationTime() <= fileInTarget.getModificationTime();
}
From source file:gobblin.data.management.copy.replication.ConfigBasedDataset.java
License:Apache License
@Override public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration copyConfiguration) throws IOException { List<CopyEntity> copyableFiles = Lists.newArrayList(); EndPoint copyFromRaw = copyRoute.getCopyFrom(); EndPoint copyToRaw = copyRoute.getCopyTo(); if (!(copyFromRaw instanceof HadoopFsEndPoint && copyToRaw instanceof HadoopFsEndPoint)) { log.warn("Currently only handle the Hadoop Fs EndPoint replication"); return copyableFiles; }/* ww w . j av a 2s. c o m*/ if ((!copyFromRaw.getWatermark().isPresent() && copyToRaw.getWatermark().isPresent()) || (copyFromRaw.getWatermark().isPresent() && copyToRaw.getWatermark().isPresent() && copyFromRaw.getWatermark().get().compareTo(copyToRaw.getWatermark().get()) <= 0)) { log.info( "No need to copy as destination watermark >= source watermark with source watermark {}, for dataset with metadata {}", copyFromRaw.getWatermark().isPresent() ? copyFromRaw.getWatermark().get().toJson() : "N/A", this.rc.getMetaData()); return copyableFiles; } HadoopFsEndPoint copyFrom = (HadoopFsEndPoint) copyFromRaw; HadoopFsEndPoint copyTo = (HadoopFsEndPoint) copyToRaw; Configuration conf = HadoopUtils.newConfiguration(); FileSystem copyFromFs = FileSystem.get(copyFrom.getFsURI(), conf); FileSystem copyToFs = FileSystem.get(copyTo.getFsURI(), conf); Collection<FileStatus> allFilesInSource = copyFrom.getFiles(); Collection<FileStatus> allFilesInTarget = copyTo.getFiles(); final PathFilter pathFilter = DatasetUtils.instantiatePathFilter(this.props); Predicate<FileStatus> predicate = new Predicate<FileStatus>() { @Override public boolean apply(FileStatus input) { return pathFilter.accept(input.getPath()); } }; Set<FileStatus> copyFromFileStatuses = Sets.newHashSet(Collections2.filter(allFilesInSource, predicate)); Map<Path, FileStatus> copyToFileMap = Maps.newHashMap(); for (FileStatus f : allFilesInTarget) { if (pathFilter.accept(f.getPath())) { 
copyToFileMap.put(PathUtils.getPathWithoutSchemeAndAuthority(f.getPath()), f); } } Collection<Path> deletedPaths = Lists.newArrayList(); boolean watermarkMetadataCopied = false; boolean deleteTargetIfNotExistOnSource = rc.isDeleteTargetIfNotExistOnSource(); for (FileStatus originFileStatus : copyFromFileStatuses) { Path relative = PathUtils.relativizePath( PathUtils.getPathWithoutSchemeAndAuthority(originFileStatus.getPath()), PathUtils.getPathWithoutSchemeAndAuthority(copyFrom.getDatasetPath())); // construct the new path in the target file system Path newPath = new Path(copyTo.getDatasetPath(), relative); if (relative.toString().equals(ReplicaHadoopFsEndPoint.WATERMARK_FILE)) { watermarkMetadataCopied = true; } // skip copy same file if (copyToFileMap.containsKey(newPath) && copyToFileMap.get(newPath).getLen() == originFileStatus.getLen() && copyToFileMap.get(newPath).getModificationTime() > originFileStatus.getModificationTime()) { log.debug( "Copy from timestamp older than copy to timestamp, skipped copy {} for dataset with metadata {}", originFileStatus.getPath(), this.rc.getMetaData()); } else { // need to remove those files in the target File System if (copyToFileMap.containsKey(newPath)) { deletedPaths.add(newPath); } copyableFiles.add(CopyableFile .fromOriginAndDestination(copyFromFs, originFileStatus, copyToFs.makeQualified(newPath), copyConfiguration) .fileSet(PathUtils.getPathWithoutSchemeAndAuthority(copyTo.getDatasetPath()).toString()) .build()); } // clean up already checked paths copyToFileMap.remove(newPath); } // delete the paths on target directory if NOT exists on source if (deleteTargetIfNotExistOnSource) { deletedPaths.addAll(copyToFileMap.keySet()); } // delete old files first if (!deletedPaths.isEmpty()) { DeleteFileCommitStep deleteCommitStep = DeleteFileCommitStep.fromPaths(copyToFs, deletedPaths, this.props); copyableFiles.add(new PrePublishStep(copyTo.getDatasetPath().toString(), Maps.<String, String>newHashMap(), deleteCommitStep, 0)); 
} // generate the watermark file if ((!watermarkMetadataCopied) && copyFrom.getWatermark().isPresent()) { copyableFiles .add(new PostPublishStep(copyTo.getDatasetPath().toString(), Maps.<String, String>newHashMap(), new WatermarkMetadataGenerationCommitStep(copyTo.getFsURI().toString(), copyTo.getDatasetPath(), copyFrom.getWatermark().get()), 1)); } return copyableFiles; }
From source file:gobblin.data.management.copy.replication.SourceHadoopFsEndPoint.java
License:Apache License
/**
 * Returns the watermark of this source end point, computing it on the first call and
 * returning the cached value afterwards.
 *
 * <p>The watermark is the largest modification time among all files listed recursively
 * under the valid paths chosen by {@link ReplicationDataValidPathPicker}; the listed files
 * are also accumulated into {@code allFileStatus}. When no file is found the watermark
 * wraps {@code -1}.
 *
 * <p>If listing fails with an {@link IOException}, the error is logged together with its
 * cause and the current value of {@code cachedWatermark} is returned unchanged.
 *
 * @return the cached watermark
 */
@Override
public synchronized Optional<ComparableWatermark> getWatermark() {
    if (this.initialized) {
        return this.cachedWatermark;
    }

    // Flag is set before the computation, so a failed attempt is not retried on later calls.
    this.initialized = true;

    try {
        long curTs = -1;
        FileSystem fs = FileSystem.get(rc.getFsURI(), new Configuration());

        Collection<Path> validPaths = ReplicationDataValidPathPicker.getValidPaths(this);
        for (Path p : validPaths) {
            this.allFileStatus.addAll(FileListUtils.listFilesRecursively(fs, p));
        }

        for (FileStatus f : this.allFileStatus) {
            curTs = Math.max(curTs, f.getModificationTime());
        }

        ComparableWatermark result = new LongWatermark(curTs);
        this.cachedWatermark = Optional.of(result);
        return this.cachedWatermark;
    } catch (IOException e) {
        // Pass the exception to the logger so the cause and stack trace are not lost.
        log.error("Error while retrieve the watermark for " + this, e);
        return this.cachedWatermark;
    }
}
From source file:gobblin.data.management.retention.policy.RawDatasetRetentionPolicy.java
License:Apache License
private Optional<Long> getLatestModTime(Iterable<Path> paths) throws IOException { long latestModTime = Long.MIN_VALUE; for (FileStatus status : FileListUtils.listMostNestedPathRecursively(this.fs, paths)) { latestModTime = Math.max(latestModTime, status.getModificationTime()); }//from ww w. j av a2 s . c om return latestModTime == Long.MIN_VALUE ? Optional.<Long>absent() : Optional.of(latestModTime); }
From source file:gobblin.data.management.version.finder.FileLevelTimestampVersionFinder.java
License:Apache License
@Override public Collection<TimestampedDatasetVersion> findDatasetVersions(Dataset dataset) { FileSystemDataset fsDataset = (FileSystemDataset) dataset; try {//from w ww .j av a 2 s.c o m List<TimestampedDatasetVersion> timestampedVersions = Lists.newArrayList(); for (FileStatus fileStatus : FileListUtils.listMostNestedPathRecursively(this.fs, fsDataset.datasetRoot())) { timestampedVersions.add(new TimestampedDatasetVersion( new DateTime(fileStatus.getModificationTime()), fileStatus.getPath())); } return timestampedVersions; } catch (IOException e) { LOGGER.warn("Failed to get ModifiedTimeStamp for candidate dataset version at " + fsDataset.datasetRoot() + ". Ignoring."); return Lists.newArrayList(); } }
From source file:gobblin.data.management.version.finder.ModDateTimeDatasetVersionFinder.java
License:Apache License
/**
 * Returns a single version for the dataset, stamped with the modification time of the
 * dataset root as reported by the file system.
 *
 * @param dataset dataset to inspect; cast to {@link FileSystemDataset}
 * @return a one-element collection holding the version for the dataset root
 * @throws IOException if the status of the dataset root cannot be read
 */
@Override
public Collection<TimestampedDatasetVersion> findDatasetVersions(Dataset dataset) throws IOException {
    FileSystemDataset fsDataset = (FileSystemDataset) dataset;

    FileStatus rootStatus = this.fs.getFileStatus(fsDataset.datasetRoot());
    DateTime rootModifiedAt = new DateTime(rootStatus.getModificationTime());

    Collection<TimestampedDatasetVersion> versions = Lists.newArrayList();
    versions.add(new TimestampedDatasetVersion(rootModifiedAt, fsDataset.datasetRoot()));
    return versions;
}