List of usage examples for the org.apache.hadoop.fs.PathFilter interface
PathFilter
From source file:gobblin.compaction.mapreduce.avro.AvroKeyCompactorOutputCommitter.java
License:Apache License
/** * Commits the task, moving files to their final committed location by delegating to * {@link FileOutputCommitter} to perform the actual moving. First, renames the * files to include the count of records contained within the file and a timestamp, * in the form {recordCount}.{timestamp}.avro. Then, the files are moved to their * committed location./*from w w w.ja v a2s. com*/ */ @Override public void commitTask(TaskAttemptContext context) throws IOException { Path workPath = getWorkPath(); FileSystem fs = workPath.getFileSystem(context.getConfiguration()); if (fs.exists(workPath)) { long recordCount = getRecordCountFromCounter(context, AvroKeyDedupReducer.EVENT_COUNTER.RECORD_COUNT); String fileNamePrefix; if (recordCount == 0) { // recordCount == 0 indicates that it is a map-only, non-dedup job, and thus record count should // be obtained from mapper counter. fileNamePrefix = CompactionRecordCountProvider.M_OUTPUT_FILE_PREFIX; recordCount = getRecordCountFromCounter(context, AvroKeyMapper.EVENT_COUNTER.RECORD_COUNT); } else { fileNamePrefix = CompactionRecordCountProvider.MR_OUTPUT_FILE_PREFIX; } String fileName = CompactionRecordCountProvider.constructFileName(fileNamePrefix, recordCount); for (FileStatus status : fs.listStatus(workPath, new PathFilter() { @Override public boolean accept(Path path) { return FilenameUtils.isExtension(path.getName(), "avro"); } })) { Path newPath = new Path(status.getPath().getParent(), fileName); LOG.info(String.format("Renaming %s to %s", status.getPath(), newPath)); fs.rename(status.getPath(), newPath); } } super.commitTask(context); }
From source file:gobblin.compaction.mapreduce.MRCompactorJobRunner.java
License:Apache License
/**
 * Lists the files found recursively under the given dataDir whose names end with one of the
 * extensions returned by {@link #getApplicableFileExtensions()}.
 *
 * <p>
 *   The result is obtained through applicablePathCache, keyed by dataDir; the listing itself
 *   only runs inside the loader handed to the cache. A dataDir that does not exist yields an
 *   empty list.
 * </p>
 *
 * @param dataDir the directory to search
 * @return the paths of the matching files
 * @throws IOException wrapping the {@link ExecutionException} raised when the loader fails
 */
private List<Path> getApplicableFilePaths(final Path dataDir) throws IOException {
  // Accepts a path as soon as its name ends with any applicable extension.
  final PathFilter extensionFilter = new PathFilter() {
    @Override
    public boolean accept(Path candidate) {
      for (String extension : getApplicableFileExtensions()) {
        if (candidate.getName().endsWith(extension)) {
          return true;
        }
      }
      return false;
    }
  };

  Callable<List<Path>> loader = new Callable<List<Path>>() {
    @Override
    public List<Path> call() throws Exception {
      List<Path> matches = Lists.newArrayList();
      if (MRCompactorJobRunner.this.fs.exists(dataDir)) {
        for (FileStatus status : FileListUtils.listFilesRecursively(MRCompactorJobRunner.this.fs, dataDir,
            extensionFilter)) {
          matches.add(status.getPath());
        }
      }
      return matches;
    }
  };

  try {
    return applicablePathCache.get(dataDir, loader);
  } catch (ExecutionException e) {
    throw new IOException(e);
  }
}
From source file:gobblin.filesystem.MetricsFileSystemInstrumentationTest.java
License:Apache License
@Test(enabled = false) public void testListStatusPathWithFilter() throws IOException, URISyntaxException { HDFSRoot hdfsRoot = new HDFSRoot("/tmp/ListStatusPathWithFilter"); MetricsFileSystemInstrumentation fs = (MetricsFileSystemInstrumentation) FileSystem .get(new URI(instrumentedURI), new Configuration()); FileStatus[] status = fs.listStatus(hdfsRoot.getDirPath3(), new PathFilter() { @Override// w ww. j ava 2 s. c om public boolean accept(Path path) { return path.toString().endsWith(".ext"); } }); Assert.assertEquals(fs.listStatusTimer.getCount(), 1); Assert.assertEquals(status.length, 2); hdfsRoot.cleanupRoot(); }
From source file:gobblin.filesystem.MetricsFileSystemInstrumentationTest.java
License:Apache License
@Test(enabled = false) public void testListStatusPathsWithFilter() throws IOException, URISyntaxException { HDFSRoot hdfsRoot = new HDFSRoot("/tmp/ListStatusPathsWithFilter"); MetricsFileSystemInstrumentation fs = (MetricsFileSystemInstrumentation) FileSystem .get(new URI(instrumentedURI), new Configuration()); Path[] paths = { hdfsRoot.filePath2, hdfsRoot.dirPath2, hdfsRoot.dirPath3 }; FileStatus[] status = fs.listStatus(paths, new PathFilter() { @Override// w ww . j a va 2 s .c o m public boolean accept(Path path) { return path.toString().endsWith(".ext"); } }); Assert.assertEquals(fs.listStatusTimer.getCount(), 3); Assert.assertEquals(status.length, 2); hdfsRoot.cleanupRoot(); }
From source file:gobblin.filesystem.MetricsFileSystemInstrumentationTest.java
License:Apache License
/**
 * Globs two levels below the test root through the instrumented file system with a
 * {@link PathFilter} that keeps paths ending in ".ext", then expects the globStatus timer to
 * have recorded exactly one call and the result to contain two entries.
 */
@Test(enabled = false)
public void testGlobStatusWithFilter() throws IOException, URISyntaxException {
  HDFSRoot hdfsRoot = new HDFSRoot("/tmp/GlobStatusWithFilter");
  try {
    MetricsFileSystemInstrumentation fs = (MetricsFileSystemInstrumentation) FileSystem
        .get(new URI(instrumentedURI), new Configuration());

    FileStatus[] status = fs.globStatus(new Path("/tmp/GlobStatusWithFilter/*/*"), new PathFilter() {
      @Override
      public boolean accept(Path path) {
        return path.toString().endsWith(".ext");
      }
    });

    Assert.assertEquals(fs.globStatusTimer.getCount(), 1);
    Assert.assertEquals(status.length, 2);
  } finally {
    // Run the cleanup even when an assertion or file-system call above fails, so a failing
    // run does not leave its fixture behind for the next one.
    hdfsRoot.cleanupRoot();
  }
}
From source file:gobblin.hive.policy.HiveSnapshotRegistrationPolicy.java
License:Apache License
/** * Get the latest snapshot in the given {@link Path}. * * <p>/*from www .ja v a2 s. c o m*/ * The lastest snapshot is a sub-directory of the input {@link Path} that has the largest folder * name alphabetically. If property {@link #SNAPSHOT_PATH_PATTERN} is set, only those sub-directories * whose full path matches the given pattern are considered. * </p> */ protected Path getLatestSnapshot(Path path) throws IOException { FileStatus statuses[] = this.fs.listStatus(path, new PathFilter() { @Override public boolean accept(Path p) { try { if (!HiveSnapshotRegistrationPolicy.this.fs.isDirectory(p)) { return false; } } catch (IOException e) { throw Throwables.propagate(e); } return !HiveSnapshotRegistrationPolicy.this.snapshotPathPattern.isPresent() || HiveSnapshotRegistrationPolicy.this.snapshotPathPattern.get().matcher(p.toString()) .matches(); } }); if (statuses.length == 0) { return null; } Arrays.sort(statuses, new Comparator<FileStatus>() { @Override public int compare(FileStatus o1, FileStatus o2) { return o2.getPath().getName().compareTo(o1.getPath().getName()); } }); return statuses[0].getPath(); }
From source file:gobblin.runtime.FsDatasetStateStore.java
License:Apache License
/** * Get a {@link Map} from dataset URNs to the latest {@link JobState.DatasetState}s. * * @param jobName the job name//from w w w .j a va2 s . co m * @return a {@link Map} from dataset URNs to the latest {@link JobState.DatasetState}s * @throws IOException if there's something wrong reading the {@link JobState.DatasetState}s */ public Map<String, JobState.DatasetState> getLatestDatasetStatesByUrns(final String jobName) throws IOException { Path stateStorePath = new Path(this.storeRootDir, jobName); if (!this.fs.exists(stateStorePath)) { return ImmutableMap.of(); } FileStatus[] stateStoreFileStatuses = this.fs.listStatus(stateStorePath, new PathFilter() { @Override public boolean accept(Path path) { return path.getName() .endsWith(CURRENT_DATASET_STATE_FILE_SUFFIX + DATASET_STATE_STORE_TABLE_SUFFIX); } }); if (stateStoreFileStatuses == null || stateStoreFileStatuses.length == 0) { return ImmutableMap.of(); } final Map<String, JobState.DatasetState> datasetStatesByUrns = new ConcurrentHashMap<>(); Iterator<Callable<Void>> callableIterator = Iterators.transform( Arrays.asList(stateStoreFileStatuses).iterator(), new Function<FileStatus, Callable<Void>>() { @Override public Callable<Void> apply(final FileStatus stateStoreFileStatus) { return new Callable<Void>() { @Override public Void call() throws Exception { Path stateStoreFilePath = stateStoreFileStatus.getPath(); LOGGER.info("Getting dataset states from: {}", stateStoreFilePath); List<JobState.DatasetState> previousDatasetStates = getAll(jobName, stateStoreFilePath.getName()); if (!previousDatasetStates.isEmpty()) { // There should be a single dataset state on the list if the list is not empty JobState.DatasetState previousDatasetState = previousDatasetStates.get(0); datasetStatesByUrns.put(previousDatasetState.getDatasetUrn(), previousDatasetState); } return null; } }; } }); try { List<Either<Void, ExecutionException>> results = new IteratorExecutor<>(callableIterator, this.threadPoolOfGettingDatasetState, 
ExecutorsUtils.newDaemonThreadFactory(Optional.of(LOGGER), Optional.of("GetFsDatasetStateStore-"))).executeAndGetResults(); int maxNumberOfErrorLogs = 10; IteratorExecutor.logFailures(results, LOGGER, maxNumberOfErrorLogs); } catch (InterruptedException e) { throw new IOException("Failed to get latest dataset states.", e); } // The dataset (job) state from the deprecated "current.jst" will be read even though // the job has transitioned to the new dataset-based mechanism if (datasetStatesByUrns.size() > 1) { datasetStatesByUrns.remove(ConfigurationKeys.DEFAULT_DATASET_URN); } return datasetStatesByUrns; }
From source file:gobblin.runtime.local.LocalJobManager.java
License:Open Source License
/** * Restore the lastJobIdMap.// w w w . ja v a2s . co m */ private void restoreLastJobIdMap() throws IOException { FileSystem fs = FileSystem.get( URI.create(this.properties.getProperty(ConfigurationKeys.STATE_STORE_FS_URI_KEY)), new Configuration()); // Root directory of task states store Path taskStateStoreRootDir = new Path( this.properties.getProperty(ConfigurationKeys.STATE_STORE_ROOT_DIR_KEY)); if (!fs.exists(taskStateStoreRootDir)) { return; } // List subdirectories (one for each job) under the root directory FileStatus[] rootStatuses = fs.listStatus(taskStateStoreRootDir); if (rootStatuses == null || rootStatuses.length == 0) { return; } LOG.info("Restoring the mapping between jobs and IDs of their last runs"); for (FileStatus status : rootStatuses) { // List the task states files under each subdirectory corresponding to a job FileStatus[] statuses = fs.listStatus(status.getPath(), new PathFilter() { @Override public boolean accept(Path path) { return !path.getName().startsWith("current") && path.getName().endsWith(TASK_STATE_STORE_TABLE_SUFFIX); } }); if (statuses == null || statuses.length == 0) { continue; } // Sort the task states files by timestamp in descending order Arrays.sort(statuses, new Comparator<FileStatus>() { @Override public int compare(FileStatus fileStatus1, FileStatus fileStatus2) { String fileName1 = fileStatus1.getPath().getName(); String taskId1 = fileName1.substring(0, fileName1.indexOf('.')); String fileName2 = fileStatus2.getPath().getName(); String taskId2 = fileName2.substring(0, fileName2.indexOf('.')); Long ts1 = Long.parseLong(taskId1.substring(taskId1.lastIndexOf('_') + 1)); Long ts2 = Long.parseLong(taskId2.substring(taskId2.lastIndexOf('_') + 1)); return -ts1.compareTo(ts2); } }); // Each subdirectory is for one job, and the directory name is the job name. 
String jobName = status.getPath().getName(); // The first task states file after sorting has the latest timestamp String fileName = statuses[0].getPath().getName(); String lastJobId = fileName.substring(0, fileName.indexOf('.')); LOG.info(String.format("Restored last job ID %s for job %s", lastJobId, jobName)); this.lastJobIdMap.put(jobName, lastJobId); } }
From source file:gobblin.source.DatePartitionedAvroFileSource.java
License:Apache License
/** * This method is to filter out the .avro files that need to be processed. * @return the pathFilter/*from w w w. j a v a 2 s . c o m*/ */ private static PathFilter getFileFilter() { return new PathFilter() { @Override public boolean accept(Path path) { return path.getName().endsWith(AVRO_SUFFIX); } }; }
From source file:gobblin.source.DatePartitionedDailyAvroSource.java
License:Open Source License
/**
 * Builds the filter used to pick the files that need to be processed: a path is accepted
 * when its name ends with AVRO_SUFFIX.
 *
 * @return the pathFilter
 */
private PathFilter getFileFilter() {
  PathFilter avroSuffixFilter = new PathFilter() {
    @Override
    public boolean accept(Path candidate) {
      String name = candidate.getName();
      return name.endsWith(AVRO_SUFFIX);
    }
  };
  return avroSuffixFilter;
}