Example usage for org.apache.hadoop.fs.PathFilter

Introduction

On this page you can find example usages of org.apache.hadoop.fs.PathFilter.

Prototype

PathFilter

Source Link

Usage

From source file:gobblin.compaction.mapreduce.avro.AvroKeyCompactorOutputCommitter.java

License:Apache License

/**
 * Commits the task, moving files to their final committed location by delegating to
 * {@link FileOutputCommitter} to perform the actual moving. First, renames the
 * files to include the count of records contained within the file and a timestamp,
 * in the form {recordCount}.{timestamp}.avro. Then, the files are moved to their
 * committed location./*from w  w  w.ja  v  a2s.  com*/
 */
@Override
public void commitTask(TaskAttemptContext context) throws IOException {
    Path workPath = getWorkPath();
    FileSystem fs = workPath.getFileSystem(context.getConfiguration());

    if (fs.exists(workPath)) {
        long recordCount = getRecordCountFromCounter(context, AvroKeyDedupReducer.EVENT_COUNTER.RECORD_COUNT);
        String fileNamePrefix;
        if (recordCount == 0) {

            // recordCount == 0 indicates that it is a map-only, non-dedup job, and thus record count should
            // be obtained from mapper counter.
            fileNamePrefix = CompactionRecordCountProvider.M_OUTPUT_FILE_PREFIX;
            recordCount = getRecordCountFromCounter(context, AvroKeyMapper.EVENT_COUNTER.RECORD_COUNT);
        } else {
            fileNamePrefix = CompactionRecordCountProvider.MR_OUTPUT_FILE_PREFIX;
        }
        String fileName = CompactionRecordCountProvider.constructFileName(fileNamePrefix, recordCount);

        for (FileStatus status : fs.listStatus(workPath, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return FilenameUtils.isExtension(path.getName(), "avro");
            }
        })) {
            Path newPath = new Path(status.getPath().getParent(), fileName);
            LOG.info(String.format("Renaming %s to %s", status.getPath(), newPath));
            fs.rename(status.getPath(), newPath);
        }
    }

    super.commitTask(context);
}

From source file:gobblin.compaction.mapreduce.MRCompactorJobRunner.java

License:Apache License

/**
 * Returns the files found recursively under {@code dataDir} whose names end with one of the
 * extensions supplied by {@link #getApplicableFileExtensions()}.
 *
 * <p>The lookup goes through {@code applicablePathCache}, keyed by {@code dataDir}; the listing
 * itself is performed by the loader handed to the cache.
 *
 * @param dataDir the directory to search
 * @return the matching file paths, or an empty list if {@code dataDir} does not exist
 * @throws IOException wrapping the {@link ExecutionException} raised when the loader fails
 */
private List<Path> getApplicableFilePaths(final Path dataDir) throws IOException {
    final Callable<List<Path>> pathLoader = new Callable<List<Path>>() {

        @Override
        public List<Path> call() throws Exception {
            List<Path> matchingPaths = Lists.newArrayList();
            if (!MRCompactorJobRunner.this.fs.exists(dataDir)) {
                // Nothing to list: hand back the still-empty list.
                return matchingPaths;
            }

            // Accepts a path as soon as its name ends with any applicable extension.
            PathFilter extensionFilter = new PathFilter() {
                @Override
                public boolean accept(Path candidate) {
                    for (String extension : getApplicableFileExtensions()) {
                        if (candidate.getName().endsWith(extension)) {
                            return true;
                        }
                    }
                    return false;
                }
            };

            for (FileStatus found : FileListUtils.listFilesRecursively(MRCompactorJobRunner.this.fs,
                    dataDir, extensionFilter)) {
                matchingPaths.add(found.getPath());
            }
            return matchingPaths;
        }
    };

    try {
        return applicablePathCache.get(dataDir, pathLoader);
    } catch (ExecutionException e) {
        throw new IOException(e);
    }
}

From source file:gobblin.filesystem.MetricsFileSystemInstrumentationTest.java

License:Apache License

/**
 * Lists a single directory through the instrumented file system using a {@link PathFilter} that
 * keeps only paths ending in {@code .ext}, then checks that the list-status timer recorded one
 * call and that two entries were returned.
 *
 * <p>The fixture is cleaned up in a {@code finally} block so a failing assertion does not leave
 * the test directory behind.
 */
@Test(enabled = false)
public void testListStatusPathWithFilter() throws IOException, URISyntaxException {
    HDFSRoot hdfsRoot = new HDFSRoot("/tmp/ListStatusPathWithFilter");
    try {
        MetricsFileSystemInstrumentation fs = (MetricsFileSystemInstrumentation) FileSystem
                .get(new URI(instrumentedURI), new Configuration());
        FileStatus[] status = fs.listStatus(hdfsRoot.getDirPath3(), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().endsWith(".ext");
            }
        });
        Assert.assertEquals(fs.listStatusTimer.getCount(), 1);
        Assert.assertEquals(status.length, 2);
    } finally {
        // Always remove the fixture, even when an assertion above fails.
        hdfsRoot.cleanupRoot();
    }
}

From source file:gobblin.filesystem.MetricsFileSystemInstrumentationTest.java

License:Apache License

/**
 * Lists three paths in one call through the instrumented file system using a {@link PathFilter}
 * that keeps only paths ending in {@code .ext}, then checks that the list-status timer recorded
 * three calls and that two entries were returned.
 *
 * <p>The fixture is cleaned up in a {@code finally} block so a failing assertion does not leave
 * the test directory behind.
 */
@Test(enabled = false)
public void testListStatusPathsWithFilter() throws IOException, URISyntaxException {
    HDFSRoot hdfsRoot = new HDFSRoot("/tmp/ListStatusPathsWithFilter");
    try {
        MetricsFileSystemInstrumentation fs = (MetricsFileSystemInstrumentation) FileSystem
                .get(new URI(instrumentedURI), new Configuration());

        Path[] paths = { hdfsRoot.filePath2, hdfsRoot.dirPath2, hdfsRoot.dirPath3 };
        FileStatus[] status = fs.listStatus(paths, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().endsWith(".ext");
            }
        });

        Assert.assertEquals(fs.listStatusTimer.getCount(), 3);
        Assert.assertEquals(status.length, 2);
    } finally {
        // Always remove the fixture, even when an assertion above fails.
        hdfsRoot.cleanupRoot();
    }
}

From source file:gobblin.filesystem.MetricsFileSystemInstrumentationTest.java

License:Apache License

/**
 * Globs {@code /tmp/GlobStatusWithFilter/*}{@code /*} through the instrumented file system using
 * a {@link PathFilter} that keeps only paths ending in {@code .ext}, then checks that the
 * glob-status timer recorded one call and that two entries were returned.
 *
 * <p>The fixture is cleaned up in a {@code finally} block so a failing assertion does not leave
 * the test directory behind.
 */
@Test(enabled = false)
public void testGlobStatusWithFilter() throws IOException, URISyntaxException {
    HDFSRoot hdfsRoot = new HDFSRoot("/tmp/GlobStatusWithFilter");
    try {
        MetricsFileSystemInstrumentation fs = (MetricsFileSystemInstrumentation) FileSystem
                .get(new URI(instrumentedURI), new Configuration());

        FileStatus[] status = fs.globStatus(new Path("/tmp/GlobStatusWithFilter/*/*"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().endsWith(".ext");
            }
        });
        Assert.assertEquals(fs.globStatusTimer.getCount(), 1);
        Assert.assertEquals(status.length, 2);
    } finally {
        // Always remove the fixture, even when an assertion above fails.
        hdfsRoot.cleanupRoot();
    }
}

From source file:gobblin.hive.policy.HiveSnapshotRegistrationPolicy.java

License:Apache License

/**
 * Get the latest snapshot in the given {@link Path}.
 *
 * <p>/*from   www  .ja  v a2  s.  c  o m*/
 *   The lastest snapshot is a sub-directory of the input {@link Path} that has the largest folder
 *   name alphabetically. If property {@link #SNAPSHOT_PATH_PATTERN} is set, only those sub-directories
 *   whose full path matches the given pattern are considered.
 * </p>
 */
protected Path getLatestSnapshot(Path path) throws IOException {
    FileStatus statuses[] = this.fs.listStatus(path, new PathFilter() {

        @Override
        public boolean accept(Path p) {
            try {
                if (!HiveSnapshotRegistrationPolicy.this.fs.isDirectory(p)) {
                    return false;
                }
            } catch (IOException e) {
                throw Throwables.propagate(e);
            }

            return !HiveSnapshotRegistrationPolicy.this.snapshotPathPattern.isPresent()
                    || HiveSnapshotRegistrationPolicy.this.snapshotPathPattern.get().matcher(p.toString())
                            .matches();
        }
    });

    if (statuses.length == 0) {
        return null;
    }

    Arrays.sort(statuses, new Comparator<FileStatus>() {

        @Override
        public int compare(FileStatus o1, FileStatus o2) {
            return o2.getPath().getName().compareTo(o1.getPath().getName());
        }

    });

    return statuses[0].getPath();
}

From source file:gobblin.runtime.FsDatasetStateStore.java

License:Apache License

/**
 * Get a {@link Map} from dataset URNs to the latest {@link JobState.DatasetState}s.
 *
 * <p>The state-store directory of the job is listed for files whose names end with
 * {@code CURRENT_DATASET_STATE_FILE_SUFFIX + DATASET_STATE_STORE_TABLE_SUFFIX}; each such file is
 * read by a separate task run through an {@link IteratorExecutor}, and the first dataset state of
 * each file is recorded under its dataset URN. Failed reads are logged, not rethrown.
 *
 * <p>If the calling thread is interrupted while waiting for the reads, its interrupt status is
 * restored before the {@link IOException} is thrown.
 *
 * @param jobName the job name
 * @return a {@link Map} from dataset URNs to the latest {@link JobState.DatasetState}s; empty if
 *         the job has no state-store directory or no matching state files
 * @throws IOException if there's something wrong reading the {@link JobState.DatasetState}s
 */
public Map<String, JobState.DatasetState> getLatestDatasetStatesByUrns(final String jobName)
        throws IOException {
    Path stateStorePath = new Path(this.storeRootDir, jobName);
    if (!this.fs.exists(stateStorePath)) {
        return ImmutableMap.of();
    }

    FileStatus[] stateStoreFileStatuses = this.fs.listStatus(stateStorePath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName()
                    .endsWith(CURRENT_DATASET_STATE_FILE_SUFFIX + DATASET_STATE_STORE_TABLE_SUFFIX);
        }
    });

    if (stateStoreFileStatuses == null || stateStoreFileStatuses.length == 0) {
        return ImmutableMap.of();
    }

    // Written to by the reader tasks below, hence a concurrent map.
    final Map<String, JobState.DatasetState> datasetStatesByUrns = new ConcurrentHashMap<>();

    // Lazily turn each state file into a task that loads it and records its first dataset state.
    Iterator<Callable<Void>> callableIterator = Iterators.transform(
            Arrays.asList(stateStoreFileStatuses).iterator(), new Function<FileStatus, Callable<Void>>() {
                @Override
                public Callable<Void> apply(final FileStatus stateStoreFileStatus) {
                    return new Callable<Void>() {
                        @Override
                        public Void call() throws Exception {
                            Path stateStoreFilePath = stateStoreFileStatus.getPath();
                            LOGGER.info("Getting dataset states from: {}", stateStoreFilePath);
                            List<JobState.DatasetState> previousDatasetStates = getAll(jobName,
                                    stateStoreFilePath.getName());
                            if (!previousDatasetStates.isEmpty()) {
                                // There should be a single dataset state on the list if the list is not empty
                                JobState.DatasetState previousDatasetState = previousDatasetStates.get(0);
                                datasetStatesByUrns.put(previousDatasetState.getDatasetUrn(),
                                        previousDatasetState);
                            }
                            return null;
                        }
                    };
                }
            });

    try {
        List<Either<Void, ExecutionException>> results = new IteratorExecutor<>(callableIterator,
                this.threadPoolOfGettingDatasetState, ExecutorsUtils.newDaemonThreadFactory(Optional.of(LOGGER),
                        Optional.of("GetFsDatasetStateStore-"))).executeAndGetResults();
        int maxNumberOfErrorLogs = 10;
        IteratorExecutor.logFailures(results, LOGGER, maxNumberOfErrorLogs);
    } catch (InterruptedException e) {
        // Restore the interrupt status so callers further up the stack can still observe it.
        Thread.currentThread().interrupt();
        throw new IOException("Failed to get latest dataset states.", e);
    }

    // The dataset (job) state from the deprecated "current.jst" will be read even though
    // the job has transitioned to the new dataset-based mechanism
    if (datasetStatesByUrns.size() > 1) {
        datasetStatesByUrns.remove(ConfigurationKeys.DEFAULT_DATASET_URN);
    }

    return datasetStatesByUrns;
}

From source file:gobblin.runtime.local.LocalJobManager.java

License:Open Source License

/**
 * Rebuilds {@code lastJobIdMap} from the task state store.
 *
 * <p>For every entry directly under the state-store root directory, the task state files in it
 * (names ending with {@code TASK_STATE_STORE_TABLE_SUFFIX} and not starting with "current") are
 * ordered by the numeric timestamp that follows the last underscore of the part of the file name
 * before the first dot. That leading part of the newest file's name is stored as the last job ID,
 * keyed by the entry's name.
 *
 * @throws IOException if the state-store file system cannot be accessed
 */
private void restoreLastJobIdMap() throws IOException {
    URI stateStoreUri = URI.create(this.properties.getProperty(ConfigurationKeys.STATE_STORE_FS_URI_KEY));
    FileSystem fs = FileSystem.get(stateStoreUri, new Configuration());

    // Root directory of the task state store
    String rootDirName = this.properties.getProperty(ConfigurationKeys.STATE_STORE_ROOT_DIR_KEY);
    Path taskStateStoreRootDir = new Path(rootDirName);
    if (!fs.exists(taskStateStoreRootDir)) {
        return;
    }

    // One subdirectory per job is expected under the root directory
    FileStatus[] jobDirStatuses = fs.listStatus(taskStateStoreRootDir);
    if (jobDirStatuses == null || jobDirStatuses.length == 0) {
        return;
    }

    LOG.info("Restoring the mapping between jobs and IDs of their last runs");

    // Keeps task state files, skipping the ones whose names start with "current"
    final PathFilter taskStateFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            if (path.getName().startsWith("current")) {
                return false;
            }
            return path.getName().endsWith(TASK_STATE_STORE_TABLE_SUFFIX);
        }
    };

    // Orders task state files by embedded timestamp, newest first
    final Comparator<FileStatus> newestFirst = new Comparator<FileStatus>() {
        @Override
        public int compare(FileStatus left, FileStatus right) {
            String leftId = idOf(left);
            String rightId = idOf(right);

            long leftTimestamp = timestampOf(leftId);
            long rightTimestamp = timestampOf(rightId);

            return Long.compare(rightTimestamp, leftTimestamp);
        }

        // The ID is the part of the file name before the first dot.
        private String idOf(FileStatus fileStatus) {
            String name = fileStatus.getPath().getName();
            return name.substring(0, name.indexOf('.'));
        }

        // The timestamp is whatever follows the last underscore of the ID.
        private long timestampOf(String id) {
            return Long.parseLong(id.substring(id.lastIndexOf('_') + 1));
        }
    };

    for (FileStatus jobDirStatus : jobDirStatuses) {
        FileStatus[] taskStateFiles = fs.listStatus(jobDirStatus.getPath(), taskStateFileFilter);
        if (taskStateFiles == null || taskStateFiles.length == 0) {
            continue;
        }

        Arrays.sort(taskStateFiles, newestFirst);

        // The directory name is the job name; the first file after sorting is the latest run.
        String jobName = jobDirStatus.getPath().getName();
        String newestFileName = taskStateFiles[0].getPath().getName();
        int firstDot = newestFileName.indexOf('.');
        String lastJobId = newestFileName.substring(0, firstDot);
        LOG.info(String.format("Restored last job ID %s for job %s", lastJobId, jobName));
        this.lastJobIdMap.put(jobName, lastJobId);
    }
}

From source file:gobblin.source.DatePartitionedAvroFileSource.java

License:Apache License

/**
 * This method is to filter out the .avro files that need to be processed.
 * @return the pathFilter/*from   w  w w. j  a v  a 2  s . c o m*/
 */
private static PathFilter getFileFilter() {
    return new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(AVRO_SUFFIX);
        }
    };
}

From source file:gobblin.source.DatePartitionedDailyAvroSource.java

License:Open Source License

/**
 * Builds the filter that selects the files to be processed: a path is accepted when its
 * name ends with {@code AVRO_SUFFIX}.
 *
 * @return a new {@link PathFilter} matching on the file-name suffix
 */
private PathFilter getFileFilter() {
    final PathFilter avroSuffixFilter = new PathFilter() {
        @Override
        public boolean accept(Path candidate) {
            final String name = candidate.getName();
            return name.endsWith(AVRO_SUFFIX);
        }
    };
    return avroSuffixFilter;
}