Example usage for org.apache.hadoop.fs.PathFilter

List of usage examples for org.apache.hadoop.fs PathFilter PathFilter

Introduction

On this page you can find example usages of org.apache.hadoop.fs.PathFilter.

Prototype

PathFilter

Source Link

Usage

From source file:gobblin.compaction.mapreduce.avro.AvroKeyCompactorOutputCommitter.java

License:Apache License

/**
 * Commits the task, moving files to their final committed location by delegating to
 * {@link FileOutputCommitter} to perform the actual moving. First, renames the
 * files to include the count of records contained within the file and a timestamp,
 * in the form {recordCount}.{timestamp}.avro. Then, the files are moved to their
 * committed location./*from w  w  w.ja  v  a2s.  com*/
 */
@Override
public void commitTask(TaskAttemptContext context) throws IOException {
    Path workPath = getWorkPath();
    FileSystem fs = workPath.getFileSystem(context.getConfiguration());

    if (fs.exists(workPath)) {
        long recordCount = getRecordCountFromCounter(context, AvroKeyDedupReducer.EVENT_COUNTER.RECORD_COUNT);
        String fileNamePrefix;
        if (recordCount == 0) {

            // recordCount == 0 indicates that it is a map-only, non-dedup job, and thus record count should
            // be obtained from mapper counter.
            fileNamePrefix = CompactionRecordCountProvider.M_OUTPUT_FILE_PREFIX;
            recordCount = getRecordCountFromCounter(context, AvroKeyMapper.EVENT_COUNTER.RECORD_COUNT);
        } else {
            fileNamePrefix = CompactionRecordCountProvider.MR_OUTPUT_FILE_PREFIX;
        }
        String fileName = CompactionRecordCountProvider.constructFileName(fileNamePrefix, recordCount);

        for (FileStatus status : fs.listStatus(workPath, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return FilenameUtils.isExtension(path.getName(), "avro");
            }
        })) {
            Path newPath = new Path(status.getPath().getParent(), fileName);
            LOG.info(String.format("Renaming %s to %s", status.getPath(), newPath));
            fs.rename(status.getPath(), newPath);
        }
    }

    super.commitTask(context);
}

From source file:gobblin.compaction.mapreduce.MRCompactorJobRunner.java

License:Apache License

/**
 * Returns the file {@link Path}s under the given dataDir whose names end with one of the
 * extensions returned by {@link #getApplicableFileExtensions()}.
 *
 * <p>The result is obtained through {@code applicablePathCache.get(dataDir, loader)}; the loader
 * lists files with {@code FileListUtils.listFilesRecursively}.
 *
 * @param dataDir the directory to look in
 * @return the matching paths, or an empty list if {@code dataDir} does not exist
 * @throws IOException wrapping any {@link ExecutionException} thrown by the cache lookup
 */
private List<Path> getApplicableFilePaths(final Path dataDir) throws IOException {
    final Callable<List<Path>> loader = new Callable<List<Path>>() {

        @Override
        public List<Path> call() throws Exception {
            final List<Path> matches = Lists.newArrayList();
            if (MRCompactorJobRunner.this.fs.exists(dataDir)) {
                // Accepts a path as soon as its name ends with any applicable extension.
                final PathFilter extensionFilter = new PathFilter() {
                    @Override
                    public boolean accept(Path candidate) {
                        final String name = candidate.getName();
                        for (String extension : getApplicableFileExtensions()) {
                            if (name.endsWith(extension)) {
                                return true;
                            }
                        }
                        return false;
                    }
                };
                for (FileStatus found : FileListUtils.listFilesRecursively(MRCompactorJobRunner.this.fs,
                        dataDir, extensionFilter)) {
                    matches.add(found.getPath());
                }
            }
            return matches;
        }
    };

    try {
        return applicablePathCache.get(dataDir, loader);
    } catch (ExecutionException e) {
        throw new IOException(e);
    }
}

From source file:gobblin.filesystem.MetricsFileSystemInstrumentationTest.java

License:Apache License

/**
 * Checks that a filtered {@code listStatus} call on a single directory is counted once by
 * {@code listStatusTimer} and returns the two entries whose path ends with {@code .ext}.
 */
@Test(enabled = false)
public void testListStatusPathWithFilter() throws IOException, URISyntaxException {
    HDFSRoot hdfsRoot = new HDFSRoot("/tmp/ListStatusPathWithFilter");
    try {
        MetricsFileSystemInstrumentation fs = (MetricsFileSystemInstrumentation) FileSystem
                .get(new URI(instrumentedURI), new Configuration());
        FileStatus[] status = fs.listStatus(hdfsRoot.getDirPath3(), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().endsWith(".ext");
            }
        });
        Assert.assertEquals(fs.listStatusTimer.getCount(), 1);
        Assert.assertEquals(status.length, 2);
    } finally {
        // Clean up even when an assertion or file system call fails, so the test root is not left behind.
        hdfsRoot.cleanupRoot();
    }
}

From source file:gobblin.filesystem.MetricsFileSystemInstrumentationTest.java

License:Apache License

/**
 * Checks that a filtered {@code listStatus} call over three paths is counted three times by
 * {@code listStatusTimer} and returns the two entries whose path ends with {@code .ext}.
 */
@Test(enabled = false)
public void testListStatusPathsWithFilter() throws IOException, URISyntaxException {
    HDFSRoot hdfsRoot = new HDFSRoot("/tmp/ListStatusPathsWithFilter");
    try {
        MetricsFileSystemInstrumentation fs = (MetricsFileSystemInstrumentation) FileSystem
                .get(new URI(instrumentedURI), new Configuration());

        Path[] paths = { hdfsRoot.filePath2, hdfsRoot.dirPath2, hdfsRoot.dirPath3 };
        FileStatus[] status = fs.listStatus(paths, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().endsWith(".ext");
            }
        });

        Assert.assertEquals(fs.listStatusTimer.getCount(), 3);
        Assert.assertEquals(status.length, 2);
    } finally {
        // Clean up even when an assertion or file system call fails, so the test root is not left behind.
        hdfsRoot.cleanupRoot();
    }
}

From source file:gobblin.filesystem.MetricsFileSystemInstrumentationTest.java

License:Apache License

/**
 * Checks that a filtered {@code globStatus} call is counted once by {@code globStatusTimer}
 * and returns the two entries whose path ends with {@code .ext}.
 */
@Test(enabled = false)
public void testGlobStatusWithFilter() throws IOException, URISyntaxException {
    HDFSRoot hdfsRoot = new HDFSRoot("/tmp/GlobStatusWithFilter");
    try {
        MetricsFileSystemInstrumentation fs = (MetricsFileSystemInstrumentation) FileSystem
                .get(new URI(instrumentedURI), new Configuration());

        FileStatus[] status = fs.globStatus(new Path("/tmp/GlobStatusWithFilter/*/*"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().endsWith(".ext");
            }
        });
        Assert.assertEquals(fs.globStatusTimer.getCount(), 1);
        Assert.assertEquals(status.length, 2);
    } finally {
        // Clean up even when an assertion or file system call fails, so the test root is not left behind.
        hdfsRoot.cleanupRoot();
    }
}

From source file:gobblin.hive.policy.HiveSnapshotRegistrationPolicy.java

License:Apache License

/**
 * Get the latest snapshot in the given {@link Path}.
 *
 * <p>/*from   www  .ja  v a2  s.  c  o m*/
 *   The lastest snapshot is a sub-directory of the input {@link Path} that has the largest folder
 *   name alphabetically. If property {@link #SNAPSHOT_PATH_PATTERN} is set, only those sub-directories
 *   whose full path matches the given pattern are considered.
 * </p>
 */
protected Path getLatestSnapshot(Path path) throws IOException {
    FileStatus statuses[] = this.fs.listStatus(path, new PathFilter() {

        @Override
        public boolean accept(Path p) {
            try {
                if (!HiveSnapshotRegistrationPolicy.this.fs.isDirectory(p)) {
                    return false;
                }
            } catch (IOException e) {
                throw Throwables.propagate(e);
            }

            return !HiveSnapshotRegistrationPolicy.this.snapshotPathPattern.isPresent()
                    || HiveSnapshotRegistrationPolicy.this.snapshotPathPattern.get().matcher(p.toString())
                            .matches();
        }
    });

    if (statuses.length == 0) {
        return null;
    }

    Arrays.sort(statuses, new Comparator<FileStatus>() {

        @Override
        public int compare(FileStatus o1, FileStatus o2) {
            return o2.getPath().getName().compareTo(o1.getPath().getName());
        }

    });

    return statuses[0].getPath();
}

From source file:gobblin.runtime.FsDatasetStateStore.java

License:Apache License

/**
 * Get a {@link Map} from dataset URNs to the latest {@link JobState.DatasetState}s.
 *
 * <p>State files under {@code storeRootDir/jobName} whose names end with
 * {@code CURRENT_DATASET_STATE_FILE_SUFFIX + DATASET_STATE_STORE_TABLE_SUFFIX} are read through an
 * {@link IteratorExecutor}; failures of individual reads are logged rather than thrown.
 *
 * @param jobName the job name
 * @return a {@link Map} from dataset URNs to the latest {@link JobState.DatasetState}s; empty if
 *         the job has no state store directory or no matching state files
 * @throws IOException if there's something wrong reading the {@link JobState.DatasetState}s, or if
 *         the calling thread is interrupted while waiting for the reads to finish
 */
public Map<String, JobState.DatasetState> getLatestDatasetStatesByUrns(final String jobName)
        throws IOException {
    Path stateStorePath = new Path(this.storeRootDir, jobName);
    if (!this.fs.exists(stateStorePath)) {
        return ImmutableMap.of();
    }

    FileStatus[] stateStoreFileStatuses = this.fs.listStatus(stateStorePath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName()
                    .endsWith(CURRENT_DATASET_STATE_FILE_SUFFIX + DATASET_STATE_STORE_TABLE_SUFFIX);
        }
    });

    if (stateStoreFileStatuses == null || stateStoreFileStatuses.length == 0) {
        return ImmutableMap.of();
    }

    // Concurrent map because the callables below populate it from the executor's threads.
    final Map<String, JobState.DatasetState> datasetStatesByUrns = new ConcurrentHashMap<>();

    // One callable per state file; each loads the file and records its dataset state by URN.
    Iterator<Callable<Void>> callableIterator = Iterators.transform(
            Arrays.asList(stateStoreFileStatuses).iterator(), new Function<FileStatus, Callable<Void>>() {
                @Override
                public Callable<Void> apply(final FileStatus stateStoreFileStatus) {
                    return new Callable<Void>() {
                        @Override
                        public Void call() throws Exception {
                            Path stateStoreFilePath = stateStoreFileStatus.getPath();
                            LOGGER.info("Getting dataset states from: {}", stateStoreFilePath);
                            List<JobState.DatasetState> previousDatasetStates = getAll(jobName,
                                    stateStoreFilePath.getName());
                            if (!previousDatasetStates.isEmpty()) {
                                // There should be a single dataset state on the list if the list is not empty
                                JobState.DatasetState previousDatasetState = previousDatasetStates.get(0);
                                datasetStatesByUrns.put(previousDatasetState.getDatasetUrn(),
                                        previousDatasetState);
                            }
                            return null;
                        }
                    };
                }
            });

    try {
        List<Either<Void, ExecutionException>> results = new IteratorExecutor<>(callableIterator,
                this.threadPoolOfGettingDatasetState, ExecutorsUtils.newDaemonThreadFactory(Optional.of(LOGGER),
                        Optional.of("GetFsDatasetStateStore-"))).executeAndGetResults();
        int maxNumberOfErrorLogs = 10;
        IteratorExecutor.logFailures(results, LOGGER, maxNumberOfErrorLogs);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so code further up the stack can still observe the interruption.
        Thread.currentThread().interrupt();
        throw new IOException("Failed to get latest dataset states.", e);
    }

    // The dataset (job) state from the deprecated "current.jst" will be read even though
    // the job has transitioned to the new dataset-based mechanism
    if (datasetStatesByUrns.size() > 1) {
        datasetStatesByUrns.remove(ConfigurationKeys.DEFAULT_DATASET_URN);
    }

    return datasetStatesByUrns;
}

From source file:gobblin.runtime.local.LocalJobManager.java

License:Open Source License

/**
 * Restore the lastJobIdMap.
 *
 * <p>For every entry under the task state store root directory, the task state files (names not
 * starting with "current" and ending with {@code TASK_STATE_STORE_TABLE_SUFFIX}) are ordered by the
 * numeric timestamp embedded in their names, newest first. The part of the newest file's name
 * before the first '.' is stored as the last job ID, keyed by the entry's name.
 *
 * @throws IOException if the state store file system cannot be accessed
 */
private void restoreLastJobIdMap() throws IOException {
    URI stateStoreUri = URI.create(this.properties.getProperty(ConfigurationKeys.STATE_STORE_FS_URI_KEY));
    FileSystem fs = FileSystem.get(stateStoreUri, new Configuration());

    // Root directory of task states store
    String rootDirProperty = this.properties.getProperty(ConfigurationKeys.STATE_STORE_ROOT_DIR_KEY);
    Path taskStateStoreRootDir = new Path(rootDirProperty);
    if (!fs.exists(taskStateStoreRootDir)) {
        return;
    }

    // One entry per job is expected directly under the root directory
    FileStatus[] rootStatuses = fs.listStatus(taskStateStoreRootDir);
    if (rootStatuses == null || rootStatuses.length == 0) {
        return;
    }

    // Keeps task state files only, skipping the "current" ones
    final PathFilter taskStateFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            if (name.startsWith("current")) {
                return false;
            }
            return name.endsWith(TASK_STATE_STORE_TABLE_SUFFIX);
        }
    };

    // Orders task state files by the timestamp in their names, latest first
    final Comparator<FileStatus> newestFirst = new Comparator<FileStatus>() {
        @Override
        public int compare(FileStatus left, FileStatus right) {
            String leftId = idOf(left);
            String rightId = idOf(right);

            Long leftTimestamp = timestampOf(leftId);
            Long rightTimestamp = timestampOf(rightId);

            return rightTimestamp.compareTo(leftTimestamp);
        }

        // The ID is the file name up to its first '.'
        private String idOf(FileStatus fileStatus) {
            String name = fileStatus.getPath().getName();
            return name.substring(0, name.indexOf('.'));
        }

        // The timestamp is whatever follows the last '_' of the ID
        private Long timestampOf(String id) {
            return Long.parseLong(id.substring(id.lastIndexOf('_') + 1));
        }
    };

    LOG.info("Restoring the mapping between jobs and IDs of their last runs");

    for (FileStatus jobDirStatus : rootStatuses) {
        FileStatus[] taskStateFiles = fs.listStatus(jobDirStatus.getPath(), taskStateFileFilter);
        if (taskStateFiles != null && taskStateFiles.length > 0) {
            Arrays.sort(taskStateFiles, newestFirst);

            // The directory name is the job name; the head of the sorted array is the latest run.
            String jobName = jobDirStatus.getPath().getName();
            String newestFileName = taskStateFiles[0].getPath().getName();
            String lastJobId = newestFileName.substring(0, newestFileName.indexOf('.'));
            LOG.info(String.format("Restored last job ID %s for job %s", lastJobId, jobName));
            this.lastJobIdMap.put(jobName, lastJobId);
        }
    }
}

From source file:gobblin.source.DatePartitionedAvroFileSource.java

License:Apache License

/**
 * This method is to filter out the .avro files that need to be processed.
 * @return the pathFilter/*from   w  w w. j  a v  a 2  s . c o m*/
 */
private static PathFilter getFileFilter() {
    return new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(AVRO_SUFFIX);
        }
    };
}

From source file:gobblin.source.DatePartitionedDailyAvroSource.java

License:Open Source License

/**
 * Builds the filter used to select the .avro files that need to be processed.
 *
 * @return a {@link PathFilter} accepting paths whose name ends with {@code AVRO_SUFFIX}
 */
private PathFilter getFileFilter() {
    final PathFilter avroFilesOnly = new PathFilter() {
        @Override
        public boolean accept(Path candidate) {
            final String name = candidate.getName();
            return name.endsWith(AVRO_SUFFIX);
        }
    };
    return avroFilesOnly;
}