Example usage for org.apache.hadoop.fs PathFilter PathFilter

Introduction

This page collects example usages of org.apache.hadoop.fs.PathFilter. In the snippets below it is instantiated as an anonymous class and passed to FileSystem methods such as listStatus to restrict which paths a directory listing returns.

Prototype

public interface PathFilter {
    boolean accept(Path path);
}
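
A minimal, self-contained sketch of the pattern every example below follows: pass an anonymous PathFilter to FileSystem.listStatus so that only matching children are returned. The /data directory and the .avro extension are placeholders, not taken from any of the projects listed here.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterExample {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        // listStatus calls accept(...) on each child path and keeps only the matches.
        FileStatus[] statuses = fs.listStatus(new Path("/data"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(".avro"); // placeholder extension
            }
        });
        for (FileStatus status : statuses) {
            System.out.println(status.getPath());
        }
    }
}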

Usage

From source file:org.apache.falcon.entity.EntityUtil.java

License:Apache License

/**
 * Gets the latest staging path for an entity on a cluster, based on the directory name
 * (which contains a timestamp).
 * @param cluster the cluster to look up
 * @param entity the entity whose staging path is being resolved
 * @return the latest staging path
 * @throws FalconException
 */
public static Path getLatestStagingPath(org.apache.falcon.entity.v0.cluster.Cluster cluster,
        final Entity entity) throws FalconException {
    Path basePath = getBaseStagingPath(cluster, entity);
    FileSystem fs = HadoopClientFactory.get().createProxiedFileSystem(ClusterHelper.getConfiguration(cluster));
    try {
        final String md5 = md5(getClusterView(entity, cluster.getName()));
        FileStatus[] files = fs.listStatus(basePath, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().startsWith(md5);
            }
        });
        if (files != null && files.length != 0) {
            // Find the latest directory using the timestamp used in the dir name
            // These files will vary only in ts suffix (as we have filtered out using a common md5 hash),
            // hence, sorting will be on timestamp.
            // FileStatus compares on Path and hence the latest will be at the end after sorting.
            Arrays.sort(files);
            return files[files.length - 1].getPath();
        }
        throw new FalconException("No staging directories found for entity " + entity.getName() + " on cluster "
                + cluster.getName());
    } catch (Exception e) {
        throw new FalconException("Unable to get listing for " + basePath.toString(), e);
    }
}

From source file:org.apache.falcon.extensions.store.ExtensionStore.java

License:Apache License

public String registerExtension(final String extensionName, final String path, final String description,
        String extensionOwner) throws URISyntaxException, FalconException {
    if (!metaStore.checkIfExtensionExists(extensionName)) {
        URI uri = new URI(path);
        assertURI("Scheme", uri.getScheme());
        assertURI("Authority", uri.getAuthority());
        assertURI("Path", uri.getPath());
        FileSystem fileSystem = getHdfsFileSystem(path);
        try {
            fileSystem.listStatus(new Path(uri.getPath() + "/README"));
        } catch (IOException e) {
            LOG.error("Exception in registering Extension:{}", extensionName, e);
            throw new ValidationException("README file is not present in the " + path);
        }
        PathFilter filter = new PathFilter() {
            public boolean accept(Path file) {
                return file.getName().endsWith(".jar");
            }
        };
        FileStatus[] jarStatus;
        try {
            jarStatus = fileSystem.listStatus(new Path(uri.getPath(), "libs/build"), filter);
            if (jarStatus.length <= 0) {
                throw new ValidationException("Jars are not present in the " + uri.getPath() + "/libs/build.");
            }
        } catch (IOException e) {
            LOG.error("Exception in registering Extension:{}", extensionName, e);
            throw new ValidationException("Jars are not present in the " + uri.getPath() + "/libs/build.");
        }

        FileStatus[] propStatus;
        try {
            propStatus = fileSystem.listStatus(new Path(uri.getPath(), "META"));
            if (propStatus.length <= 0) {
                throw new ValidationException(
                        "No properties file is present in the " + uri.getPath() + "/META" + " structure.");
            }
        } catch (IOException e) {
            LOG.error("Exception in registering Extension:{}", extensionName, e);
            throw new ValidationException(
                    "Directory is not present in the " + uri.getPath() + "/META" + " structure.");
        }
        metaStore.storeExtensionBean(extensionName, path, ExtensionType.CUSTOM, description, extensionOwner);
    } else {
        throw new ValidationException(extensionName + " already exists.");
    }
    LOG.info("Extension :" + extensionName + " registered successfully.");
    return "Extension :" + extensionName + " registered successfully.";
}

From source file:org.apache.falcon.oozie.process.ProcessExecutionWorkflowBuilder.java

License:Apache License

protected void addArchiveForCustomJars(Cluster cluster, List<String> archiveList, String lib)
        throws FalconException {
    if (StringUtils.isBlank(lib)) {
        return;
    }

    String[] libPaths = lib.split(EntityUtil.WF_LIB_SEPARATOR);
    for (String path : libPaths) {
        Path libPath = new Path(path);
        try {
            final FileSystem fs = HadoopClientFactory.get()
                    .createProxiedFileSystem(ClusterHelper.getConfiguration(cluster));
            if (fs.isFile(libPath)) { // File, not a Dir
                archiveList.add(libPath.toString());
                continue; // move on to the next lib path instead of abandoning the loop
            }

            // lib path is a directory, add each file under the lib dir to archive
            final FileStatus[] fileStatuses = fs.listStatus(libPath, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    try {
                        return fs.isFile(path) && path.getName().endsWith(".jar");
                    } catch (IOException ignore) {
                        return false;
                    }
                }
            });

            for (FileStatus fileStatus : fileStatuses) {
                archiveList.add(fileStatus.getPath().toString());
            }
        } catch (IOException e) {
            throw new FalconException("Error adding archive for custom jars under: " + libPath, e);
        }
    }
}

From source file:org.apache.gobblin.compaction.dataset.DatasetHelper.java

License:Apache License

public static List<Path> getApplicableFilePaths(FileSystem fs, Path dataDir,
        final Collection<String> extensions) throws IOException {
    if (!fs.exists(dataDir)) {
        return Lists.newArrayList();
    }
    List<Path> paths = Lists.newArrayList();
    for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, dataDir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            for (String validExtension : extensions) {
                if (path.getName().endsWith(validExtension)) {
                    return true;
                }
            }
            return false;
        }
    })) {
        paths.add(fileStatus.getPath());
    }
    return paths;
}

From source file:org.apache.gobblin.compaction.mapreduce.CompactorOutputCommitter.java

License:Apache License

/**
 * Commits the task, moving files to their final committed location by delegating to
 * {@link FileOutputCommitter} to perform the actual moving. First, renames the
 * files to include the count of records contained within the file and a timestamp,
 * in the form {recordCount}.{timestamp}.avro. Then, the files are moved to their
 * committed location.
 */
@Override
public void commitTask(TaskAttemptContext context) throws IOException {
    Path workPath = getWorkPath();
    FileSystem fs = workPath.getFileSystem(context.getConfiguration());

    if (fs.exists(workPath)) {
        long recordCount = getRecordCountFromCounter(context,
                RecordKeyDedupReducerBase.EVENT_COUNTER.RECORD_COUNT);
        String fileNamePrefix;
        if (recordCount == 0) {

            // recordCount == 0 indicates that it is a map-only, non-dedup job, and thus record count should
            // be obtained from mapper counter.
            fileNamePrefix = CompactionRecordCountProvider.M_OUTPUT_FILE_PREFIX;
            recordCount = getRecordCountFromCounter(context, RecordKeyMapperBase.EVENT_COUNTER.RECORD_COUNT);
        } else {
            fileNamePrefix = CompactionRecordCountProvider.MR_OUTPUT_FILE_PREFIX;
        }
        String fileName = CompactionRecordCountProvider.constructFileName(fileNamePrefix,
                "." + compactionFileExtension, recordCount);

        for (FileStatus status : fs.listStatus(workPath, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return FilenameUtils.isExtension(path.getName(), compactionFileExtension);
            }
        })) {
            Path newPath = new Path(status.getPath().getParent(), fileName);
            LOG.info(String.format("Renaming %s to %s", status.getPath(), newPath));
            fs.rename(status.getPath(), newPath);
        }
    }

    super.commitTask(context);
}

From source file:org.apache.gobblin.compaction.mapreduce.MRCompactorJobRunner.java

License:Apache License

/**
 * Get the list of file {@link Path}s in the given dataDir, which satisfy the extension requirements
 *  of {@link #getApplicableFileExtensions()}.
 */
private List<Path> getApplicableFilePaths(final Path dataDir, final FileSystem fs) throws IOException {
    try {
        return applicablePathCache.get(dataDir, new Callable<List<Path>>() {

            @Override
            public List<Path> call() throws Exception {
                if (!MRCompactorJobRunner.this.fs.exists(dataDir)) {
                    return Lists.newArrayList();
                }
                List<Path> paths = Lists.newArrayList();
                for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, dataDir, new PathFilter() {
                    @Override
                    public boolean accept(Path path) {
                        for (String validExtension : getApplicableFileExtensions()) {
                            if (path.getName().endsWith(validExtension)) {
                                return true;
                            }
                        }
                        return false;
                    }
                })) {
                    paths.add(fileStatus.getPath());
                }
                return paths;
            }
        });
    } catch (ExecutionException e) {
        throw new IOException(e);
    }
}

From source file:org.apache.gobblin.compaction.mapreduce.OrcCompactionTaskTest.java

License:Apache License

@Test
public void basicTest() throws Exception {
    File basePath = Files.createTempDir();
    basePath.deleteOnExit();

    String minutelyPath = "Identity/MemberAccount/minutely/2017/04/03/10/20_30/run_2017-04-03-10-20";
    String hourlyPath = "Identity/MemberAccount/hourly/2017/04/03/10/";
    File jobDir = new File(basePath, minutelyPath);
    Assert.assertTrue(jobDir.mkdirs());

    // Write some ORC file for compaction here.
    TypeDescription schema = TypeDescription.fromString("struct<i:int,j:int>");
    OrcStruct orcStruct_0 = (OrcStruct) OrcStruct.createValue(schema);
    orcStruct_0.setFieldValue("i", new IntWritable(1));
    orcStruct_0.setFieldValue("j", new IntWritable(2));

    OrcStruct orcStruct_1 = (OrcStruct) OrcStruct.createValue(schema);
    orcStruct_1.setFieldValue("i", new IntWritable(1));
    orcStruct_1.setFieldValue("j", new IntWritable(2));

    OrcStruct orcStruct_2 = (OrcStruct) OrcStruct.createValue(schema);
    orcStruct_2.setFieldValue("i", new IntWritable(2));
    orcStruct_2.setFieldValue("j", new IntWritable(3));

    OrcStruct orcStruct_3 = (OrcStruct) OrcStruct.createValue(schema);
    orcStruct_3.setFieldValue("i", new IntWritable(4));
    orcStruct_3.setFieldValue("j", new IntWritable(5));

    File file_0 = new File(jobDir, "file_0");
    File file_1 = new File(jobDir, "file_1");
    writeOrcRecordsInFile(new Path(file_0.getAbsolutePath()), schema,
            ImmutableList.of(orcStruct_0, orcStruct_2));
    writeOrcRecordsInFile(new Path(file_1.getAbsolutePath()), schema,
            ImmutableList.of(orcStruct_1, orcStruct_3));

    // Verify execution

    // Overwrite the job configurator factory key.
    String extensionFileName = "orcavro";
    EmbeddedGobblin embeddedGobblin = createEmbeddedGobblin("basic", basePath.getAbsolutePath())
            .setConfiguration(CompactionJobConfigurator.COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS_KEY,
                    TestCompactionOrcJobConfigurator.Factory.class.getName())
            .setConfiguration(COMPACTION_OUTPUT_EXTENSION, extensionFileName);
    JobExecutionResult execution = embeddedGobblin.run();
    Assert.assertTrue(execution.isSuccessful());

    // Result verification
    File outputDir = new File(basePath, hourlyPath);
    FileSystem fs = FileSystem.getLocal(new Configuration());
    List<FileStatus> statuses = new ArrayList<>();
    for (FileStatus status : fs.listStatus(new Path(outputDir.getAbsolutePath()), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return FilenameUtils.isExtension(path.getName(), extensionFileName);
        }
    })) {
        statuses.add(status);
    }

    Assert.assertEquals(statuses.size(), 1);
    List<OrcStruct> result = readOrcFile(statuses.get(0).getPath());
    Assert.assertEquals(result.size(), 3);
    Assert.assertEquals(result.get(0).getFieldValue("i"), new IntWritable(1));
    Assert.assertEquals(result.get(0).getFieldValue("j"), new IntWritable(2));
    Assert.assertEquals(result.get(1).getFieldValue("i"), new IntWritable(2));
    Assert.assertEquals(result.get(1).getFieldValue("j"), new IntWritable(3));
    Assert.assertEquals(result.get(2).getFieldValue("i"), new IntWritable(4));
    Assert.assertEquals(result.get(2).getFieldValue("j"), new IntWritable(5));
}

From source file:org.apache.gobblin.hive.orc.HiveOrcSerDeManager.java

License:Apache License

/**
 * Get the schema as a TypeInfo object
 * @param path path that contains the ORC files
 * @param fs {@link FileSystem}
 * @return {@link TypeInfo} with the schema information
 * @throws IOException
 */
public TypeInfo getSchemaFromLatestFile(Path path, FileSystem fs) throws IOException {
    if (fs.isDirectory(path)) {
        List<FileStatus> files = Arrays.asList(fs.listStatus(path, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                try {
                    return ignoredFilePrefixes.stream().noneMatch(e -> path.getName().startsWith(e))
                            && fileExtensions.stream().anyMatch(e -> path.getName().endsWith(e))
                            && isORC(path, fs);
                } catch (IOException e) {
                    log.error("Error checking file for schema retrieval", e);
                    return false;
                }
            }
        }));

        if (files.size() > 0) {
            Collections.sort(files, FileListUtils.LATEST_MOD_TIME_ORDER);
        } else {
            throw new FileNotFoundException("No files in Dataset:" + path + " found for schema retrieval");
        }
        return getSchemaFromLatestFile(files.get(0).getPath(), fs);
    } else {
        return TypeInfoUtils
                .getTypeInfoFromObjectInspector(OrcFile.createReader(fs, path).getObjectInspector());
    }
}

From source file:org.apache.gobblin.runtime.FsDatasetStateStore.java

License:Apache License

/**
 * Get a {@link Map} from dataset URNs to the latest {@link JobState.DatasetState}s.
 *
 * @param jobName the job name
 * @return a {@link Map} from dataset URNs to the latest {@link JobState.DatasetState}s
 * @throws IOException if there's something wrong reading the {@link JobState.DatasetState}s
 */
public Map<String, JobState.DatasetState> getLatestDatasetStatesByUrns(final String jobName)
        throws IOException {
    Path stateStorePath = new Path(this.storeRootDir, jobName);
    if (!this.fs.exists(stateStorePath)) {
        return ImmutableMap.of();
    }

    FileStatus[] stateStoreFileStatuses = this.fs.listStatus(stateStorePath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName()
                    .endsWith(CURRENT_DATASET_STATE_FILE_SUFFIX + DATASET_STATE_STORE_TABLE_SUFFIX);
        }
    });

    if (stateStoreFileStatuses == null || stateStoreFileStatuses.length == 0) {
        return ImmutableMap.of();
    }

    final Map<String, JobState.DatasetState> datasetStatesByUrns = new ConcurrentHashMap<>();

    Iterator<Callable<Void>> callableIterator = Iterators.transform(
            Arrays.asList(stateStoreFileStatuses).iterator(), new Function<FileStatus, Callable<Void>>() {
                @Override
                public Callable<Void> apply(final FileStatus stateStoreFileStatus) {
                    return new Callable<Void>() {
                        @Override
                        public Void call() throws Exception {
                            Path stateStoreFilePath = stateStoreFileStatus.getPath();
                            LOGGER.info("Getting dataset states from: {}", stateStoreFilePath);
                            List<JobState.DatasetState> previousDatasetStates = getAll(jobName,
                                    stateStoreFilePath.getName());
                            if (!previousDatasetStates.isEmpty()) {
                                // There should be a single dataset state on the list if the list is not empty
                                JobState.DatasetState previousDatasetState = previousDatasetStates.get(0);
                                datasetStatesByUrns.put(previousDatasetState.getDatasetUrn(),
                                        previousDatasetState);
                            }
                            return null;
                        }
                    };
                }
            });

    try {
        List<Either<Void, ExecutionException>> results = new IteratorExecutor<>(callableIterator,
                this.threadPoolOfGettingDatasetState, ExecutorsUtils.newDaemonThreadFactory(Optional.of(LOGGER),
                        Optional.of("GetFsDatasetStateStore-"))).executeAndGetResults();
        int maxNumberOfErrorLogs = 10;
        IteratorExecutor.logAndThrowFailures(results, LOGGER, maxNumberOfErrorLogs);
    } catch (InterruptedException e) {
        throw new IOException("Failed to get latest dataset states.", e);
    }

    // The dataset (job) state from the deprecated "current.jst" will be read even though
    // the job has transitioned to the new dataset-based mechanism
    if (datasetStatesByUrns.size() > 1) {
        datasetStatesByUrns.remove(ConfigurationKeys.DEFAULT_DATASET_URN);
    }

    return datasetStatesByUrns;
}

From source file:org.apache.gobblin.source.DatePartitionedNestedRetriever.java

License:Apache License

/**
 * Builds a filter that keeps only files with the expected extension, so files that
 * do not need to be processed are skipped.
 * @return the pathFilter
 */
private PathFilter getFileFilter() {
    final String extension = (this.expectedExtension.startsWith(".")) ? this.expectedExtension
            : "." + this.expectedExtension;

    return new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(extension)
                    && !(schemaInSourceDir && path.getName().equals(schemaFile));
        }
    };
}
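
Since PathFilter declares a single abstract method, every anonymous class above can also be written as a lambda on Java 8+. A minimal sketch, assuming a configured FileSystem; the /data directory and the .jar extension are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class LambdaFilterExample {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // PathFilter has a single abstract method, so a lambda replaces the anonymous class.
        FileStatus[] statuses = fs.listStatus(new Path("/data"), // placeholder directory
                path -> path.getName().endsWith(".jar"));
        for (FileStatus status : statuses) {
            System.out.println(status.getPath());
        }
    }
}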