List of usage examples for org.apache.hadoop.fs PathFilter PathFilter
PathFilter
From source file:org.apache.falcon.entity.EntityUtil.java
License:Apache License
/** * Gets the latest staging path for an entity on a cluster, based on the dir name(that contains timestamp). * @param cluster// w ww. j av a2 s .c om * @param entity * @return * @throws FalconException */ public static Path getLatestStagingPath(org.apache.falcon.entity.v0.cluster.Cluster cluster, final Entity entity) throws FalconException { Path basePath = getBaseStagingPath(cluster, entity); FileSystem fs = HadoopClientFactory.get().createProxiedFileSystem(ClusterHelper.getConfiguration(cluster)); try { final String md5 = md5(getClusterView(entity, cluster.getName())); FileStatus[] files = fs.listStatus(basePath, new PathFilter() { @Override public boolean accept(Path path) { return path.getName().startsWith(md5); } }); if (files != null && files.length != 0) { // Find the latest directory using the timestamp used in the dir name // These files will vary only in ts suffix (as we have filtered out using a common md5 hash), // hence, sorting will be on timestamp. // FileStatus compares on Path and hence the latest will be at the end after sorting. Arrays.sort(files); return files[files.length - 1].getPath(); } throw new FalconException("No staging directories found for entity " + entity.getName() + " on cluster " + cluster.getName()); } catch (Exception e) { throw new FalconException("Unable get listing for " + basePath.toString(), e); } }
From source file:org.apache.falcon.extensions.store.ExtensionStore.java
License:Apache License
public String registerExtension(final String extensionName, final String path, final String description, String extensionOwner) throws URISyntaxException, FalconException { if (!metaStore.checkIfExtensionExists(extensionName)) { URI uri = new URI(path); assertURI("Scheme", uri.getScheme()); assertURI("Authority", uri.getAuthority()); assertURI("Path", uri.getPath()); FileSystem fileSystem = getHdfsFileSystem(path); try {/*from w ww . ja v a 2 s .c o m*/ fileSystem.listStatus(new Path(uri.getPath() + "/README")); } catch (IOException e) { LOG.error("Exception in registering Extension:{}", extensionName, e); throw new ValidationException("README file is not present in the " + path); } PathFilter filter = new PathFilter() { public boolean accept(Path file) { return file.getName().endsWith(".jar"); } }; FileStatus[] jarStatus; try { jarStatus = fileSystem.listStatus(new Path(uri.getPath(), "libs/build"), filter); if (jarStatus.length <= 0) { throw new ValidationException("Jars are not present in the " + uri.getPath() + "/libs/build."); } } catch (IOException e) { LOG.error("Exception in registering Extension:{}", extensionName, e); throw new ValidationException("Jars are not present in the " + uri.getPath() + "/libs/build."); } FileStatus[] propStatus; try { propStatus = fileSystem.listStatus(new Path(uri.getPath(), "META")); if (propStatus.length <= 0) { throw new ValidationException( "No properties file is not present in the " + uri.getPath() + "/META" + " structure."); } } catch (IOException e) { LOG.error("Exception in registering Extension:{}", extensionName, e); throw new ValidationException( "Directory is not present in the " + uri.getPath() + "/META" + " structure."); } metaStore.storeExtensionBean(extensionName, path, ExtensionType.CUSTOM, description, extensionOwner); } else { throw new ValidationException(extensionName + " already exists."); } LOG.info("Extension :" + extensionName + " registered successfully."); return "Extension :" + extensionName + " 
registered successfully."; }
From source file:org.apache.falcon.oozie.process.ProcessExecutionWorkflowBuilder.java
License:Apache License
protected void addArchiveForCustomJars(Cluster cluster, List<String> archiveList, String lib) throws FalconException { if (StringUtils.isBlank(lib)) { return;/*w w w.j a v a2s. co m*/ } String[] libPaths = lib.split(EntityUtil.WF_LIB_SEPARATOR); for (String path : libPaths) { Path libPath = new Path(path); try { final FileSystem fs = HadoopClientFactory.get() .createProxiedFileSystem(ClusterHelper.getConfiguration(cluster)); if (fs.isFile(libPath)) { // File, not a Dir archiveList.add(libPath.toString()); return; } // lib path is a directory, add each file under the lib dir to archive final FileStatus[] fileStatuses = fs.listStatus(libPath, new PathFilter() { @Override public boolean accept(Path path) { try { return fs.isFile(path) && path.getName().endsWith(".jar"); } catch (IOException ignore) { return false; } } }); for (FileStatus fileStatus : fileStatuses) { archiveList.add(fileStatus.getPath().toString()); } } catch (IOException e) { throw new FalconException("Error adding archive for custom jars under: " + libPath, e); } } }
From source file:org.apache.gobblin.compaction.dataset.DatasetHelper.java
License:Apache License
public static List<Path> getApplicableFilePaths(FileSystem fs, Path dataDir, final Collection<String> extensions) throws IOException { if (!fs.exists(dataDir)) { return Lists.newArrayList(); }// w w w . j a va2 s .c om List<Path> paths = Lists.newArrayList(); for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, dataDir, new PathFilter() { @Override public boolean accept(Path path) { for (String validExtention : extensions) { if (path.getName().endsWith(validExtention)) { return true; } } return false; } })) { paths.add(fileStatus.getPath()); } return paths; }
From source file:org.apache.gobblin.compaction.mapreduce.CompactorOutputCommitter.java
License:Apache License
/** * Commits the task, moving files to their final committed location by delegating to * {@link FileOutputCommitter} to perform the actual moving. First, renames the * files to include the count of records contained within the file and a timestamp, * in the form {recordCount}.{timestamp}.avro. Then, the files are moved to their * committed location./*from w w w. ja v a 2 s .c o m*/ */ @Override public void commitTask(TaskAttemptContext context) throws IOException { Path workPath = getWorkPath(); FileSystem fs = workPath.getFileSystem(context.getConfiguration()); if (fs.exists(workPath)) { long recordCount = getRecordCountFromCounter(context, RecordKeyDedupReducerBase.EVENT_COUNTER.RECORD_COUNT); String fileNamePrefix; if (recordCount == 0) { // recordCount == 0 indicates that it is a map-only, non-dedup job, and thus record count should // be obtained from mapper counter. fileNamePrefix = CompactionRecordCountProvider.M_OUTPUT_FILE_PREFIX; recordCount = getRecordCountFromCounter(context, RecordKeyMapperBase.EVENT_COUNTER.RECORD_COUNT); } else { fileNamePrefix = CompactionRecordCountProvider.MR_OUTPUT_FILE_PREFIX; } String fileName = CompactionRecordCountProvider.constructFileName(fileNamePrefix, "." + compactionFileExtension, recordCount); for (FileStatus status : fs.listStatus(workPath, new PathFilter() { @Override public boolean accept(Path path) { return FilenameUtils.isExtension(path.getName(), compactionFileExtension); } })) { Path newPath = new Path(status.getPath().getParent(), fileName); LOG.info(String.format("Renaming %s to %s", status.getPath(), newPath)); fs.rename(status.getPath(), newPath); } } super.commitTask(context); }
From source file:org.apache.gobblin.compaction.mapreduce.MRCompactorJobRunner.java
License:Apache License
/** * Get the list of file {@link Path}s in the given dataDir, which satisfy the extension requirements * of {@link #getApplicableFileExtensions()}. *///from ww w.j a v a2 s . co m private List<Path> getApplicableFilePaths(final Path dataDir, final FileSystem fs) throws IOException { try { return applicablePathCache.get(dataDir, new Callable<List<Path>>() { @Override public List<Path> call() throws Exception { if (!MRCompactorJobRunner.this.fs.exists(dataDir)) { return Lists.newArrayList(); } List<Path> paths = Lists.newArrayList(); for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, dataDir, new PathFilter() { @Override public boolean accept(Path path) { for (String validExtention : getApplicableFileExtensions()) { if (path.getName().endsWith(validExtention)) { return true; } } return false; } })) { paths.add(fileStatus.getPath()); } return paths; } }); } catch (ExecutionException e) { throw new IOException(e); } }
From source file:org.apache.gobblin.compaction.mapreduce.OrcCompactionTaskTest.java
License:Apache License
@Test public void basicTest() throws Exception { File basePath = Files.createTempDir(); basePath.deleteOnExit();// w w w .j ava2s . c o m String minutelyPath = "Identity/MemberAccount/minutely/2017/04/03/10/20_30/run_2017-04-03-10-20"; String hourlyPath = "Identity/MemberAccount/hourly/2017/04/03/10/"; File jobDir = new File(basePath, minutelyPath); Assert.assertTrue(jobDir.mkdirs()); // Write some ORC file for compaction here. TypeDescription schema = TypeDescription.fromString("struct<i:int,j:int>"); OrcStruct orcStruct_0 = (OrcStruct) OrcStruct.createValue(schema); orcStruct_0.setFieldValue("i", new IntWritable(1)); orcStruct_0.setFieldValue("j", new IntWritable(2)); OrcStruct orcStruct_1 = (OrcStruct) OrcStruct.createValue(schema); orcStruct_1.setFieldValue("i", new IntWritable(1)); orcStruct_1.setFieldValue("j", new IntWritable(2)); OrcStruct orcStruct_2 = (OrcStruct) OrcStruct.createValue(schema); orcStruct_2.setFieldValue("i", new IntWritable(2)); orcStruct_2.setFieldValue("j", new IntWritable(3)); OrcStruct orcStruct_3 = (OrcStruct) OrcStruct.createValue(schema); orcStruct_3.setFieldValue("i", new IntWritable(4)); orcStruct_3.setFieldValue("j", new IntWritable(5)); File file_0 = new File(jobDir, "file_0"); File file_1 = new File(jobDir, "file_1"); writeOrcRecordsInFile(new Path(file_0.getAbsolutePath()), schema, ImmutableList.of(orcStruct_0, orcStruct_2)); writeOrcRecordsInFile(new Path(file_1.getAbsolutePath()), schema, ImmutableList.of(orcStruct_1, orcStruct_3)); // Verify execution // Overwrite the job configurator factory key. 
String extensionFileName = "orcavro"; EmbeddedGobblin embeddedGobblin = createEmbeddedGobblin("basic", basePath.getAbsolutePath().toString()) .setConfiguration(CompactionJobConfigurator.COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS_KEY, TestCompactionOrcJobConfigurator.Factory.class.getName()) .setConfiguration(COMPACTION_OUTPUT_EXTENSION, extensionFileName); JobExecutionResult execution = embeddedGobblin.run(); Assert.assertTrue(execution.isSuccessful()); // Result verification File outputDir = new File(basePath, hourlyPath); FileSystem fs = FileSystem.getLocal(new Configuration()); List<FileStatus> statuses = new ArrayList<>(); for (FileStatus status : fs.listStatus(new Path(outputDir.getAbsolutePath()), new PathFilter() { @Override public boolean accept(Path path) { return FilenameUtils.isExtension(path.getName(), extensionFileName); } })) { statuses.add(status); } Assert.assertTrue(statuses.size() == 1); List<OrcStruct> result = readOrcFile(statuses.get(0).getPath()); Assert.assertEquals(result.size(), 3); Assert.assertEquals(result.get(0).getFieldValue("i"), new IntWritable(1)); Assert.assertEquals(result.get(0).getFieldValue("j"), new IntWritable(2)); Assert.assertEquals(result.get(1).getFieldValue("i"), new IntWritable(2)); Assert.assertEquals(result.get(1).getFieldValue("j"), new IntWritable(3)); Assert.assertEquals(result.get(2).getFieldValue("i"), new IntWritable(4)); Assert.assertEquals(result.get(2).getFieldValue("j"), new IntWritable(5)); }
From source file:org.apache.gobblin.hive.orc.HiveOrcSerDeManager.java
License:Apache License
/**
 * Get the schema as a TypeInfo object.
 *
 * <p>When {@code path} is a directory, its entries are filtered to those that do not start with
 * an ignored prefix, end with a configured file extension and pass {@code isORC}; the entries
 * are sorted with {@code FileListUtils.LATEST_MOD_TIME_ORDER} and the first one is used.
 * Otherwise the schema is read directly from the file at {@code path}.
 *
 * @param path path that contains the ORC files
 * @param fs {@link FileSystem}
 * @return {@link TypeInfo} with the schema information
 * @throws IOException if the path cannot be read, or if a directory holds no eligible file
 */
public TypeInfo getSchemaFromLatestFile(Path path, FileSystem fs) throws IOException {
  if (!fs.isDirectory(path)) {
    return TypeInfoUtils.getTypeInfoFromObjectInspector(OrcFile.createReader(fs, path).getObjectInspector());
  }

  PathFilter schemaCandidateFilter = candidate -> {
    String name = candidate.getName();
    try {
      return ignoredFilePrefixes.stream().noneMatch(prefix -> name.startsWith(prefix))
          && fileExtensions.stream().anyMatch(extension -> name.endsWith(extension))
          && isORC(candidate, fs);
    } catch (IOException ioe) {
      // An entry that cannot be checked is excluded rather than failing the whole listing.
      log.error("Error checking file for schema retrieval", ioe);
      return false;
    }
  };

  List<FileStatus> files = Arrays.asList(fs.listStatus(path, schemaCandidateFilter));
  if (files.isEmpty()) {
    throw new FileNotFoundException("No files in Dataset:" + path + " found for schema retrieval");
  }
  files.sort(FileListUtils.LATEST_MOD_TIME_ORDER);

  return getSchemaFromLatestFile(files.get(0).getPath(), fs);
}
From source file:org.apache.gobblin.runtime.FsDatasetStateStore.java
License:Apache License
/** * Get a {@link Map} from dataset URNs to the latest {@link JobState.DatasetState}s. * * @param jobName the job name/* ww w. java2s . com*/ * @return a {@link Map} from dataset URNs to the latest {@link JobState.DatasetState}s * @throws IOException if there's something wrong reading the {@link JobState.DatasetState}s */ public Map<String, JobState.DatasetState> getLatestDatasetStatesByUrns(final String jobName) throws IOException { Path stateStorePath = new Path(this.storeRootDir, jobName); if (!this.fs.exists(stateStorePath)) { return ImmutableMap.of(); } FileStatus[] stateStoreFileStatuses = this.fs.listStatus(stateStorePath, new PathFilter() { @Override public boolean accept(Path path) { return path.getName() .endsWith(CURRENT_DATASET_STATE_FILE_SUFFIX + DATASET_STATE_STORE_TABLE_SUFFIX); } }); if (stateStoreFileStatuses == null || stateStoreFileStatuses.length == 0) { return ImmutableMap.of(); } final Map<String, JobState.DatasetState> datasetStatesByUrns = new ConcurrentHashMap<>(); Iterator<Callable<Void>> callableIterator = Iterators.transform( Arrays.asList(stateStoreFileStatuses).iterator(), new Function<FileStatus, Callable<Void>>() { @Override public Callable<Void> apply(final FileStatus stateStoreFileStatus) { return new Callable<Void>() { @Override public Void call() throws Exception { Path stateStoreFilePath = stateStoreFileStatus.getPath(); LOGGER.info("Getting dataset states from: {}", stateStoreFilePath); List<JobState.DatasetState> previousDatasetStates = getAll(jobName, stateStoreFilePath.getName()); if (!previousDatasetStates.isEmpty()) { // There should be a single dataset state on the list if the list is not empty JobState.DatasetState previousDatasetState = previousDatasetStates.get(0); datasetStatesByUrns.put(previousDatasetState.getDatasetUrn(), previousDatasetState); } return null; } }; } }); try { List<Either<Void, ExecutionException>> results = new IteratorExecutor<>(callableIterator, this.threadPoolOfGettingDatasetState, 
ExecutorsUtils.newDaemonThreadFactory(Optional.of(LOGGER), Optional.of("GetFsDatasetStateStore-"))).executeAndGetResults(); int maxNumberOfErrorLogs = 10; IteratorExecutor.logAndThrowFailures(results, LOGGER, maxNumberOfErrorLogs); } catch (InterruptedException e) { throw new IOException("Failed to get latest dataset states.", e); } // The dataset (job) state from the deprecated "current.jst" will be read even though // the job has transitioned to the new dataset-based mechanism if (datasetStatesByUrns.size() > 1) { datasetStatesByUrns.remove(ConfigurationKeys.DEFAULT_DATASET_URN); } return datasetStatesByUrns; }
From source file:org.apache.gobblin.source.DatePartitionedNestedRetriever.java
License:Apache License
/** * This method is to filter out files that don't need to be processed by extension * @return the pathFilter/*from ww w. j a v a 2s. co m*/ */ private PathFilter getFileFilter() { final String extension = (this.expectedExtension.startsWith(".")) ? this.expectedExtension : "." + this.expectedExtension; return new PathFilter() { @Override public boolean accept(Path path) { return path.getName().endsWith(extension) && !(schemaInSourceDir && path.getName().equals(schemaFile)); } }; }