List of usage examples for org.apache.hadoop.fs.FileStatus.isDirectory()
public boolean isDirectory()
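A minimal standalone sketch of the call, not taken from any of the projects below: it lists a directory with the standard Hadoop FileSystem API and branches on isDirectory() for each entry. The path /tmp/data is only a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
  public static void main(String[] args) throws Exception {
    // Obtain the default FileSystem from the Hadoop configuration.
    FileSystem fs = FileSystem.get(new Configuration());
    // Placeholder path; replace with a directory that exists on your file system.
    Path dir = new Path("/tmp/data");
    for (FileStatus status : fs.listStatus(dir)) {
      if (status.isDirectory()) {
        System.out.println("dir:  " + status.getPath());
      } else {
        System.out.println("file: " + status.getPath());
      }
    }
  }
}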
From source file: org.apache.drill.exec.store.dfs.FileSelection.java
License: Apache License
public boolean containsDirectories(DrillFileSystem fs) throws IOException {
  if (dirStatus == StatusType.NOT_CHECKED) {
    dirStatus = StatusType.NO_DIRS;
    for (final FileStatus status : getStatuses(fs)) {
      if (status.isDirectory()) {
        dirStatus = StatusType.HAS_DIRS;
        break;
      }
    }
  }
  return dirStatus == StatusType.HAS_DIRS;
}
From source file: org.apache.drill.exec.store.dfs.FileSelection.java
License: Apache License
public FileSelection minusDirectories(DrillFileSystem fs) throws IOException {
  if (isExpandedFully()) {
    return this;
  }
  Stopwatch timer = Stopwatch.createStarted();
  final List<FileStatus> statuses = getStatuses(fs);
  final int total = statuses.size();
  final Path[] paths = new Path[total];
  for (int i = 0; i < total; i++) {
    paths[i] = statuses.get(i).getPath();
  }
  final List<FileStatus> allStats = fs.list(true, paths);
  final List<FileStatus> nonDirectories = Lists.newArrayList(
      Iterables.filter(allStats, new Predicate<FileStatus>() {
        @Override
        public boolean apply(@Nullable FileStatus status) {
          return !status.isDirectory();
        }
      }));
  final FileSelection fileSel = create(nonDirectories, null, selectionRoot);
  logger.debug("FileSelection.minusDirectories() took {} ms, numFiles: {}",
      timer.elapsed(TimeUnit.MILLISECONDS), total);
  // fileSel will be null if we query an empty folder
  if (fileSel != null) {
    fileSel.setExpandedFully();
  }
  return fileSel;
}
From source file: org.apache.drill.exec.store.parquet.FooterGatherer.java
License: Apache License
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism)
    throws IOException {
  final List<TimedRunnable<Footer>> readers = Lists.newArrayList();
  List<Footer> foundFooters = Lists.newArrayList();
  for (FileStatus status : statuses) {
    if (status.isDirectory()) {
      // first we check for summary file.
      FileSystem fs = status.getPath().getFileSystem(conf);
      final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      if (fs.exists(summaryPath)) {
        FileStatus summaryStatus = fs.getFileStatus(summaryPath);
        foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
        continue;
      }
      // else we handle as normal file.
      for (FileStatus inStatus : fs.listStatus(status.getPath(), new DrillPathFilter())) {
        readers.add(new FooterReader(conf, inStatus));
      }
    } else {
      readers.add(new FooterReader(conf, status));
    }
  }
  if (!readers.isEmpty()) {
    foundFooters.addAll(TimedRunnable.run("Fetch Parquet Footers", logger, readers, parallelism));
  }
  return foundFooters;
}
From source file: org.apache.drill.exec.store.parquet.metadata.Metadata.java
License: Apache License
/**
 * Create the parquet metadata files for the directory at the given path and for any subdirectories.
 * Metadata cache files written to the disk contain relative paths. Returned Pair of metadata contains absolute paths.
 *
 * @param path to the directory of the parquet table
 * @param fs file system
 * @param allColumns if set, store column metadata for all the columns
 * @param columnSet Set of columns for which column metadata has to be stored
 * @return Pair of parquet metadata. The left one is a parquet metadata for the table. The right one of the Pair is
 *         a metadata for all subdirectories (if they are present and there are no parquet files in the
 *         {@code path} directory).
 * @throws IOException if parquet metadata can't be serialized and written to the json file
 */
private Pair<ParquetTableMetadata_v3, ParquetTableMetadataDirs> createMetaFilesRecursively(final Path path,
    FileSystem fs, boolean allColumns, Set<String> columnSet) throws IOException {
  Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
  List<ParquetFileMetadata_v3> metaDataList = Lists.newArrayList();
  List<Path> directoryList = Lists.newArrayList();
  ConcurrentHashMap<ColumnTypeMetadata_v3.Key, ColumnTypeMetadata_v3> columnTypeInfoSet = new ConcurrentHashMap<>();
  Path p = path;
  FileStatus fileStatus = fs.getFileStatus(p);
  assert fileStatus.isDirectory() : "Expected directory";

  final Map<FileStatus, FileSystem> childFiles = new LinkedHashMap<>();

  for (final FileStatus file : DrillFileSystemUtil.listAll(fs, p, false)) {
    if (file.isDirectory()) {
      ParquetTableMetadata_v3 subTableMetadata =
          createMetaFilesRecursively(file.getPath(), fs, allColumns, columnSet).getLeft();
      metaDataList.addAll(subTableMetadata.files);
      directoryList.addAll(subTableMetadata.directories);
      directoryList.add(file.getPath());
      // Merge the schema from the child level into the current level
      // TODO: We need a merge method that merges two columns with the same name but different types
      columnTypeInfoSet.putAll(subTableMetadata.columnTypeInfo);
    } else {
      childFiles.put(file, fs);
    }
  }
  ParquetTableMetadata_v3 parquetTableMetadata = new ParquetTableMetadata_v3(
      SUPPORTED_VERSIONS.last().toString(), DrillVersionInfo.getVersion());
  if (childFiles.size() > 0) {
    List<ParquetFileMetadata_v3> childFilesMetadata =
        getParquetFileMetadata_v3(parquetTableMetadata, childFiles, allColumns, columnSet);
    metaDataList.addAll(childFilesMetadata);
    // Note that we do not need to merge the columnInfo at this point. The columnInfo is already added
    // to the parquetTableMetadata.
  }

  parquetTableMetadata.directories = directoryList;
  parquetTableMetadata.files = metaDataList;
  // TODO: We need a merge method that merges two columns with the same name but different types
  if (parquetTableMetadata.columnTypeInfo == null) {
    parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
  }
  parquetTableMetadata.columnTypeInfo.putAll(columnTypeInfoSet);

  for (String oldName : OLD_METADATA_FILENAMES) {
    fs.delete(new Path(p, oldName), false);
  }
  // relative paths in the metadata are only necessary for meta cache files.
  ParquetTableMetadata_v3 metadataTableWithRelativePaths =
      MetadataPathUtils.createMetadataWithRelativePaths(parquetTableMetadata, path);
  writeFile(metadataTableWithRelativePaths, new Path(p, METADATA_FILENAME), fs);

  if (directoryList.size() > 0 && childFiles.size() == 0) {
    ParquetTableMetadataDirs parquetTableMetadataDirsRelativePaths =
        new ParquetTableMetadataDirs(metadataTableWithRelativePaths.directories);
    writeFile(parquetTableMetadataDirsRelativePaths, new Path(p, METADATA_DIRECTORIES_FILENAME), fs);
    if (timer != null) {
      logger.debug("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    ParquetTableMetadataDirs parquetTableMetadataDirs = new ParquetTableMetadataDirs(directoryList);
    return Pair.of(parquetTableMetadata, parquetTableMetadataDirs);
  }
  List<Path> emptyDirList = new ArrayList<>();
  if (timer != null) {
    logger.debug("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
    timer.stop();
  }
  return Pair.of(parquetTableMetadata, new ParquetTableMetadataDirs(emptyDirList));
}
From source file: org.apache.drill.exec.store.parquet.Metadata.java
License: Apache License
/**
 * Create the parquet metadata file for the directory at the given path, and for any subdirectories
 *
 * @param path
 * @throws IOException
 */
private Pair<ParquetTableMetadata_v2, ParquetTableMetadataDirs> createMetaFilesRecursively(final String path)
    throws IOException {
  Stopwatch timer = Stopwatch.createStarted();
  List<ParquetFileMetadata_v2> metaDataList = Lists.newArrayList();
  List<String> directoryList = Lists.newArrayList();
  ConcurrentHashMap<ColumnTypeMetadata_v2.Key, ColumnTypeMetadata_v2> columnTypeInfoSet = new ConcurrentHashMap<>();
  Path p = new Path(path);
  FileStatus fileStatus = fs.getFileStatus(p);
  assert fileStatus.isDirectory() : "Expected directory";

  final List<FileStatus> childFiles = Lists.newArrayList();

  for (final FileStatus file : fs.listStatus(p, new DrillPathFilter())) {
    if (file.isDirectory()) {
      ParquetTableMetadata_v2 subTableMetadata =
          createMetaFilesRecursively(file.getPath().toString()).getLeft();
      metaDataList.addAll(subTableMetadata.files);
      directoryList.addAll(subTableMetadata.directories);
      directoryList.add(file.getPath().toString());
      // Merge the schema from the child level into the current level
      // TODO: We need a merge method that merges two columns with the same name but different types
      columnTypeInfoSet.putAll(subTableMetadata.columnTypeInfo);
    } else {
      childFiles.add(file);
    }
  }
  ParquetTableMetadata_v2 parquetTableMetadata = new ParquetTableMetadata_v2();
  if (childFiles.size() > 0) {
    List<ParquetFileMetadata_v2> childFilesMetadata =
        getParquetFileMetadata_v2(parquetTableMetadata, childFiles);
    metaDataList.addAll(childFilesMetadata);
    // Note that we do not need to merge the columnInfo at this point. The columnInfo is already added
    // to the parquetTableMetadata.
  }
  parquetTableMetadata.directories = directoryList;
  parquetTableMetadata.files = metaDataList;
  // TODO: We need a merge method that merges two columns with the same name but different types
  if (parquetTableMetadata.columnTypeInfo == null) {
    parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
  }
  parquetTableMetadata.columnTypeInfo.putAll(columnTypeInfoSet);

  for (String oldName : OLD_METADATA_FILENAMES) {
    fs.delete(new Path(p, oldName), false);
  }
  writeFile(parquetTableMetadata, new Path(p, METADATA_FILENAME));

  if (directoryList.size() > 0 && childFiles.size() == 0) {
    ParquetTableMetadataDirs parquetTableMetadataDirs = new ParquetTableMetadataDirs(directoryList);
    writeFile(parquetTableMetadataDirs, new Path(p, METADATA_DIRECTORIES_FILENAME));
    logger.info("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
    timer.stop();
    return Pair.of(parquetTableMetadata, parquetTableMetadataDirs);
  }
  List<String> emptyDirList = Lists.newArrayList();
  logger.info("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
  timer.stop();
  return Pair.of(parquetTableMetadata, new ParquetTableMetadataDirs(emptyDirList));
}
From source file: org.apache.drill.exec.store.parquet.Metadata.java
License: Apache License
/**
 * Recursively get a list of files
 *
 * @param fileStatus
 * @return
 * @throws IOException
 */
private List<FileStatus> getFileStatuses(FileStatus fileStatus) throws IOException {
  List<FileStatus> statuses = Lists.newArrayList();
  if (fileStatus.isDirectory()) {
    for (FileStatus child : fs.listStatus(fileStatus.getPath(), new DrillPathFilter())) {
      statuses.addAll(getFileStatuses(child));
    }
  } else {
    statuses.add(fileStatus);
  }
  return statuses;
}
From source file: org.apache.drill.exec.store.parquet.ParquetGroupScan.java
License: Apache License
/**
 * Create and return a new file selection based on reading the metadata cache file.
 *
 * This function also initializes a few of ParquetGroupScan's fields as appropriate.
 *
 * @param selection initial file selection
 * @param metaFilePath metadata cache file path
 * @return file selection read from cache
 *
 * @throws IOException
 * @throws UserException when the updated selection is empty, this happens if the user selects an empty folder.
 */
private FileSelection initFromMetadataCache(FileSelection selection, Path metaFilePath) throws IOException {
  // get the metadata for the root directory by reading the metadata file
  // parquetTableMetadata contains the metadata for all files in the selection root folder, but we need to make sure
  // we only select the files that are part of selection (by setting fileSet appropriately)

  // get (and set internal field) the metadata for the directory by reading the metadata file
  this.parquetTableMetadata = Metadata.readBlockMeta(fs, metaFilePath.toString(), selection.getMetaContext());
  List<FileStatus> fileStatuses = selection.getStatuses(fs);

  if (fileSet == null) {
    fileSet = Sets.newHashSet();
  }

  final Path first = fileStatuses.get(0).getPath();
  if (fileStatuses.size() == 1 && selection.getSelectionRoot().equals(first.toString())) {
    // we are selecting all files from selection root. Expand the file list from the cache
    for (Metadata.ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
      fileSet.add(file.getPath());
    }
  } else if (selection.isExpandedPartial() && !selection.hadWildcard() && cacheFileRoot != null) {
    if (selection.wasAllPartitionsPruned()) {
      // if all partitions were previously pruned, we only need to read 1 file (for the schema)
      fileSet.add(this.parquetTableMetadata.getFiles().get(0).getPath());
    } else {
      // we are here if the selection is in the expanded_partial state (i.e it has directories). We get the
      // list of files from the metadata cache file that is present in the cacheFileRoot directory and populate
      // the fileSet. However, this is *not* the final list of files that will be scanned in execution since the
      // second phase of partition pruning will apply on the files and modify the file selection appropriately.
      for (Metadata.ParquetFileMetadata file : this.parquetTableMetadata.getFiles()) {
        fileSet.add(file.getPath());
      }
    }
  } else {
    // we need to expand the files from fileStatuses
    for (FileStatus status : fileStatuses) {
      if (status.isDirectory()) {
        //TODO [DRILL-4496] read the metadata cache files in parallel
        final Path metaPath = new Path(status.getPath(), Metadata.METADATA_FILENAME);
        final Metadata.ParquetTableMetadataBase metadata =
            Metadata.readBlockMeta(fs, metaPath.toString(), selection.getMetaContext());
        for (Metadata.ParquetFileMetadata file : metadata.getFiles()) {
          fileSet.add(file.getPath());
        }
      } else {
        final Path path = Path.getPathWithoutSchemeAndAuthority(status.getPath());
        fileSet.add(path.toString());
      }
    }
  }

  if (fileSet.isEmpty()) {
    // no files were found, most likely we tried to query some empty sub folders
    throw UserException.validationError().message("The table you tried to query is empty").build(logger);
  }

  List<String> fileNames = Lists.newArrayList(fileSet);

  // when creating the file selection, set the selection root without the URI prefix
  // The reason is that the file names above have been created in the form
  // /a/b/c.parquet and the format of the selection root must match that of the file names
  // otherwise downstream operations such as partition pruning can break.
  final Path metaRootPath = Path.getPathWithoutSchemeAndAuthority(new Path(selection.getSelectionRoot()));
  this.selectionRoot = metaRootPath.toString();

  // Use the FileSelection constructor directly here instead of the FileSelection.create() method
  // because create() changes the root to include the scheme and authority; In future, if create()
  // is the preferred way to instantiate a file selection, we may need to do something different...
  // WARNING: file statuses and file names are inconsistent
  FileSelection newSelection = new FileSelection(selection.getStatuses(fs), fileNames, metaRootPath.toString(),
      cacheFileRoot, selection.wasAllPartitionsPruned());

  newSelection.setExpandedFully();
  newSelection.setMetaContext(selection.getMetaContext());
  return newSelection;
}
From source file: org.apache.drill.exec.store.parquet.ParquetGroupScan.java
License: Apache License
private void getFiles(String path, List<FileStatus> fileStatuses) throws IOException {
  Path p = Path.getPathWithoutSchemeAndAuthority(new Path(path));
  FileStatus fileStatus = fs.getFileStatus(p);
  if (fileStatus.isDirectory()) {
    for (FileStatus f : fs.listStatus(p, new DrillPathFilter())) {
      getFiles(f.getPath().toString(), fileStatuses);
    }
  } else {
    fileStatuses.add(fileStatus);
  }
}
From source file: org.apache.drill.exec.util.FileSystemUtil.java
License: Apache License
/**
 * Checks if file status is applicable based on file system object {@link Scope}.
 *
 * @param status file status
 * @param scope file system objects scope
 * @return true if status is applicable, false otherwise
 */
private static boolean isStatusApplicable(FileStatus status, Scope scope) {
  switch (scope) {
    case DIRECTORIES:
      return status.isDirectory();
    case FILES:
      return status.isFile();
    case ALL:
      return true;
    default:
      return false;
  }
}
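A hypothetical caller sketch, not part of FileSystemUtil: it applies the same check as the Scope.DIRECTORIES branch above while listing a directory. The helper name listDirectories and its signature are assumptions for illustration only.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper: returns only the sub-directories of the given path,
// mirroring the Scope.DIRECTORIES case above.
public static List<FileStatus> listDirectories(FileSystem fs, Path path) throws IOException {
  List<FileStatus> directoriesOnly = new ArrayList<>();
  for (FileStatus status : fs.listStatus(path)) {
    if (status.isDirectory()) {
      directoriesOnly.add(status);
    }
  }
  return directoriesOnly;
}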
From source file: org.apache.falcon.entity.parser.ClusterEntityParserTest.java
License: Apache License
/**
 * A lightweight unit test for a cluster where location type working is missing.
 * It should automatically get generated.
 * Extensive tests are found in ClusterEntityValidationIT.
 */
@Test
public void testClusterWithOnlyStaging() throws Exception {
  ClusterEntityParser clusterEntityParser = Mockito
      .spy((ClusterEntityParser) EntityParserFactory.getParser(EntityType.CLUSTER));
  Cluster cluster = (Cluster) this.dfsCluster.getCluster().copy();
  Locations locations = getClusterLocations("staging2", null);
  cluster.setLocations(locations);

  Mockito.doNothing().when(clusterEntityParser).validateWorkflowInterface(cluster);
  Mockito.doNothing().when(clusterEntityParser).validateMessagingInterface(cluster);
  Mockito.doNothing().when(clusterEntityParser).validateRegistryInterface(cluster);

  String stagingPath = ClusterHelper.getLocation(cluster, ClusterLocationType.STAGING).getPath();
  this.dfsCluster.getFileSystem().mkdirs(new Path(stagingPath), HadoopClientFactory.ALL_PERMISSION);
  clusterEntityParser.validate(cluster);

  String workingDirPath = cluster.getLocations().getLocations().get(0).getPath() + "/working";
  Assert.assertEquals(ClusterHelper.getLocation(cluster, ClusterLocationType.WORKING).getPath(), workingDirPath);

  FileStatus workingDirStatus = this.dfsCluster.getFileSystem().getFileLinkStatus(new Path(workingDirPath));
  Assert.assertTrue(workingDirStatus.isDirectory());
  Assert.assertEquals(workingDirStatus.getPermission(), HadoopClientFactory.READ_EXECUTE_PERMISSION);
  Assert.assertEquals(workingDirStatus.getOwner(), UserGroupInformation.getLoginUser().getShortUserName());

  FileStatus emptyDirStatus = this.dfsCluster.getFileSystem()
      .getFileStatus(new Path(stagingPath + "/" + ClusterHelper.EMPTY_DIR_NAME));
  Assert.assertEquals(emptyDirStatus.getPermission(), HadoopClientFactory.READ_ONLY_PERMISSION);
  Assert.assertEquals(emptyDirStatus.getOwner(), UserGroupInformation.getLoginUser().getShortUserName());

  String stagingSubdirFeed = cluster.getLocations().getLocations().get(0).getPath() + "/falcon/workflows/feed";
  String stagingSubdirProcess = cluster.getLocations().getLocations().get(0).getPath() + "/falcon/workflows/process";
  FileStatus stagingSubdirFeedStatus = this.dfsCluster.getFileSystem()
      .getFileLinkStatus(new Path(stagingSubdirFeed));
  FileStatus stagingSubdirProcessStatus = this.dfsCluster.getFileSystem()
      .getFileLinkStatus(new Path(stagingSubdirProcess));

  Assert.assertTrue(stagingSubdirFeedStatus.isDirectory());
  Assert.assertEquals(stagingSubdirFeedStatus.getPermission(), HadoopClientFactory.ALL_PERMISSION);
  Assert.assertTrue(stagingSubdirProcessStatus.isDirectory());
  Assert.assertEquals(stagingSubdirProcessStatus.getPermission(), HadoopClientFactory.ALL_PERMISSION);
}