Example usage for org.apache.hadoop.fs FileStatus isDirectory

List of usage examples for org.apache.hadoop.fs FileStatus isDirectory

Introduction

This page collects example usages of the org.apache.hadoop.fs.FileStatus.isDirectory() method.

Prototype

public boolean isDirectory() 

Document

Is this a directory?
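
Before the full examples below, here is a minimal, self-contained sketch of the typical pattern: list a directory and branch on isDirectory(). The path /tmp/data and the default Configuration are illustrative assumptions, not taken from the examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws Exception {
        // Obtain the default file system; the listed path is a placeholder.
        FileSystem fs = FileSystem.get(new Configuration());
        for (FileStatus status : fs.listStatus(new Path("/tmp/data"))) {
            if (status.isDirectory()) {
                System.out.println("dir:  " + status.getPath());
            } else {
                System.out.println("file: " + status.getPath());
            }
        }
    }
}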

Usage

From source file:org.apache.drill.exec.store.dfs.FileSelection.java

License:Apache License

public boolean containsDirectories(DrillFileSystem fs) throws IOException {
    if (dirStatus == StatusType.NOT_CHECKED) {
        dirStatus = StatusType.NO_DIRS;
        for (final FileStatus status : getStatuses(fs)) {
            if (status.isDirectory()) {
                dirStatus = StatusType.HAS_DIRS;
                break;
            }
        }
    }
    return dirStatus == StatusType.HAS_DIRS;
}

From source file:org.apache.drill.exec.store.dfs.FileSelection.java

License:Apache License

public FileSelection minusDirectories(DrillFileSystem fs) throws IOException {
    if (isExpandedFully()) {
        return this;
    }
    Stopwatch timer = Stopwatch.createStarted();
    final List<FileStatus> statuses = getStatuses(fs);
    final int total = statuses.size();
    final Path[] paths = new Path[total];
    for (int i = 0; i < total; i++) {
        paths[i] = statuses.get(i).getPath();
    }
    final List<FileStatus> allStats = fs.list(true, paths);
    final List<FileStatus> nonDirectories = Lists
            .newArrayList(Iterables.filter(allStats, new Predicate<FileStatus>() {
                @Override
                public boolean apply(@Nullable FileStatus status) {
                    return !status.isDirectory();
                }
            }));

    final FileSelection fileSel = create(nonDirectories, null, selectionRoot);
    logger.debug("FileSelection.minusDirectories() took {} ms, numFiles: {}",
            timer.elapsed(TimeUnit.MILLISECONDS), total);

    // fileSel will be null if we query an empty folder
    if (fileSel != null) {
        fileSel.setExpandedFully();
    }

    return fileSel;
}

From source file:org.apache.drill.exec.store.parquet.FooterGatherer.java

License:Apache License

public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism)
        throws IOException {
    final List<TimedRunnable<Footer>> readers = Lists.newArrayList();
    List<Footer> foundFooters = Lists.newArrayList();
    for (FileStatus status : statuses) {

        if (status.isDirectory()) {
            // first we check for summary file.
            FileSystem fs = status.getPath().getFileSystem(conf);

            final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
            if (fs.exists(summaryPath)) {
                FileStatus summaryStatus = fs.getFileStatus(summaryPath);
                foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
                continue;
            }

            // else we handle as normal file.
            for (FileStatus inStatus : fs.listStatus(status.getPath(), new DrillPathFilter())) {
                readers.add(new FooterReader(conf, inStatus));
            }
        } else {
            readers.add(new FooterReader(conf, status));
        }

    }
    if (!readers.isEmpty()) {
        foundFooters.addAll(TimedRunnable.run("Fetch Parquet Footers", logger, readers, parallelism));
    }

    return foundFooters;
}

From source file:org.apache.drill.exec.store.parquet.metadata.Metadata.java

License:Apache License

/**
 * Create the parquet metadata files for the directory at the given path and for any subdirectories.
 * Metadata cache files written to the disk contain relative paths. Returned Pair of metadata contains absolute paths.
 *
 * @param path to the directory of the parquet table
 * @param fs file system
 * @param allColumns if set, store column metadata for all the columns
 * @param columnSet Set of columns for which column metadata has to be stored
 * @return Pair of parquet metadata. The left one is the parquet metadata for the table. The right one is
 *         the metadata for all subdirectories (if they are present and there are no parquet files in the
 *         {@code path} directory).
 * @throws IOException if parquet metadata can't be serialized and written to the json file
 */
private Pair<ParquetTableMetadata_v3, ParquetTableMetadataDirs> createMetaFilesRecursively(final Path path,
        FileSystem fs, boolean allColumns, Set<String> columnSet) throws IOException {
    Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
    List<ParquetFileMetadata_v3> metaDataList = Lists.newArrayList();
    List<Path> directoryList = Lists.newArrayList();
    ConcurrentHashMap<ColumnTypeMetadata_v3.Key, ColumnTypeMetadata_v3> columnTypeInfoSet = new ConcurrentHashMap<>();
    Path p = path;
    FileStatus fileStatus = fs.getFileStatus(p);
    assert fileStatus.isDirectory() : "Expected directory";

    final Map<FileStatus, FileSystem> childFiles = new LinkedHashMap<>();

    for (final FileStatus file : DrillFileSystemUtil.listAll(fs, p, false)) {
        if (file.isDirectory()) {
            ParquetTableMetadata_v3 subTableMetadata = (createMetaFilesRecursively(file.getPath(), fs,
                    allColumns, columnSet)).getLeft();
            metaDataList.addAll(subTableMetadata.files);
            directoryList.addAll(subTableMetadata.directories);
            directoryList.add(file.getPath());
            // Merge the schema from the child level into the current level
            //TODO: We need a merge method that merges two columns with the same name but different types
            columnTypeInfoSet.putAll(subTableMetadata.columnTypeInfo);
        } else {
            childFiles.put(file, fs);
        }
    }
    ParquetTableMetadata_v3 parquetTableMetadata = new ParquetTableMetadata_v3(
            SUPPORTED_VERSIONS.last().toString(), DrillVersionInfo.getVersion());
    if (childFiles.size() > 0) {
        List<ParquetFileMetadata_v3> childFilesMetadata = getParquetFileMetadata_v3(parquetTableMetadata,
                childFiles, allColumns, columnSet);
        metaDataList.addAll(childFilesMetadata);
        // Note that we do not need to merge the columnInfo at this point. The columnInfo is already added
        // to the parquetTableMetadata.
    }

    parquetTableMetadata.directories = directoryList;
    parquetTableMetadata.files = metaDataList;
    // TODO: We need a merge method that merges two columns with the same name but different types
    if (parquetTableMetadata.columnTypeInfo == null) {
        parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
    }
    parquetTableMetadata.columnTypeInfo.putAll(columnTypeInfoSet);

    for (String oldName : OLD_METADATA_FILENAMES) {
        fs.delete(new Path(p, oldName), false);
    }
    //  relative paths in the metadata are only necessary for meta cache files.
    ParquetTableMetadata_v3 metadataTableWithRelativePaths = MetadataPathUtils
            .createMetadataWithRelativePaths(parquetTableMetadata, path);
    writeFile(metadataTableWithRelativePaths, new Path(p, METADATA_FILENAME), fs);

    if (directoryList.size() > 0 && childFiles.size() == 0) {
        ParquetTableMetadataDirs parquetTableMetadataDirsRelativePaths = new ParquetTableMetadataDirs(
                metadataTableWithRelativePaths.directories);
        writeFile(parquetTableMetadataDirsRelativePaths, new Path(p, METADATA_DIRECTORIES_FILENAME), fs);
        if (timer != null) {
            logger.debug("Creating metadata files recursively took {} ms",
                    timer.elapsed(TimeUnit.MILLISECONDS));
        }
        ParquetTableMetadataDirs parquetTableMetadataDirs = new ParquetTableMetadataDirs(directoryList);
        return Pair.of(parquetTableMetadata, parquetTableMetadataDirs);
    }
    List<Path> emptyDirList = new ArrayList<>();
    if (timer != null) {
        logger.debug("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
    }
    return Pair.of(parquetTableMetadata, new ParquetTableMetadataDirs(emptyDirList));
}

From source file:org.apache.drill.exec.store.parquet.Metadata.java

License:Apache License

/**
 * Create the parquet metadata file for the directory at the given path, and for any subdirectories
 *
 * @param path
 * @throws IOException
 */
private Pair<ParquetTableMetadata_v2, ParquetTableMetadataDirs> createMetaFilesRecursively(final String path)
        throws IOException {
    Stopwatch timer = Stopwatch.createStarted();
    List<ParquetFileMetadata_v2> metaDataList = Lists.newArrayList();
    List<String> directoryList = Lists.newArrayList();
    ConcurrentHashMap<ColumnTypeMetadata_v2.Key, ColumnTypeMetadata_v2> columnTypeInfoSet = new ConcurrentHashMap<>();
    Path p = new Path(path);
    FileStatus fileStatus = fs.getFileStatus(p);
    assert fileStatus.isDirectory() : "Expected directory";

    final List<FileStatus> childFiles = Lists.newArrayList();

    for (final FileStatus file : fs.listStatus(p, new DrillPathFilter())) {
        if (file.isDirectory()) {
            ParquetTableMetadata_v2 subTableMetadata = (createMetaFilesRecursively(file.getPath().toString()))
                    .getLeft();
            metaDataList.addAll(subTableMetadata.files);
            directoryList.addAll(subTableMetadata.directories);
            directoryList.add(file.getPath().toString());
            // Merge the schema from the child level into the current level
            //TODO: We need a merge method that merges two columns with the same name but different types
            columnTypeInfoSet.putAll(subTableMetadata.columnTypeInfo);
        } else {
            childFiles.add(file);
        }
    }
    ParquetTableMetadata_v2 parquetTableMetadata = new ParquetTableMetadata_v2();
    if (childFiles.size() > 0) {
        List<ParquetFileMetadata_v2> childFilesMetadata = getParquetFileMetadata_v2(parquetTableMetadata,
                childFiles);
        metaDataList.addAll(childFilesMetadata);
        // Note that we do not need to merge the columnInfo at this point. The columnInfo is already added
        // to the parquetTableMetadata.
    }

    parquetTableMetadata.directories = directoryList;
    parquetTableMetadata.files = metaDataList;
    //TODO: We need a merge method that merges two columns with the same name but different types
    if (parquetTableMetadata.columnTypeInfo == null) {
        parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
    }
    parquetTableMetadata.columnTypeInfo.putAll(columnTypeInfoSet);

    for (String oldname : OLD_METADATA_FILENAMES) {
        fs.delete(new Path(p, oldname), false);
    }
    writeFile(parquetTableMetadata, new Path(p, METADATA_FILENAME));

    if (directoryList.size() > 0 && childFiles.size() == 0) {
        ParquetTableMetadataDirs parquetTableMetadataDirs = new ParquetTableMetadataDirs(directoryList);
        writeFile(parquetTableMetadataDirs, new Path(p, METADATA_DIRECTORIES_FILENAME));
        logger.info("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
        return Pair.of(parquetTableMetadata, parquetTableMetadataDirs);
    }
    List<String> emptyDirList = Lists.newArrayList();
    logger.info("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
    timer.stop();
    return Pair.of(parquetTableMetadata, new ParquetTableMetadataDirs(emptyDirList));
}

From source file:org.apache.drill.exec.store.parquet.Metadata.java

License:Apache License

/**
 * Recursively get a list of files
 *
 * @param fileStatus
 * @return
 * @throws IOException
 */
private List<FileStatus> getFileStatuses(FileStatus fileStatus) throws IOException {
    List<FileStatus> statuses = Lists.newArrayList();
    if (fileStatus.isDirectory()) {
        for (FileStatus child : fs.listStatus(fileStatus.getPath(), new DrillPathFilter())) {
            statuses.addAll(getFileStatuses(child));
        }
    } else {
        statuses.add(fileStatus);
    }
    return statuses;
}

From source file:org.apache.drill.exec.store.parquet.ParquetGroupScan.java

License:Apache License

/**
 * Create and return a new file selection based on reading the metadata cache file.
 *
 * This function also initializes a few of ParquetGroupScan's fields as appropriate.
 *
 * @param selection initial file selection
 * @param metaFilePath metadata cache file path
 * @return file selection read from cache
 *
 * @throws IOException
 * @throws UserException when the updated selection is empty, this happens if the user selects an empty folder.
 */
private FileSelection initFromMetadataCache(FileSelection selection, Path metaFilePath) throws IOException {
    // get the metadata for the root directory by reading the metadata file
    // parquetTableMetadata contains the metadata for all files in the selection root folder, but we need to make sure
    // we only select the files that are part of selection (by setting fileSet appropriately)

    // get (and set internal field) the metadata for the directory by reading the metadata file
    this.parquetTableMetadata = Metadata.readBlockMeta(fs, metaFilePath.toString(), selection.getMetaContext());
    List<FileStatus> fileStatuses = selection.getStatuses(fs);

    if (fileSet == null) {
        fileSet = Sets.newHashSet();
    }

    final Path first = fileStatuses.get(0).getPath();
    if (fileStatuses.size() == 1 && selection.getSelectionRoot().equals(first.toString())) {
        // we are selecting all files from selection root. Expand the file list from the cache
        for (Metadata.ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
            fileSet.add(file.getPath());
        }

    } else if (selection.isExpandedPartial() && !selection.hadWildcard() && cacheFileRoot != null) {
        if (selection.wasAllPartitionsPruned()) {
            // if all partitions were previously pruned, we only need to read 1 file (for the schema)
            fileSet.add(this.parquetTableMetadata.getFiles().get(0).getPath());
        } else {
            // we are here if the selection is in the expanded_partial state (i.e it has directories).  We get the
            // list of files from the metadata cache file that is present in the cacheFileRoot directory and populate
            // the fileSet. However, this is *not* the final list of files that will be scanned in execution since the
            // second phase of partition pruning will apply on the files and modify the file selection appropriately.
            for (Metadata.ParquetFileMetadata file : this.parquetTableMetadata.getFiles()) {
                fileSet.add(file.getPath());
            }
        }
    } else {
        // we need to expand the files from fileStatuses
        for (FileStatus status : fileStatuses) {
            if (status.isDirectory()) {
                //TODO [DRILL-4496] read the metadata cache files in parallel
                final Path metaPath = new Path(status.getPath(), Metadata.METADATA_FILENAME);
                final Metadata.ParquetTableMetadataBase metadata = Metadata.readBlockMeta(fs,
                        metaPath.toString(), selection.getMetaContext());
                for (Metadata.ParquetFileMetadata file : metadata.getFiles()) {
                    fileSet.add(file.getPath());
                }
            } else {
                final Path path = Path.getPathWithoutSchemeAndAuthority(status.getPath());
                fileSet.add(path.toString());
            }
        }
    }

    if (fileSet.isEmpty()) {
        // no files were found, most likely we tried to query some empty sub folders
        throw UserException.validationError().message("The table you tried to query is empty").build(logger);
    }

    List<String> fileNames = Lists.newArrayList(fileSet);

    // when creating the file selection, set the selection root without the URI prefix
    // The reason is that the file names above have been created in the form
    // /a/b/c.parquet and the format of the selection root must match that of the file names
    // otherwise downstream operations such as partition pruning can break.
    final Path metaRootPath = Path.getPathWithoutSchemeAndAuthority(new Path(selection.getSelectionRoot()));
    this.selectionRoot = metaRootPath.toString();

    // Use the FileSelection constructor directly here instead of the FileSelection.create() method
    // because create() changes the root to include the scheme and authority; In future, if create()
    // is the preferred way to instantiate a file selection, we may need to do something different...
    // WARNING: file statuses and file names are inconsistent
    FileSelection newSelection = new FileSelection(selection.getStatuses(fs), fileNames,
            metaRootPath.toString(), cacheFileRoot, selection.wasAllPartitionsPruned());

    newSelection.setExpandedFully();
    newSelection.setMetaContext(selection.getMetaContext());
    return newSelection;
}

From source file:org.apache.drill.exec.store.parquet.ParquetGroupScan.java

License:Apache License

private void getFiles(String path, List<FileStatus> fileStatuses) throws IOException {
    Path p = Path.getPathWithoutSchemeAndAuthority(new Path(path));
    FileStatus fileStatus = fs.getFileStatus(p);
    if (fileStatus.isDirectory()) {
        for (FileStatus f : fs.listStatus(p, new DrillPathFilter())) {
            getFiles(f.getPath().toString(), fileStatuses);
        }
    } else {
        fileStatuses.add(fileStatus);
    }
}

From source file:org.apache.drill.exec.util.FileSystemUtil.java

License:Apache License

/**
 * Checks if file status is applicable based on file system object {@link Scope}.
 *
 * @param status file status
 * @param scope file system objects scope
 * @return true if status is applicable, false otherwise
 */
private static boolean isStatusApplicable(FileStatus status, Scope scope) {
    switch (scope) {
    case DIRECTORIES:
        return status.isDirectory();
    case FILES:
        return status.isFile();
    case ALL:
        return true;
    default:
        return false;
    }
}
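
A hedged sketch of how a scope check like the one above is typically applied to a directory listing. The standalone Scope enum and the listByScope helper are illustrative assumptions, not part of FileSystemUtil's actual API:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ScopeFilterSketch {

    // Illustrative stand-in for the Scope enum used above.
    enum Scope { DIRECTORIES, FILES, ALL }

    // Keep only the listing entries that match the requested scope.
    static List<FileStatus> listByScope(FileSystem fs, Path path, Scope scope) throws IOException {
        List<FileStatus> result = new ArrayList<>();
        for (FileStatus status : fs.listStatus(path)) {
            boolean applicable;
            switch (scope) {
            case DIRECTORIES:
                applicable = status.isDirectory();
                break;
            case FILES:
                applicable = status.isFile();
                break;
            default:
                applicable = true;
            }
            if (applicable) {
                result.add(status);
            }
        }
        return result;
    }
}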

From source file:org.apache.falcon.entity.parser.ClusterEntityParserTest.java

License:Apache License

/**
 * A lightweight unit test for a cluster where the working location type is missing.
 * It should automatically get generated.
 * Extensive tests are found in ClusterEntityValidationIT.
 */
@Test
public void testClusterWithOnlyStaging() throws Exception {
    ClusterEntityParser clusterEntityParser = Mockito
            .spy((ClusterEntityParser) EntityParserFactory.getParser(EntityType.CLUSTER));
    Cluster cluster = (Cluster) this.dfsCluster.getCluster().copy();
    Locations locations = getClusterLocations("staging2", null);
    cluster.setLocations(locations);
    Mockito.doNothing().when(clusterEntityParser).validateWorkflowInterface(cluster);
    Mockito.doNothing().when(clusterEntityParser).validateMessagingInterface(cluster);
    Mockito.doNothing().when(clusterEntityParser).validateRegistryInterface(cluster);
    String stagingPath = ClusterHelper.getLocation(cluster, ClusterLocationType.STAGING).getPath();
    this.dfsCluster.getFileSystem().mkdirs(new Path(stagingPath), HadoopClientFactory.ALL_PERMISSION);
    clusterEntityParser.validate(cluster);
    String workingDirPath = cluster.getLocations().getLocations().get(0).getPath() + "/working";
    Assert.assertEquals(ClusterHelper.getLocation(cluster, ClusterLocationType.WORKING).getPath(),
            workingDirPath);
    FileStatus workingDirStatus = this.dfsCluster.getFileSystem().getFileLinkStatus(new Path(workingDirPath));
    Assert.assertTrue(workingDirStatus.isDirectory());
    Assert.assertEquals(workingDirStatus.getPermission(), HadoopClientFactory.READ_EXECUTE_PERMISSION);
    Assert.assertEquals(workingDirStatus.getOwner(), UserGroupInformation.getLoginUser().getShortUserName());

    FileStatus emptyDirStatus = this.dfsCluster.getFileSystem()
            .getFileStatus(new Path(stagingPath + "/" + ClusterHelper.EMPTY_DIR_NAME));
    Assert.assertEquals(emptyDirStatus.getPermission(), HadoopClientFactory.READ_ONLY_PERMISSION);
    Assert.assertEquals(emptyDirStatus.getOwner(), UserGroupInformation.getLoginUser().getShortUserName());

    String stagingSubdirFeed = cluster.getLocations().getLocations().get(0).getPath()
            + "/falcon/workflows/feed";
    String stagingSubdirProcess = cluster.getLocations().getLocations().get(0).getPath()
            + "/falcon/workflows/process";
    FileStatus stagingSubdirFeedStatus = this.dfsCluster.getFileSystem()
            .getFileLinkStatus(new Path(stagingSubdirFeed));
    FileStatus stagingSubdirProcessStatus = this.dfsCluster.getFileSystem()
            .getFileLinkStatus(new Path(stagingSubdirProcess));
    Assert.assertTrue(stagingSubdirFeedStatus.isDirectory());
    Assert.assertEquals(stagingSubdirFeedStatus.getPermission(), HadoopClientFactory.ALL_PERMISSION);
    Assert.assertTrue(stagingSubdirProcessStatus.isDirectory());
    Assert.assertEquals(stagingSubdirProcessStatus.getPermission(), HadoopClientFactory.ALL_PERMISSION);
}