List of usage examples for org.apache.hadoop.fs.FileStatus.isDirectory()
public boolean isDirectory()
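A minimal standalone sketch of the call, not taken from any of the projects below: it lists a directory with the standard Hadoop FileSystem API and branches on isDirectory() for each entry. The path /tmp/data is only a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
  public static void main(String[] args) throws Exception {
    // Obtain the default FileSystem from the Hadoop configuration.
    FileSystem fs = FileSystem.get(new Configuration());
    // Placeholder path; replace with a directory that exists on your file system.
    Path dir = new Path("/tmp/data");
    for (FileStatus status : fs.listStatus(dir)) {
      if (status.isDirectory()) {
        System.out.println("dir:  " + status.getPath());
      } else {
        System.out.println("file: " + status.getPath());
      }
    }
  }
}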
From source file: org.apache.drill.exec.store.dfs.FileSelection.java
License: Apache License
public boolean containsDirectories(DrillFileSystem fs) throws IOException {
  if (dirStatus == StatusType.NOT_CHECKED) {
    dirStatus = StatusType.NO_DIRS;
    for (final FileStatus status : getStatuses(fs)) {
      if (status.isDirectory()) {
        dirStatus = StatusType.HAS_DIRS;
        break;
      }
    }
  }
  return dirStatus == StatusType.HAS_DIRS;
}
From source file: org.apache.drill.exec.store.dfs.FileSelection.java
License: Apache License
public FileSelection minusDirectories(DrillFileSystem fs) throws IOException {
  if (isExpandedFully()) {
    return this;
  }
  Stopwatch timer = Stopwatch.createStarted();
  final List<FileStatus> statuses = getStatuses(fs);
  final int total = statuses.size();
  final Path[] paths = new Path[total];
  for (int i = 0; i < total; i++) {
    paths[i] = statuses.get(i).getPath();
  }
  final List<FileStatus> allStats = fs.list(true, paths);
  final List<FileStatus> nonDirectories = Lists.newArrayList(
      Iterables.filter(allStats, new Predicate<FileStatus>() {
        @Override
        public boolean apply(@Nullable FileStatus status) {
          return !status.isDirectory();
        }
      }));
  final FileSelection fileSel = create(nonDirectories, null, selectionRoot);
  logger.debug("FileSelection.minusDirectories() took {} ms, numFiles: {}",
      timer.elapsed(TimeUnit.MILLISECONDS), total);
  // fileSel will be null if we query an empty folder
  if (fileSel != null) {
    fileSel.setExpandedFully();
  }
  return fileSel;
}
From source file: org.apache.drill.exec.store.parquet.FooterGatherer.java
License: Apache License
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism)
    throws IOException {
  final List<TimedRunnable<Footer>> readers = Lists.newArrayList();
  List<Footer> foundFooters = Lists.newArrayList();
  for (FileStatus status : statuses) {
    if (status.isDirectory()) {
      // first we check for summary file.
      FileSystem fs = status.getPath().getFileSystem(conf);
      final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      if (fs.exists(summaryPath)) {
        FileStatus summaryStatus = fs.getFileStatus(summaryPath);
        foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
        continue;
      }
      // else we handle as normal file.
      for (FileStatus inStatus : fs.listStatus(status.getPath(), new DrillPathFilter())) {
        readers.add(new FooterReader(conf, inStatus));
      }
    } else {
      readers.add(new FooterReader(conf, status));
    }
  }
  if (!readers.isEmpty()) {
    foundFooters.addAll(TimedRunnable.run("Fetch Parquet Footers", logger, readers, parallelism));
  }
  return foundFooters;
}
From source file: org.apache.drill.exec.store.parquet.metadata.Metadata.java
License: Apache License
/**
 * Create the parquet metadata files for the directory at the given path and for any subdirectories.
 * Metadata cache files written to the disk contain relative paths. Returned Pair of metadata contains absolute paths.
 *
 * @param path to the directory of the parquet table
 * @param fs file system
 * @param allColumns if set, store column metadata for all the columns
 * @param columnSet Set of columns for which column metadata has to be stored
 * @return Pair of parquet metadata. The left one is a parquet metadata for the table. The right one of the Pair is
 *         a metadata for all subdirectories (if they are present and there are no parquet files in the
 *         {@code path} directory).
 * @throws IOException if parquet metadata can't be serialized and written to the json file
 */
private Pair<ParquetTableMetadata_v3, ParquetTableMetadataDirs> createMetaFilesRecursively(final Path path,
    FileSystem fs, boolean allColumns, Set<String> columnSet) throws IOException {
  Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
  List<ParquetFileMetadata_v3> metaDataList = Lists.newArrayList();
  List<Path> directoryList = Lists.newArrayList();
  ConcurrentHashMap<ColumnTypeMetadata_v3.Key, ColumnTypeMetadata_v3> columnTypeInfoSet = new ConcurrentHashMap<>();
  Path p = path;
  FileStatus fileStatus = fs.getFileStatus(p);
  assert fileStatus.isDirectory() : "Expected directory";

  final Map<FileStatus, FileSystem> childFiles = new LinkedHashMap<>();

  for (final FileStatus file : DrillFileSystemUtil.listAll(fs, p, false)) {
    if (file.isDirectory()) {
      ParquetTableMetadata_v3 subTableMetadata =
          createMetaFilesRecursively(file.getPath(), fs, allColumns, columnSet).getLeft();
      metaDataList.addAll(subTableMetadata.files);
      directoryList.addAll(subTableMetadata.directories);
      directoryList.add(file.getPath());
      // Merge the schema from the child level into the current level
      // TODO: We need a merge method that merges two columns with the same name but different types
      columnTypeInfoSet.putAll(subTableMetadata.columnTypeInfo);
    } else {
      childFiles.put(file, fs);
    }
  }
  ParquetTableMetadata_v3 parquetTableMetadata = new ParquetTableMetadata_v3(
      SUPPORTED_VERSIONS.last().toString(), DrillVersionInfo.getVersion());
  if (childFiles.size() > 0) {
    List<ParquetFileMetadata_v3> childFilesMetadata =
        getParquetFileMetadata_v3(parquetTableMetadata, childFiles, allColumns, columnSet);
    metaDataList.addAll(childFilesMetadata);
    // Note that we do not need to merge the columnInfo at this point. The columnInfo is already added
    // to the parquetTableMetadata.
  }

  parquetTableMetadata.directories = directoryList;
  parquetTableMetadata.files = metaDataList;
  // TODO: We need a merge method that merges two columns with the same name but different types
  if (parquetTableMetadata.columnTypeInfo == null) {
    parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
  }
  parquetTableMetadata.columnTypeInfo.putAll(columnTypeInfoSet);

  for (String oldName : OLD_METADATA_FILENAMES) {
    fs.delete(new Path(p, oldName), false);
  }
  // relative paths in the metadata are only necessary for meta cache files.
  ParquetTableMetadata_v3 metadataTableWithRelativePaths =
      MetadataPathUtils.createMetadataWithRelativePaths(parquetTableMetadata, path);
  writeFile(metadataTableWithRelativePaths, new Path(p, METADATA_FILENAME), fs);

  if (directoryList.size() > 0 && childFiles.size() == 0) {
    ParquetTableMetadataDirs parquetTableMetadataDirsRelativePaths =
        new ParquetTableMetadataDirs(metadataTableWithRelativePaths.directories);
    writeFile(parquetTableMetadataDirsRelativePaths, new Path(p, METADATA_DIRECTORIES_FILENAME), fs);
    if (timer != null) {
      logger.debug("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    ParquetTableMetadataDirs parquetTableMetadataDirs = new ParquetTableMetadataDirs(directoryList);
    return Pair.of(parquetTableMetadata, parquetTableMetadataDirs);
  }
  List<Path> emptyDirList = new ArrayList<>();
  if (timer != null) {
    logger.debug("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
    timer.stop();
  }
  return Pair.of(parquetTableMetadata, new ParquetTableMetadataDirs(emptyDirList));
}
From source file: org.apache.drill.exec.store.parquet.Metadata.java
License: Apache License
/**
 * Create the parquet metadata file for the directory at the given path, and for any subdirectories
 *
 * @param path
 * @throws IOException
 */
private Pair<ParquetTableMetadata_v2, ParquetTableMetadataDirs> createMetaFilesRecursively(final String path)
    throws IOException {
  Stopwatch timer = Stopwatch.createStarted();
  List<ParquetFileMetadata_v2> metaDataList = Lists.newArrayList();
  List<String> directoryList = Lists.newArrayList();
  ConcurrentHashMap<ColumnTypeMetadata_v2.Key, ColumnTypeMetadata_v2> columnTypeInfoSet = new ConcurrentHashMap<>();
  Path p = new Path(path);
  FileStatus fileStatus = fs.getFileStatus(p);
  assert fileStatus.isDirectory() : "Expected directory";

  final List<FileStatus> childFiles = Lists.newArrayList();

  for (final FileStatus file : fs.listStatus(p, new DrillPathFilter())) {
    if (file.isDirectory()) {
      ParquetTableMetadata_v2 subTableMetadata =
          createMetaFilesRecursively(file.getPath().toString()).getLeft();
      metaDataList.addAll(subTableMetadata.files);
      directoryList.addAll(subTableMetadata.directories);
      directoryList.add(file.getPath().toString());
      // Merge the schema from the child level into the current level
      // TODO: We need a merge method that merges two columns with the same name but different types
      columnTypeInfoSet.putAll(subTableMetadata.columnTypeInfo);
    } else {
      childFiles.add(file);
    }
  }
  ParquetTableMetadata_v2 parquetTableMetadata = new ParquetTableMetadata_v2();
  if (childFiles.size() > 0) {
    List<ParquetFileMetadata_v2> childFilesMetadata =
        getParquetFileMetadata_v2(parquetTableMetadata, childFiles);
    metaDataList.addAll(childFilesMetadata);
    // Note that we do not need to merge the columnInfo at this point. The columnInfo is already added
    // to the parquetTableMetadata.
  }
  parquetTableMetadata.directories = directoryList;
  parquetTableMetadata.files = metaDataList;
  // TODO: We need a merge method that merges two columns with the same name but different types
  if (parquetTableMetadata.columnTypeInfo == null) {
    parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
  }
  parquetTableMetadata.columnTypeInfo.putAll(columnTypeInfoSet);

  for (String oldName : OLD_METADATA_FILENAMES) {
    fs.delete(new Path(p, oldName), false);
  }
  writeFile(parquetTableMetadata, new Path(p, METADATA_FILENAME));

  if (directoryList.size() > 0 && childFiles.size() == 0) {
    ParquetTableMetadataDirs parquetTableMetadataDirs = new ParquetTableMetadataDirs(directoryList);
    writeFile(parquetTableMetadataDirs, new Path(p, METADATA_DIRECTORIES_FILENAME));
    logger.info("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
    timer.stop();
    return Pair.of(parquetTableMetadata, parquetTableMetadataDirs);
  }
  List<String> emptyDirList = Lists.newArrayList();
  logger.info("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
  timer.stop();
  return Pair.of(parquetTableMetadata, new ParquetTableMetadataDirs(emptyDirList));
}
From source file: org.apache.drill.exec.store.parquet.Metadata.java
License: Apache License
/**
 * Recursively get a list of files
 *
 * @param fileStatus
 * @return
 * @throws IOException
 */
private List<FileStatus> getFileStatuses(FileStatus fileStatus) throws IOException {
  List<FileStatus> statuses = Lists.newArrayList();
  if (fileStatus.isDirectory()) {
    for (FileStatus child : fs.listStatus(fileStatus.getPath(), new DrillPathFilter())) {
      statuses.addAll(getFileStatuses(child));
    }
  } else {
    statuses.add(fileStatus);
  }
  return statuses;
}
From source file: org.apache.drill.exec.store.parquet.ParquetGroupScan.java
License: Apache License
/**
 * Create and return a new file selection based on reading the metadata cache file.
 *
 * This function also initializes a few of ParquetGroupScan's fields as appropriate.
 *
 * @param selection initial file selection
 * @param metaFilePath metadata cache file path
 * @return file selection read from cache
 *
 * @throws IOException
 * @throws UserException when the updated selection is empty, this happens if the user selects an empty folder.
 */
private FileSelection initFromMetadataCache(FileSelection selection, Path metaFilePath) throws IOException {
  // get the metadata for the root directory by reading the metadata file
  // parquetTableMetadata contains the metadata for all files in the selection root folder, but we need to make sure
  // we only select the files that are part of selection (by setting fileSet appropriately)

  // get (and set internal field) the metadata for the directory by reading the metadata file
  this.parquetTableMetadata = Metadata.readBlockMeta(fs, metaFilePath.toString(), selection.getMetaContext());
  List<FileStatus> fileStatuses = selection.getStatuses(fs);

  if (fileSet == null) {
    fileSet = Sets.newHashSet();
  }

  final Path first = fileStatuses.get(0).getPath();
  if (fileStatuses.size() == 1 && selection.getSelectionRoot().equals(first.toString())) {
    // we are selecting all files from selection root. Expand the file list from the cache
    for (Metadata.ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
      fileSet.add(file.getPath());
    }
  } else if (selection.isExpandedPartial() && !selection.hadWildcard() && cacheFileRoot != null) {
    if (selection.wasAllPartitionsPruned()) {
      // if all partitions were previously pruned, we only need to read 1 file (for the schema)
      fileSet.add(this.parquetTableMetadata.getFiles().get(0).getPath());
    } else {
      // we are here if the selection is in the expanded_partial state (i.e it has directories). We get the
      // list of files from the metadata cache file that is present in the cacheFileRoot directory and populate
      // the fileSet. However, this is *not* the final list of files that will be scanned in execution since the
      // second phase of partition pruning will apply on the files and modify the file selection appropriately.
      for (Metadata.ParquetFileMetadata file : this.parquetTableMetadata.getFiles()) {
        fileSet.add(file.getPath());
      }
    }
  } else {
    // we need to expand the files from fileStatuses
    for (FileStatus status : fileStatuses) {
      if (status.isDirectory()) {
        //TODO [DRILL-4496] read the metadata cache files in parallel
        final Path metaPath = new Path(status.getPath(), Metadata.METADATA_FILENAME);
        final Metadata.ParquetTableMetadataBase metadata =
            Metadata.readBlockMeta(fs, metaPath.toString(), selection.getMetaContext());
        for (Metadata.ParquetFileMetadata file : metadata.getFiles()) {
          fileSet.add(file.getPath());
        }
      } else {
        final Path path = Path.getPathWithoutSchemeAndAuthority(status.getPath());
        fileSet.add(path.toString());
      }
    }
  }

  if (fileSet.isEmpty()) {
    // no files were found, most likely we tried to query some empty sub folders
    throw UserException.validationError().message("The table you tried to query is empty").build(logger);
  }

  List<String> fileNames = Lists.newArrayList(fileSet);

  // when creating the file selection, set the selection root without the URI prefix
  // The reason is that the file names above have been created in the form
  // /a/b/c.parquet and the format of the selection root must match that of the file names
  // otherwise downstream operations such as partition pruning can break.
  final Path metaRootPath = Path.getPathWithoutSchemeAndAuthority(new Path(selection.getSelectionRoot()));
  this.selectionRoot = metaRootPath.toString();

  // Use the FileSelection constructor directly here instead of the FileSelection.create() method
  // because create() changes the root to include the scheme and authority; In future, if create()
  // is the preferred way to instantiate a file selection, we may need to do something different...
  // WARNING: file statuses and file names are inconsistent
  FileSelection newSelection = new FileSelection(selection.getStatuses(fs), fileNames, metaRootPath.toString(),
      cacheFileRoot, selection.wasAllPartitionsPruned());

  newSelection.setExpandedFully();
  newSelection.setMetaContext(selection.getMetaContext());
  return newSelection;
}
From source file: org.apache.drill.exec.store.parquet.ParquetGroupScan.java
License: Apache License
private void getFiles(String path, List<FileStatus> fileStatuses) throws IOException {
  Path p = Path.getPathWithoutSchemeAndAuthority(new Path(path));
  FileStatus fileStatus = fs.getFileStatus(p);
  if (fileStatus.isDirectory()) {
    for (FileStatus f : fs.listStatus(p, new DrillPathFilter())) {
      getFiles(f.getPath().toString(), fileStatuses);
    }
  } else {
    fileStatuses.add(fileStatus);
  }
}
From source file: org.apache.drill.exec.util.FileSystemUtil.java
License: Apache License
/**
 * Checks if file status is applicable based on file system object {@link Scope}.
 *
 * @param status file status
 * @param scope file system objects scope
 * @return true if status is applicable, false otherwise
 */
private static boolean isStatusApplicable(FileStatus status, Scope scope) {
  switch (scope) {
    case DIRECTORIES:
      return status.isDirectory();
    case FILES:
      return status.isFile();
    case ALL:
      return true;
    default:
      return false;
  }
}
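A hypothetical caller sketch, not part of FileSystemUtil: it applies the same check as the Scope.DIRECTORIES branch above while listing a directory. The helper name listDirectories and its signature are assumptions for illustration only.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper: returns only the sub-directories of the given path,
// mirroring the Scope.DIRECTORIES case above.
public static List<FileStatus> listDirectories(FileSystem fs, Path path) throws IOException {
  List<FileStatus> directoriesOnly = new ArrayList<>();
  for (FileStatus status : fs.listStatus(path)) {
    if (status.isDirectory()) {
      directoriesOnly.add(status);
    }
  }
  return directoriesOnly;
}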
From source file: org.apache.falcon.entity.parser.ClusterEntityParserTest.java
License: Apache License
/**
 * A lightweight unit test for a cluster where location type working is missing.
 * It should automatically get generated.
 * Extensive tests are found in ClusterEntityValidationIT.
 */
@Test
public void testClusterWithOnlyStaging() throws Exception {
  ClusterEntityParser clusterEntityParser = Mockito
      .spy((ClusterEntityParser) EntityParserFactory.getParser(EntityType.CLUSTER));
  Cluster cluster = (Cluster) this.dfsCluster.getCluster().copy();
  Locations locations = getClusterLocations("staging2", null);
  cluster.setLocations(locations);

  Mockito.doNothing().when(clusterEntityParser).validateWorkflowInterface(cluster);
  Mockito.doNothing().when(clusterEntityParser).validateMessagingInterface(cluster);
  Mockito.doNothing().when(clusterEntityParser).validateRegistryInterface(cluster);

  String stagingPath = ClusterHelper.getLocation(cluster, ClusterLocationType.STAGING).getPath();
  this.dfsCluster.getFileSystem().mkdirs(new Path(stagingPath), HadoopClientFactory.ALL_PERMISSION);
  clusterEntityParser.validate(cluster);

  String workingDirPath = cluster.getLocations().getLocations().get(0).getPath() + "/working";
  Assert.assertEquals(ClusterHelper.getLocation(cluster, ClusterLocationType.WORKING).getPath(), workingDirPath);

  FileStatus workingDirStatus = this.dfsCluster.getFileSystem().getFileLinkStatus(new Path(workingDirPath));
  Assert.assertTrue(workingDirStatus.isDirectory());
  Assert.assertEquals(workingDirStatus.getPermission(), HadoopClientFactory.READ_EXECUTE_PERMISSION);
  Assert.assertEquals(workingDirStatus.getOwner(), UserGroupInformation.getLoginUser().getShortUserName());

  FileStatus emptyDirStatus = this.dfsCluster.getFileSystem()
      .getFileStatus(new Path(stagingPath + "/" + ClusterHelper.EMPTY_DIR_NAME));
  Assert.assertEquals(emptyDirStatus.getPermission(), HadoopClientFactory.READ_ONLY_PERMISSION);
  Assert.assertEquals(emptyDirStatus.getOwner(), UserGroupInformation.getLoginUser().getShortUserName());

  String stagingSubdirFeed = cluster.getLocations().getLocations().get(0).getPath() + "/falcon/workflows/feed";
  String stagingSubdirProcess = cluster.getLocations().getLocations().get(0).getPath() + "/falcon/workflows/process";
  FileStatus stagingSubdirFeedStatus = this.dfsCluster.getFileSystem()
      .getFileLinkStatus(new Path(stagingSubdirFeed));
  FileStatus stagingSubdirProcessStatus = this.dfsCluster.getFileSystem()
      .getFileLinkStatus(new Path(stagingSubdirProcess));

  Assert.assertTrue(stagingSubdirFeedStatus.isDirectory());
  Assert.assertEquals(stagingSubdirFeedStatus.getPermission(), HadoopClientFactory.ALL_PERMISSION);
  Assert.assertTrue(stagingSubdirProcessStatus.isDirectory());
  Assert.assertEquals(stagingSubdirProcessStatus.getPermission(), HadoopClientFactory.ALL_PERMISSION);
}