List of usage examples for org.apache.hadoop.fs Path getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
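Before the project examples below, here is a minimal, self-contained sketch of the call itself. Path.getFileSystem(Configuration) asks Hadoop to resolve the FileSystem implementation that owns the path's scheme and authority (HDFS, local, S3A, and so on) from the given Configuration. The path /tmp/example.txt used here is only a placeholder for illustration.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        // core-site.xml/hdfs-site.xml settings on the classpath decide which
        // FileSystem implementation backs each URI scheme.
        Configuration conf = new Configuration();

        // Placeholder path; with no scheme it resolves against fs.defaultFS.
        Path path = new Path("/tmp/example.txt");

        // Resolve the FileSystem that owns this path.
        FileSystem fs = path.getFileSystem(conf);

        // Use the resolved FileSystem for metadata or I/O on the path.
        if (fs.exists(path)) {
            FileStatus status = fs.getFileStatus(path);
            System.out.println(path + " has length " + status.getLen());
        }
    }
}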
From source file: com.cloudera.fts.spark.format.RawFileRecordReader.java
License: Apache License

@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    FileSplit split = (FileSplit) inputSplit;
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    fileIn = fs.open(path);
    key = new Text(path.toString());
    finished = false;
}

From source file: com.cloudera.hive.scd.SQLUpdater.java
License: Open Source License

private List<String> loadUpdateStatements(InputSplit split, JobConf jc) throws IOException {
    long currentSCDTime = asSCDTime(jc.get("scd.time", ""), System.currentTimeMillis());
    List<String> stmts = Lists.newArrayList();
    if (split instanceof FileSplit) {
        Path base = ((FileSplit) split).getPath();
        FileSystem fs = base.getFileSystem(jc);
        Path updates = new Path(base.getParent(), ".updates");
        if (fs.exists(updates)) {
            stmts.addAll(readLines(fs, updates, currentSCDTime));
        }
    }
    return stmts;
}

From source file: com.cloudera.impala.analysis.CreateTableLikeFileStmt.java
License: Apache License

/**
 * Reads the first block from the given HDFS file and returns the Parquet schema.
 * Throws an AnalysisException for any failure, such as failing to read the file
 * or failing to parse the contents.
 */
private static parquet.schema.MessageType loadParquetSchema(Path pathToFile) throws AnalysisException {
    try {
        FileSystem fs = pathToFile.getFileSystem(FileSystemUtil.getConfiguration());
        if (!fs.isFile(pathToFile)) {
            throw new AnalysisException("Cannot infer schema, path is not a file: " + pathToFile);
        }
    } catch (IOException e) {
        throw new AnalysisException("Failed to connect to HDFS: " + e);
    }
    ParquetMetadata readFooter = null;
    try {
        readFooter = ParquetFileReader.readFooter(FileSystemUtil.getConfiguration(), pathToFile);
    } catch (FileNotFoundException e) {
        throw new AnalysisException("File not found: " + e);
    } catch (IOException e) {
        throw new AnalysisException("Failed to open HDFS file as a parquet file: " + e);
    } catch (RuntimeException e) {
        // Parquet throws a generic RuntimeException when reading a non-parquet file.
        if (e.toString().contains("is not a Parquet file")) {
            throw new AnalysisException("File is not a parquet file: " + pathToFile);
        }
        // Otherwise, rethrow whatever was caught.
        throw e;
    }
    return readFooter.getFileMetaData().getSchema();
}

From source file: com.cloudera.impala.analysis.LoadDataStmt.java
License: Apache License

private void analyzePaths(Analyzer analyzer, HdfsTable hdfsTable) throws AnalysisException {
    // The user must have permission to access the source location. Since the files will
    // be moved from this location, the user needs to have all permission.
    sourceDataPath_.analyze(analyzer, Privilege.ALL);
    try {
        Path source = sourceDataPath_.getPath();
        FileSystem fs = source.getFileSystem(FileSystemUtil.getConfiguration());
        // sourceDataPath_.analyze() ensured that path is on an HDFS filesystem.
        Preconditions.checkState(fs instanceof DistributedFileSystem);
        DistributedFileSystem dfs = (DistributedFileSystem) fs;
        if (!dfs.exists(source)) {
            throw new AnalysisException(
                    String.format("INPATH location '%s' does not exist.", sourceDataPath_));
        }
        if (dfs.isDirectory(source)) {
            if (FileSystemUtil.getTotalNumVisibleFiles(source) == 0) {
                throw new AnalysisException(
                        String.format("INPATH location '%s' contains no visible files.", sourceDataPath_));
            }
            if (FileSystemUtil.containsSubdirectory(source)) {
                throw new AnalysisException(
                        String.format("INPATH location '%s' cannot contain subdirectories.", sourceDataPath_));
            }
        } else {
            // INPATH points to a file.
            if (FileSystemUtil.isHiddenFile(source.getName())) {
                throw new AnalysisException(
                        String.format("INPATH location '%s' points to a hidden file.", source));
            }
        }

        String noWriteAccessErrorMsg = String.format("Unable to LOAD DATA into "
                + "target table (%s) because Impala does not have WRITE access to HDFS "
                + "location: ", hdfsTable.getFullName());

        HdfsPartition partition;
        String location;
        if (partitionSpec_ != null) {
            partition = hdfsTable.getPartition(partitionSpec_.getPartitionSpecKeyValues());
            location = partition.getLocation();
            if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
                throw new AnalysisException(noWriteAccessErrorMsg + partition.getLocation());
            }
        } else {
            // "default" partition
            partition = hdfsTable.getPartitions().get(0);
            location = hdfsTable.getLocation();
            if (!hdfsTable.hasWriteAccess()) {
                throw new AnalysisException(noWriteAccessErrorMsg + hdfsTable.getLocation());
            }
        }
        Preconditions.checkNotNull(partition);

        // Until Frontend.loadTableData() can handle cross-filesystem and filesystems
        // that aren't HDFS, require that source and dest are on the same HDFS.
        if (!FileSystemUtil.isPathOnFileSystem(new Path(location), fs)) {
            throw new AnalysisException(String.format(
                    "Unable to LOAD DATA into target table (%s) because source path (%s) and "
                            + "destination %s (%s) are on different file-systems.",
                    hdfsTable.getFullName(), source,
                    partitionSpec_ == null ? "table" : "partition", partition.getLocation()));
        }

        // Verify the files being loaded are supported.
        for (FileStatus fStatus : fs.listStatus(source)) {
            if (fs.isDirectory(fStatus.getPath())) continue;
            StringBuilder errorMsg = new StringBuilder();
            HdfsFileFormat fileFormat = partition.getInputFormatDescriptor().getFileFormat();
            if (!fileFormat.isFileCompressionTypeSupported(fStatus.getPath().toString(), errorMsg)) {
                throw new AnalysisException(errorMsg.toString());
            }
        }
    } catch (FileNotFoundException e) {
        throw new AnalysisException("File not found: " + e.getMessage(), e);
    } catch (IOException e) {
        throw new AnalysisException("Error accessing file system: " + e.getMessage(), e);
    }
}

From source file: com.cloudera.impala.catalog.HBaseTable.java
License: Apache License

/**
 * Returns the Hdfs size of the given region in bytes. NULL can be
 * passed as a parameter to retrieve the size of the complete table.
 */
public long getHdfsSize(HRegionInfo info) throws IOException {
    Path tableDir = HTableDescriptor.getTableDir(FSUtils.getRootDir(hbaseConf_),
            Bytes.toBytes(hbaseTableName_));
    FileSystem fs = tableDir.getFileSystem(hbaseConf_);
    if (info != null) {
        Path regionDir = tableDir.suffix("/" + info.getEncodedName());
        return fs.getContentSummary(regionDir).getLength();
    } else {
        return fs.getContentSummary(tableDir).getLength();
    }
}

From source file: com.cloudera.impala.catalog.HdfsTable.java
License: Apache License

/**
 * Create HdfsPartition objects corresponding to 'partitions'.
 *
 * If there are no partitions in the Hive metadata, a single partition is added with no
 * partition keys.
 *
 * For files that have not been changed, reuses file descriptors from oldFileDescMap.
 *
 * TODO: If any partition fails to load, the entire table will fail to load. Instead,
 * we should consider skipping partitions that cannot be loaded and raise a warning
 * whenever the table is accessed.
 */
private void loadPartitions(List<org.apache.hadoop.hive.metastore.api.Partition> msPartitions,
        org.apache.hadoop.hive.metastore.api.Table msTbl,
        Map<String, List<FileDescriptor>> oldFileDescMap) throws IOException, CatalogException {
    resetPartitionMd();
    partitions_.clear();
    hdfsBaseDir_ = msTbl.getSd().getLocation();

    // Map of filesystem to parent path to a list of new/modified
    // FileDescriptors. FileDescriptors in this Map will have their block location
    // information (re)loaded. This is used to speed up the incremental refresh of a
    // table's metadata by skipping unmodified, previously loaded FileDescriptors.
    Map<FsKey, Map<String, List<FileDescriptor>>> fileDescsToLoad = Maps.newHashMap();

    // INSERT statements need to refer to this if they try to write to new partitions.
    // Scans don't refer to this because by definition all partitions they refer to exist.
    addDefaultPartition(msTbl.getSd());
    Long cacheDirectiveId = HdfsCachingUtil.getCacheDirIdFromParams(msTbl.getParameters());
    isMarkedCached_ = cacheDirectiveId != null;

    if (msTbl.getPartitionKeysSize() == 0) {
        Preconditions.checkArgument(msPartitions == null || msPartitions.isEmpty());
        // This table has no partition key, which means it has no declared partitions.
        // We model partitions slightly differently to Hive - every file must exist in a
        // partition, so add a single partition with no keys which will get all the
        // files in the table's root directory.
        HdfsPartition part = createPartition(msTbl.getSd(), null, oldFileDescMap, fileDescsToLoad);
        addPartition(part);
        if (isMarkedCached_) part.markCached();
        Path location = new Path(hdfsBaseDir_);
        FileSystem fs = location.getFileSystem(CONF);
        if (fs.exists(location)) {
            accessLevel_ = getAvailableAccessLevel(fs, location);
        }
    } else {
        for (org.apache.hadoop.hive.metastore.api.Partition msPartition : msPartitions) {
            HdfsPartition partition = createPartition(msPartition.getSd(), msPartition,
                    oldFileDescMap, fileDescsToLoad);
            addPartition(partition);
            // If the partition is null, its HDFS path does not exist, and it was not added
            // to this table's partition list. Skip the partition.
            if (partition == null) continue;
            if (msPartition.getParameters() != null) {
                partition.setNumRows(getRowCount(msPartition.getParameters()));
            }
            if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
                // TODO: READ_ONLY isn't exactly correct because it's possible the
                // partition does not have READ permissions either. When we start checking
                // whether we can READ from a table, this should be updated to set the
                // table's access level to the "lowest" effective level across all
                // partitions. That is, if one partition has READ_ONLY and another has
                // WRITE_ONLY the table's access level should be NONE.
                accessLevel_ = TAccessLevel.READ_ONLY;
            }
        }
    }
    loadBlockMd(fileDescsToLoad);
}

From source file: com.cloudera.impala.catalog.HdfsTable.java
License: Apache License

/**
 * Creates a new HdfsPartition object to be added to the internal partition list.
 * Populates with file format information and file locations. Partitions may be empty,
 * or may not even exist on the file system (a partition's location may have been
 * changed to a new path that is about to be created by an INSERT). For unchanged
 * files (indicated by unchanged mtime), reuses the FileDescriptor from the
 * oldFileDescMap. The one exception is if the partition is marked as cached,
 * in which case the block metadata cannot be reused. Otherwise, creates a new
 * FileDescriptor for each modified or new file and adds it to newFileDescMap.
 * Both old and newFileDescMap are Maps of parent directory (partition location)
 * to list of files (FileDescriptors) under that directory.
 * Returns the new partition if successful or null if none was added.
 * Separated from addPartition to reduce the number of operations done
 * while holding the lock on the hdfs table.
 *
 * @throws CatalogException
 *           if the supplied storage descriptor contains metadata that Impala can't
 *           understand.
 */
private HdfsPartition createPartition(StorageDescriptor storageDescriptor,
        org.apache.hadoop.hive.metastore.api.Partition msPartition,
        Map<String, List<FileDescriptor>> oldFileDescMap,
        Map<FsKey, Map<String, List<FileDescriptor>>> perFsFileDescMap) throws CatalogException {
    HdfsStorageDescriptor fileFormatDescriptor =
            HdfsStorageDescriptor.fromStorageDescriptor(this.name_, storageDescriptor);
    Path partDirPath = new Path(storageDescriptor.getLocation());
    List<FileDescriptor> fileDescriptors = Lists.newArrayList();
    // If the partition is marked as cached, the block location metadata must be
    // reloaded, even if the file times have not changed.
    boolean isMarkedCached = isMarkedCached_;
    List<LiteralExpr> keyValues = Lists.newArrayList();
    if (msPartition != null) {
        isMarkedCached = HdfsCachingUtil.getCacheDirIdFromParams(msPartition.getParameters()) != null;
        // Load key values.
        for (String partitionKey : msPartition.getValues()) {
            Type type = getColumns().get(keyValues.size()).getType();
            // Deal with Hive's special NULL partition key.
            if (partitionKey.equals(nullPartitionKeyValue_)) {
                keyValues.add(NullLiteral.create(type));
            } else {
                try {
                    keyValues.add(LiteralExpr.create(partitionKey, type));
                } catch (Exception ex) {
                    LOG.warn("Failed to create literal expression of type: " + type, ex);
                    throw new CatalogException("Invalid partition key value of type: " + type, ex);
                }
            }
        }
        try {
            Expr.analyze(keyValues, null);
        } catch (AnalysisException e) {
            // Should never happen.
            throw new IllegalStateException(e);
        }
    }
    try {
        // Each partition could reside on a different filesystem.
        FileSystem fs = partDirPath.getFileSystem(CONF);
        multipleFileSystems_ = multipleFileSystems_
                || !FileSystemUtil.isPathOnFileSystem(new Path(getLocation()), fs);
        if (fs.exists(partDirPath)) {
            // FileSystem does not have an API that takes in a timestamp and returns a list
            // of files that have been added/changed since. Therefore, we are calling
            // fs.listStatus() to list all the files.
            for (FileStatus fileStatus : fs.listStatus(partDirPath)) {
                String fileName = fileStatus.getPath().getName().toString();
                if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName)
                        || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) {
                    // Ignore directories, hidden files starting with . or _, and LZO index files.
                    // If a directory is erroneously created as a subdirectory of a partition dir,
                    // we should ignore it and move on. Hive will not recurse into directories.
                    // Skip index files, these are read by the LZO scanner directly.
                    continue;
                }
                String partitionDir = fileStatus.getPath().getParent().toString();
                FileDescriptor fd = null;
                // Search for a FileDescriptor with the same partition dir and file name. If one
                // is found, it will be chosen as a candidate to reuse.
                if (oldFileDescMap != null && oldFileDescMap.get(partitionDir) != null) {
                    for (FileDescriptor oldFileDesc : oldFileDescMap.get(partitionDir)) {
                        if (oldFileDesc.getFileName().equals(fileName)) {
                            fd = oldFileDesc;
                            break;
                        }
                    }
                }
                // Check if this FileDescriptor has been modified since last loading its block
                // location information. If it has not been changed, the previously loaded
                // value can be reused.
                if (fd == null || isMarkedCached || fd.getFileLength() != fileStatus.getLen()
                        || fd.getModificationTime() != fileStatus.getModificationTime()) {
                    // Create a new file descriptor; the block metadata will be populated by
                    // loadBlockMd.
                    fd = new FileDescriptor(fileName, fileStatus.getLen(), fileStatus.getModificationTime());
                    addPerFsFileDesc(perFsFileDescMap, fs, partitionDir, fd);
                }
                List<FileDescriptor> fds = fileDescMap_.get(partitionDir);
                if (fds == null) {
                    fds = Lists.newArrayList();
                    fileDescMap_.put(partitionDir, fds);
                }
                fds.add(fd);
                // Add to the list of FileDescriptors for this partition.
                fileDescriptors.add(fd);
            }
            numHdfsFiles_ += fileDescriptors.size();
        }
        HdfsPartition partition = new HdfsPartition(this, msPartition, keyValues, fileFormatDescriptor,
                fileDescriptors, getAvailableAccessLevel(fs, partDirPath));
        partition.checkWellFormed();
        return partition;
    } catch (Exception e) {
        throw new CatalogException("Failed to create partition: ", e);
    }
}

From source file: com.cloudera.impala.catalog.TestLoadHdfsMetadataPerf.java
License: Apache License

/**
 * List file status by calling fileSystem.listStatus.
 */
private static void listStatus(String dirPath) {
    Path path = new Path(dirPath);
    boolean exceptionThrown = false;
    try {
        FileSystem fs = path.getFileSystem(LoadMetadataUtil.getConf());
        FileStatus[] fileStatus = fs.listStatus(path);
        if (fs.exists(path)) {
            for (FileStatus status : fileStatus) {
                BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());
                for (BlockLocation loc : locations) {
                    loc.getNames();
                    loc.getHosts();
                }
            }
        }
    } catch (IOException e) {
        exceptionThrown = true;
        LOG.error("Failed to list Status", e);
    }
    assertFalse(exceptionThrown);
}

From source file: com.cloudera.impala.catalog.TestLoadHdfsMetadataPerf.java
License: Apache License

/**
 * List file status by calling fileSystem.listLocatedStatus.
 */
private static void listLocatedStatus(String dirPath) {
    Path path = new Path(dirPath);
    boolean exceptionThrown = false;
    try {
        FileSystem fs = path.getFileSystem(LoadMetadataUtil.getConf());
        RemoteIterator<LocatedFileStatus> iterator = fs.listLocatedStatus(path);
        if (fs.exists(path)) {
            while (iterator.hasNext()) {
                LocatedFileStatus fileStatus = iterator.next();
                BlockLocation[] locations = fileStatus.getBlockLocations();
                for (BlockLocation loc : locations) {
                    loc.getHosts();
                    loc.getNames();
                }
            }
        }
    } catch (IOException e) {
        exceptionThrown = true;
        LOG.error("Failed to list Located Status", e);
    }
    assertFalse(exceptionThrown);
}

From source file: com.cloudera.impala.common.FileSystemUtil.java
License: Apache License

/**
 * Performs a non-recursive delete of all visible (non-hidden) files in a given
 * directory. Returns the number of files deleted as part of this operation.
 */
public static int deleteAllVisibleFiles(Path directory) throws IOException {
    FileSystem fs = directory.getFileSystem(CONF);
    Preconditions.checkState(fs.getFileStatus(directory).isDirectory());
    int numFilesDeleted = 0;
    for (FileStatus fStatus : fs.listStatus(directory)) {
        // Only delete files that are not hidden.
        if (fStatus.isFile() && !isHiddenFile(fStatus.getPath().getName())) {
            LOG.debug("Removing: " + fStatus.getPath());
            fs.delete(fStatus.getPath(), false);
            ++numFilesDeleted;
        }
    }
    return numFilesDeleted;
}