List of usage examples for org.apache.hadoop.fs Path getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
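Before the project examples below, here is a minimal, self-contained sketch of the call itself. Path.getFileSystem(Configuration) asks Hadoop to resolve the FileSystem implementation that owns the path's scheme and authority (HDFS, local, S3A, and so on) from the given Configuration. The path /tmp/example.txt used here is only a placeholder for illustration.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        // core-site.xml/hdfs-site.xml settings on the classpath decide which
        // FileSystem implementation backs each URI scheme.
        Configuration conf = new Configuration();

        // Placeholder path; with no scheme it resolves against fs.defaultFS.
        Path path = new Path("/tmp/example.txt");

        // Resolve the FileSystem that owns this path.
        FileSystem fs = path.getFileSystem(conf);

        // Use the resolved FileSystem for metadata or I/O on the path.
        if (fs.exists(path)) {
            FileStatus status = fs.getFileStatus(path);
            System.out.println(path + " has length " + status.getLen());
        }
    }
}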
From source file: com.cloudera.fts.spark.format.RawFileRecordReader.java
License: Apache License

@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    FileSplit split = (FileSplit) inputSplit;
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    fileIn = fs.open(path);
    key = new Text(path.toString());
    finished = false;
}

From source file: com.cloudera.hive.scd.SQLUpdater.java
License: Open Source License

private List<String> loadUpdateStatements(InputSplit split, JobConf jc) throws IOException {
    long currentSCDTime = asSCDTime(jc.get("scd.time", ""), System.currentTimeMillis());
    List<String> stmts = Lists.newArrayList();
    if (split instanceof FileSplit) {
        Path base = ((FileSplit) split).getPath();
        FileSystem fs = base.getFileSystem(jc);
        Path updates = new Path(base.getParent(), ".updates");
        if (fs.exists(updates)) {
            stmts.addAll(readLines(fs, updates, currentSCDTime));
        }
    }
    return stmts;
}

From source file: com.cloudera.impala.analysis.CreateTableLikeFileStmt.java
License: Apache License

/**
 * Reads the first block from the given HDFS file and returns the Parquet schema.
 * Throws an AnalysisException for any failure, such as failing to read the file
 * or failing to parse the contents.
 */
private static parquet.schema.MessageType loadParquetSchema(Path pathToFile) throws AnalysisException {
    try {
        FileSystem fs = pathToFile.getFileSystem(FileSystemUtil.getConfiguration());
        if (!fs.isFile(pathToFile)) {
            throw new AnalysisException("Cannot infer schema, path is not a file: " + pathToFile);
        }
    } catch (IOException e) {
        throw new AnalysisException("Failed to connect to HDFS: " + e);
    }
    ParquetMetadata readFooter = null;
    try {
        readFooter = ParquetFileReader.readFooter(FileSystemUtil.getConfiguration(), pathToFile);
    } catch (FileNotFoundException e) {
        throw new AnalysisException("File not found: " + e);
    } catch (IOException e) {
        throw new AnalysisException("Failed to open HDFS file as a parquet file: " + e);
    } catch (RuntimeException e) {
        // Parquet throws a generic RuntimeException when reading a non-parquet file.
        if (e.toString().contains("is not a Parquet file")) {
            throw new AnalysisException("File is not a parquet file: " + pathToFile);
        }
        // Otherwise, rethrow whatever was caught.
        throw e;
    }
    return readFooter.getFileMetaData().getSchema();
}

From source file: com.cloudera.impala.analysis.LoadDataStmt.java
License: Apache License

private void analyzePaths(Analyzer analyzer, HdfsTable hdfsTable) throws AnalysisException {
    // The user must have permission to access the source location. Since the files will
    // be moved from this location, the user needs to have all permission.
    sourceDataPath_.analyze(analyzer, Privilege.ALL);
    try {
        Path source = sourceDataPath_.getPath();
        FileSystem fs = source.getFileSystem(FileSystemUtil.getConfiguration());
        // sourceDataPath_.analyze() ensured that path is on an HDFS filesystem.
        Preconditions.checkState(fs instanceof DistributedFileSystem);
        DistributedFileSystem dfs = (DistributedFileSystem) fs;
        if (!dfs.exists(source)) {
            throw new AnalysisException(
                    String.format("INPATH location '%s' does not exist.", sourceDataPath_));
        }
        if (dfs.isDirectory(source)) {
            if (FileSystemUtil.getTotalNumVisibleFiles(source) == 0) {
                throw new AnalysisException(
                        String.format("INPATH location '%s' contains no visible files.", sourceDataPath_));
            }
            if (FileSystemUtil.containsSubdirectory(source)) {
                throw new AnalysisException(
                        String.format("INPATH location '%s' cannot contain subdirectories.", sourceDataPath_));
            }
        } else {
            // INPATH points to a file.
            if (FileSystemUtil.isHiddenFile(source.getName())) {
                throw new AnalysisException(
                        String.format("INPATH location '%s' points to a hidden file.", source));
            }
        }

        String noWriteAccessErrorMsg = String.format("Unable to LOAD DATA into "
                + "target table (%s) because Impala does not have WRITE access to HDFS "
                + "location: ", hdfsTable.getFullName());

        HdfsPartition partition;
        String location;
        if (partitionSpec_ != null) {
            partition = hdfsTable.getPartition(partitionSpec_.getPartitionSpecKeyValues());
            location = partition.getLocation();
            if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
                throw new AnalysisException(noWriteAccessErrorMsg + partition.getLocation());
            }
        } else {
            // "default" partition
            partition = hdfsTable.getPartitions().get(0);
            location = hdfsTable.getLocation();
            if (!hdfsTable.hasWriteAccess()) {
                throw new AnalysisException(noWriteAccessErrorMsg + hdfsTable.getLocation());
            }
        }
        Preconditions.checkNotNull(partition);

        // Until Frontend.loadTableData() can handle cross-filesystem and filesystems
        // that aren't HDFS, require that source and dest are on the same HDFS.
        if (!FileSystemUtil.isPathOnFileSystem(new Path(location), fs)) {
            throw new AnalysisException(String.format(
                    "Unable to LOAD DATA into target table (%s) because source path (%s) and "
                            + "destination %s (%s) are on different file-systems.",
                    hdfsTable.getFullName(), source,
                    partitionSpec_ == null ? "table" : "partition", partition.getLocation()));
        }

        // Verify the files being loaded are supported.
        for (FileStatus fStatus : fs.listStatus(source)) {
            if (fs.isDirectory(fStatus.getPath())) continue;
            StringBuilder errorMsg = new StringBuilder();
            HdfsFileFormat fileFormat = partition.getInputFormatDescriptor().getFileFormat();
            if (!fileFormat.isFileCompressionTypeSupported(fStatus.getPath().toString(), errorMsg)) {
                throw new AnalysisException(errorMsg.toString());
            }
        }
    } catch (FileNotFoundException e) {
        throw new AnalysisException("File not found: " + e.getMessage(), e);
    } catch (IOException e) {
        throw new AnalysisException("Error accessing file system: " + e.getMessage(), e);
    }
}

From source file: com.cloudera.impala.catalog.HBaseTable.java
License: Apache License

/**
 * Returns the Hdfs size of the given region in bytes. NULL can be
 * passed as a parameter to retrieve the size of the complete table.
 */
public long getHdfsSize(HRegionInfo info) throws IOException {
    Path tableDir = HTableDescriptor.getTableDir(FSUtils.getRootDir(hbaseConf_),
            Bytes.toBytes(hbaseTableName_));
    FileSystem fs = tableDir.getFileSystem(hbaseConf_);
    if (info != null) {
        Path regionDir = tableDir.suffix("/" + info.getEncodedName());
        return fs.getContentSummary(regionDir).getLength();
    } else {
        return fs.getContentSummary(tableDir).getLength();
    }
}

From source file: com.cloudera.impala.catalog.HdfsTable.java
License: Apache License

/**
 * Create HdfsPartition objects corresponding to 'partitions'.
 *
 * If there are no partitions in the Hive metadata, a single partition is added with no
 * partition keys.
 *
 * For files that have not been changed, reuses file descriptors from oldFileDescMap.
 *
 * TODO: If any partition fails to load, the entire table will fail to load. Instead,
 * we should consider skipping partitions that cannot be loaded and raise a warning
 * whenever the table is accessed.
 */
private void loadPartitions(List<org.apache.hadoop.hive.metastore.api.Partition> msPartitions,
        org.apache.hadoop.hive.metastore.api.Table msTbl,
        Map<String, List<FileDescriptor>> oldFileDescMap) throws IOException, CatalogException {
    resetPartitionMd();
    partitions_.clear();
    hdfsBaseDir_ = msTbl.getSd().getLocation();

    // Map of filesystem to parent path to a list of new/modified
    // FileDescriptors. FileDescriptors in this Map will have their block location
    // information (re)loaded. This is used to speed up the incremental refresh of a
    // table's metadata by skipping unmodified, previously loaded FileDescriptors.
    Map<FsKey, Map<String, List<FileDescriptor>>> fileDescsToLoad = Maps.newHashMap();

    // INSERT statements need to refer to this if they try to write to new partitions.
    // Scans don't refer to this because by definition all partitions they refer to exist.
    addDefaultPartition(msTbl.getSd());
    Long cacheDirectiveId = HdfsCachingUtil.getCacheDirIdFromParams(msTbl.getParameters());
    isMarkedCached_ = cacheDirectiveId != null;

    if (msTbl.getPartitionKeysSize() == 0) {
        Preconditions.checkArgument(msPartitions == null || msPartitions.isEmpty());
        // This table has no partition key, which means it has no declared partitions.
        // We model partitions slightly differently to Hive - every file must exist in a
        // partition, so add a single partition with no keys which will get all the
        // files in the table's root directory.
        HdfsPartition part = createPartition(msTbl.getSd(), null, oldFileDescMap, fileDescsToLoad);
        addPartition(part);
        if (isMarkedCached_) part.markCached();
        Path location = new Path(hdfsBaseDir_);
        FileSystem fs = location.getFileSystem(CONF);
        if (fs.exists(location)) {
            accessLevel_ = getAvailableAccessLevel(fs, location);
        }
    } else {
        for (org.apache.hadoop.hive.metastore.api.Partition msPartition : msPartitions) {
            HdfsPartition partition = createPartition(msPartition.getSd(), msPartition,
                    oldFileDescMap, fileDescsToLoad);
            addPartition(partition);
            // If the partition is null, its HDFS path does not exist, and it was not added
            // to this table's partition list. Skip the partition.
            if (partition == null) continue;
            if (msPartition.getParameters() != null) {
                partition.setNumRows(getRowCount(msPartition.getParameters()));
            }
            if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
                // TODO: READ_ONLY isn't exactly correct because it's possible the
                // partition does not have READ permissions either. When we start checking
                // whether we can READ from a table, this should be updated to set the
                // table's access level to the "lowest" effective level across all
                // partitions. That is, if one partition has READ_ONLY and another has
                // WRITE_ONLY the table's access level should be NONE.
                accessLevel_ = TAccessLevel.READ_ONLY;
            }
        }
    }
    loadBlockMd(fileDescsToLoad);
}

From source file: com.cloudera.impala.catalog.HdfsTable.java
License: Apache License

/**
 * Creates a new HdfsPartition object to be added to the internal partition list.
 * Populates with file format information and file locations. Partitions may be empty,
 * or may not even exist on the file system (a partition's location may have been
 * changed to a new path that is about to be created by an INSERT). For unchanged
 * files (indicated by unchanged mtime), reuses the FileDescriptor from the
 * oldFileDescMap. The one exception is if the partition is marked as cached,
 * in which case the block metadata cannot be reused. Otherwise, creates a new
 * FileDescriptor for each modified or new file and adds it to newFileDescMap.
 * Both old and newFileDescMap are Maps of parent directory (partition location)
 * to list of files (FileDescriptors) under that directory.
 * Returns the new partition if successful or null if none was added.
 * Separated from addPartition to reduce the number of operations done
 * while holding the lock on the hdfs table.
 *
 * @throws CatalogException
 *           if the supplied storage descriptor contains metadata that Impala can't
 *           understand.
 */
private HdfsPartition createPartition(StorageDescriptor storageDescriptor,
        org.apache.hadoop.hive.metastore.api.Partition msPartition,
        Map<String, List<FileDescriptor>> oldFileDescMap,
        Map<FsKey, Map<String, List<FileDescriptor>>> perFsFileDescMap) throws CatalogException {
    HdfsStorageDescriptor fileFormatDescriptor =
            HdfsStorageDescriptor.fromStorageDescriptor(this.name_, storageDescriptor);
    Path partDirPath = new Path(storageDescriptor.getLocation());
    List<FileDescriptor> fileDescriptors = Lists.newArrayList();
    // If the partition is marked as cached, the block location metadata must be
    // reloaded, even if the file times have not changed.
    boolean isMarkedCached = isMarkedCached_;
    List<LiteralExpr> keyValues = Lists.newArrayList();
    if (msPartition != null) {
        isMarkedCached = HdfsCachingUtil.getCacheDirIdFromParams(msPartition.getParameters()) != null;
        // Load key values.
        for (String partitionKey : msPartition.getValues()) {
            Type type = getColumns().get(keyValues.size()).getType();
            // Deal with Hive's special NULL partition key.
            if (partitionKey.equals(nullPartitionKeyValue_)) {
                keyValues.add(NullLiteral.create(type));
            } else {
                try {
                    keyValues.add(LiteralExpr.create(partitionKey, type));
                } catch (Exception ex) {
                    LOG.warn("Failed to create literal expression of type: " + type, ex);
                    throw new CatalogException("Invalid partition key value of type: " + type, ex);
                }
            }
        }
        try {
            Expr.analyze(keyValues, null);
        } catch (AnalysisException e) {
            // Should never happen.
            throw new IllegalStateException(e);
        }
    }
    try {
        // Each partition could reside on a different filesystem.
        FileSystem fs = partDirPath.getFileSystem(CONF);
        multipleFileSystems_ = multipleFileSystems_
                || !FileSystemUtil.isPathOnFileSystem(new Path(getLocation()), fs);
        if (fs.exists(partDirPath)) {
            // FileSystem does not have an API that takes in a timestamp and returns a list
            // of files that have been added/changed since. Therefore, we are calling
            // fs.listStatus() to list all the files.
            for (FileStatus fileStatus : fs.listStatus(partDirPath)) {
                String fileName = fileStatus.getPath().getName().toString();
                if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName)
                        || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) {
                    // Ignore directories, hidden files starting with . or _, and LZO index files.
                    // If a directory is erroneously created as a subdirectory of a partition dir,
                    // we should ignore it and move on. Hive will not recurse into directories.
                    // Skip index files, these are read by the LZO scanner directly.
                    continue;
                }
                String partitionDir = fileStatus.getPath().getParent().toString();
                FileDescriptor fd = null;
                // Search for a FileDescriptor with the same partition dir and file name. If one
                // is found, it will be chosen as a candidate to reuse.
                if (oldFileDescMap != null && oldFileDescMap.get(partitionDir) != null) {
                    for (FileDescriptor oldFileDesc : oldFileDescMap.get(partitionDir)) {
                        if (oldFileDesc.getFileName().equals(fileName)) {
                            fd = oldFileDesc;
                            break;
                        }
                    }
                }
                // Check if this FileDescriptor has been modified since last loading its block
                // location information. If it has not been changed, the previously loaded
                // value can be reused.
                if (fd == null || isMarkedCached || fd.getFileLength() != fileStatus.getLen()
                        || fd.getModificationTime() != fileStatus.getModificationTime()) {
                    // Create a new file descriptor; the block metadata will be populated by
                    // loadBlockMd.
                    fd = new FileDescriptor(fileName, fileStatus.getLen(), fileStatus.getModificationTime());
                    addPerFsFileDesc(perFsFileDescMap, fs, partitionDir, fd);
                }
                List<FileDescriptor> fds = fileDescMap_.get(partitionDir);
                if (fds == null) {
                    fds = Lists.newArrayList();
                    fileDescMap_.put(partitionDir, fds);
                }
                fds.add(fd);
                // Add to the list of FileDescriptors for this partition.
                fileDescriptors.add(fd);
            }
            numHdfsFiles_ += fileDescriptors.size();
        }
        HdfsPartition partition = new HdfsPartition(this, msPartition, keyValues, fileFormatDescriptor,
                fileDescriptors, getAvailableAccessLevel(fs, partDirPath));
        partition.checkWellFormed();
        return partition;
    } catch (Exception e) {
        throw new CatalogException("Failed to create partition: ", e);
    }
}

From source file: com.cloudera.impala.catalog.TestLoadHdfsMetadataPerf.java
License: Apache License

/**
 * List file status by calling fileSystem.listStatus.
 */
private static void listStatus(String dirPath) {
    Path path = new Path(dirPath);
    boolean exceptionThrown = false;
    try {
        FileSystem fs = path.getFileSystem(LoadMetadataUtil.getConf());
        FileStatus[] fileStatus = fs.listStatus(path);
        if (fs.exists(path)) {
            for (FileStatus status : fileStatus) {
                BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());
                for (BlockLocation loc : locations) {
                    loc.getNames();
                    loc.getHosts();
                }
            }
        }
    } catch (IOException e) {
        exceptionThrown = true;
        LOG.error("Failed to list Status", e);
    }
    assertFalse(exceptionThrown);
}

From source file: com.cloudera.impala.catalog.TestLoadHdfsMetadataPerf.java
License: Apache License

/**
 * List file status by calling fileSystem.listLocatedStatus.
 */
private static void listLocatedStatus(String dirPath) {
    Path path = new Path(dirPath);
    boolean exceptionThrown = false;
    try {
        FileSystem fs = path.getFileSystem(LoadMetadataUtil.getConf());
        RemoteIterator<LocatedFileStatus> iterator = fs.listLocatedStatus(path);
        if (fs.exists(path)) {
            while (iterator.hasNext()) {
                LocatedFileStatus fileStatus = iterator.next();
                BlockLocation[] locations = fileStatus.getBlockLocations();
                for (BlockLocation loc : locations) {
                    loc.getHosts();
                    loc.getNames();
                }
            }
        }
    } catch (IOException e) {
        exceptionThrown = true;
        LOG.error("Failed to list Located Status", e);
    }
    assertFalse(exceptionThrown);
}

From source file: com.cloudera.impala.common.FileSystemUtil.java
License: Apache License

/**
 * Performs a non-recursive delete of all visible (non-hidden) files in a given
 * directory. Returns the number of files deleted as part of this operation.
 */
public static int deleteAllVisibleFiles(Path directory) throws IOException {
    FileSystem fs = directory.getFileSystem(CONF);
    Preconditions.checkState(fs.getFileStatus(directory).isDirectory());
    int numFilesDeleted = 0;
    for (FileStatus fStatus : fs.listStatus(directory)) {
        // Only delete files that are not hidden.
        if (fStatus.isFile() && !isHiddenFile(fStatus.getPath().getName())) {
            LOG.debug("Removing: " + fStatus.getPath());
            fs.delete(fStatus.getPath(), false);
            ++numFilesDeleted;
        }
    }
    return numFilesDeleted;
}