List of usage examples for org.apache.hadoop.fs.FileStatus.getPath()
public Path getPath()
From source file:com.cloudera.hoop.fs.FSUtils.java
License:Open Source License
/**
 * Builds a JSON-style map describing a Hadoop <code>FileStatus</code>.
 * The <code>SCHEME://HOST:PORT</code> part of the status path is replaced
 * with the given Hoop base URL.
 *
 * @param status Hadoop file status to convert.
 * @param hoopBaseUrl base URL substituted for the
 * <code>SCHEME://HOST:PORT</code> part of the file status path.
 * @return an insertion-ordered map holding the attributes of the file status.
 */
@SuppressWarnings("unchecked")
public static Map fileStatusToJSON(FileStatus status, String hoopBaseUrl) {
  Map result = new LinkedHashMap();

  // Location and kind of the entry.
  String hoopPath = convertPathToHoop(status.getPath(), hoopBaseUrl).toString();
  result.put("path", hoopPath);
  result.put("isDir", status.isDir());
  result.put("len", status.getLen());

  // Ownership and permissions.
  result.put("owner", status.getOwner());
  result.put("group", status.getGroup());
  result.put("permission", permissionToString(status.getPermission()));

  // Timestamps and storage attributes.
  result.put("accessTime", status.getAccessTime());
  result.put("modificationTime", status.getModificationTime());
  result.put("blockSize", status.getBlockSize());
  result.put("replication", status.getReplication());
  return result;
}
From source file:com.cloudera.impala.analysis.LoadDataStmt.java
License:Apache License
private void analyzePaths(Analyzer analyzer, HdfsTable hdfsTable) throws AnalysisException { // The user must have permission to access the source location. Since the files will // be moved from this location, the user needs to have all permission. sourceDataPath_.analyze(analyzer, Privilege.ALL); try {//w ww .ja v a 2 s . c o m Path source = sourceDataPath_.getPath(); FileSystem fs = source.getFileSystem(FileSystemUtil.getConfiguration()); // sourceDataPath_.analyze() ensured that path is on an HDFS filesystem. Preconditions.checkState(fs instanceof DistributedFileSystem); DistributedFileSystem dfs = (DistributedFileSystem) fs; if (!dfs.exists(source)) { throw new AnalysisException(String.format("INPATH location '%s' does not exist.", sourceDataPath_)); } if (dfs.isDirectory(source)) { if (FileSystemUtil.getTotalNumVisibleFiles(source) == 0) { throw new AnalysisException( String.format("INPATH location '%s' contains no visible files.", sourceDataPath_)); } if (FileSystemUtil.containsSubdirectory(source)) { throw new AnalysisException( String.format("INPATH location '%s' cannot contain subdirectories.", sourceDataPath_)); } } else { // INPATH points to a file. 
if (FileSystemUtil.isHiddenFile(source.getName())) { throw new AnalysisException( String.format("INPATH location '%s' points to a hidden file.", source)); } } String noWriteAccessErrorMsg = String.format( "Unable to LOAD DATA into " + "target table (%s) because Impala does not have WRITE access to HDFS " + "location: ", hdfsTable.getFullName()); HdfsPartition partition; String location; if (partitionSpec_ != null) { partition = hdfsTable.getPartition(partitionSpec_.getPartitionSpecKeyValues()); location = partition.getLocation(); if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) { throw new AnalysisException(noWriteAccessErrorMsg + partition.getLocation()); } } else { // "default" partition partition = hdfsTable.getPartitions().get(0); location = hdfsTable.getLocation(); if (!hdfsTable.hasWriteAccess()) { throw new AnalysisException(noWriteAccessErrorMsg + hdfsTable.getLocation()); } } Preconditions.checkNotNull(partition); // Until Frontend.loadTableData() can handle cross-filesystem and filesystems // that aren't HDFS, require that source and dest are on the same HDFS. if (!FileSystemUtil.isPathOnFileSystem(new Path(location), fs)) { throw new AnalysisException(String.format( "Unable to LOAD DATA into target table (%s) because source path (%s) and " + "destination %s (%s) are on different file-systems.", hdfsTable.getFullName(), source, partitionSpec_ == null ? "table" : "partition", partition.getLocation())); } // Verify the files being loaded are supported. 
for (FileStatus fStatus : fs.listStatus(source)) { if (fs.isDirectory(fStatus.getPath())) continue; StringBuilder errorMsg = new StringBuilder(); HdfsFileFormat fileFormat = partition.getInputFormatDescriptor().getFileFormat(); if (!fileFormat.isFileCompressionTypeSupported(fStatus.getPath().toString(), errorMsg)) { throw new AnalysisException(errorMsg.toString()); } } } catch (FileNotFoundException e) { throw new AnalysisException("File not found: " + e.getMessage(), e); } catch (IOException e) { throw new AnalysisException("Error accessing file system: " + e.getMessage(), e); } }
From source file:com.cloudera.impala.catalog.HdfsTable.java
License:Apache License
/** * Creates a new HdfsPartition object to be added to the internal partition list. * Populates with file format information and file locations. Partitions may be empty, * or may not even exist on the file system (a partition's location may have been * changed to a new path that is about to be created by an INSERT). For unchanged * files (indicated by unchanged mtime), reuses the FileDescriptor from the * oldFileDescMap. The one exception is if the partition is marked as cached * in which case the block metadata cannot be reused. Otherwise, creates a new * FileDescriptor for each modified or new file and adds it to newFileDescMap. * Both old and newFileDescMap are Maps of parent directory (partition location) * to list of files (FileDescriptors) under that directory. * Returns new partition if successful or null if none was added. * Separated from addPartition to reduce the number of operations done * while holding the lock on the hdfs table. //from ww w .ja v a 2s .c om * @throws CatalogException * if the supplied storage descriptor contains metadata that Impala can't * understand. */ private HdfsPartition createPartition(StorageDescriptor storageDescriptor, org.apache.hadoop.hive.metastore.api.Partition msPartition, Map<String, List<FileDescriptor>> oldFileDescMap, Map<FsKey, Map<String, List<FileDescriptor>>> perFsFileDescMap) throws CatalogException { HdfsStorageDescriptor fileFormatDescriptor = HdfsStorageDescriptor.fromStorageDescriptor(this.name_, storageDescriptor); Path partDirPath = new Path(storageDescriptor.getLocation()); List<FileDescriptor> fileDescriptors = Lists.newArrayList(); // If the partition is marked as cached, the block location metadata must be // reloaded, even if the file times have not changed. 
boolean isMarkedCached = isMarkedCached_; List<LiteralExpr> keyValues = Lists.newArrayList(); if (msPartition != null) { isMarkedCached = HdfsCachingUtil.getCacheDirIdFromParams(msPartition.getParameters()) != null; // Load key values for (String partitionKey : msPartition.getValues()) { Type type = getColumns().get(keyValues.size()).getType(); // Deal with Hive's special NULL partition key. if (partitionKey.equals(nullPartitionKeyValue_)) { keyValues.add(NullLiteral.create(type)); } else { try { keyValues.add(LiteralExpr.create(partitionKey, type)); } catch (Exception ex) { LOG.warn("Failed to create literal expression of type: " + type, ex); throw new CatalogException("Invalid partition key value of type: " + type, ex); } } } try { Expr.analyze(keyValues, null); } catch (AnalysisException e) { // should never happen throw new IllegalStateException(e); } } try { // Each partition could reside on a different filesystem. FileSystem fs = partDirPath.getFileSystem(CONF); multipleFileSystems_ = multipleFileSystems_ || !FileSystemUtil.isPathOnFileSystem(new Path(getLocation()), fs); if (fs.exists(partDirPath)) { // FileSystem does not have an API that takes in a timestamp and returns a list // of files that has been added/changed since. Therefore, we are calling // fs.listStatus() to list all the files. for (FileStatus fileStatus : fs.listStatus(partDirPath)) { String fileName = fileStatus.getPath().getName().toString(); if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName) || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) { // Ignore directory, hidden file starting with . or _, and LZO index files // If a directory is erroneously created as a subdirectory of a partition dir // we should ignore it and move on. Hive will not recurse into directories. // Skip index files, these are read by the LZO scanner directly. 
continue; } String partitionDir = fileStatus.getPath().getParent().toString(); FileDescriptor fd = null; // Search for a FileDescriptor with the same partition dir and file name. If one // is found, it will be chosen as a candidate to reuse. if (oldFileDescMap != null && oldFileDescMap.get(partitionDir) != null) { for (FileDescriptor oldFileDesc : oldFileDescMap.get(partitionDir)) { if (oldFileDesc.getFileName().equals(fileName)) { fd = oldFileDesc; break; } } } // Check if this FileDescriptor has been modified since last loading its block // location information. If it has not been changed, the previously loaded // value can be reused. if (fd == null || isMarkedCached || fd.getFileLength() != fileStatus.getLen() || fd.getModificationTime() != fileStatus.getModificationTime()) { // Create a new file descriptor, the block metadata will be populated by // loadBlockMd. fd = new FileDescriptor(fileName, fileStatus.getLen(), fileStatus.getModificationTime()); addPerFsFileDesc(perFsFileDescMap, fs, partitionDir, fd); } List<FileDescriptor> fds = fileDescMap_.get(partitionDir); if (fds == null) { fds = Lists.newArrayList(); fileDescMap_.put(partitionDir, fds); } fds.add(fd); // Add to the list of FileDescriptors for this partition. fileDescriptors.add(fd); } numHdfsFiles_ += fileDescriptors.size(); } HdfsPartition partition = new HdfsPartition(this, msPartition, keyValues, fileFormatDescriptor, fileDescriptors, getAvailableAccessLevel(fs, partDirPath)); partition.checkWellFormed(); return partition; } catch (Exception e) { throw new CatalogException("Failed to create partition: ", e); } }
From source file:com.cloudera.impala.catalog.TestLoadHdfsMetadataPerf.java
License:Apache License
/** * List file status by calling abstractFileSystem.listStatusIterator. */// www .j a v a 2s. c o m private static void listStatusIterator(String dirPath) { Path path = new Path(dirPath); boolean exceptionThrown = false; try { AbstractFileSystem fs = AbstractFileSystem.createFileSystem(path.toUri(), LoadMetadataUtil.getConf()); RemoteIterator<FileStatus> iter = fs.listStatusIterator(path); while (iter.hasNext()) { FileStatus fileStatus = iter.next(); BlockLocation[] locations = fs.getFileBlockLocations(fileStatus.getPath(), 0, fileStatus.getLen()); for (BlockLocation loc : locations) { loc.getNames(); loc.getHosts(); } } } catch (IOException e) { exceptionThrown = true; LOG.error("Failed to list Status Iterator", e); } assertFalse(exceptionThrown); }
From source file:com.cloudera.impala.common.FileSystemUtil.java
License:Apache License
/** * Performs a non-recursive delete of all visible (non-hidden) files in a given * directory. Returns the number of files deleted as part of this operation. *///from ww w . j av a2s .co m public static int deleteAllVisibleFiles(Path directory) throws IOException { FileSystem fs = directory.getFileSystem(CONF); Preconditions.checkState(fs.getFileStatus(directory).isDirectory()); int numFilesDeleted = 0; for (FileStatus fStatus : fs.listStatus(directory)) { // Only delete files that are not hidden. if (fStatus.isFile() && !isHiddenFile(fStatus.getPath().getName())) { LOG.debug("Removing: " + fStatus.getPath()); fs.delete(fStatus.getPath(), false); ++numFilesDeleted; } } return numFilesDeleted; }
From source file:com.cloudera.impala.common.FileSystemUtil.java
License:Apache License
/** * Returns the total number of visible (non-hidden) files in a directory. *///from w w w .jav a2s. c o m public static int getTotalNumVisibleFiles(Path directory) throws IOException { FileSystem fs = directory.getFileSystem(CONF); Preconditions.checkState(fs.getFileStatus(directory).isDirectory()); int numFiles = 0; for (FileStatus fStatus : fs.listStatus(directory)) { // Only delete files that are not hidden. if (fStatus.isFile() && !isHiddenFile(fStatus.getPath().getName())) { ++numFiles; } } return numFiles; }
From source file:com.cloudera.impala.common.FileSystemUtil.java
License:Apache License
/** * Moves all visible (non-hidden) files from a source directory to a destination * directory. Any sub-directories within the source directory are skipped. * Returns the number of files moved as part of this operation. *///from w ww . j av a 2 s . c om public static int moveAllVisibleFiles(Path sourceDir, Path destDir) throws IOException { FileSystem fs = destDir.getFileSystem(CONF); Preconditions.checkState(fs.isDirectory(destDir)); Preconditions.checkState(fs.isDirectory(sourceDir)); // Use the same UUID to resolve all file name conflicts. This helps mitigate problems // that might happen if there is a conflict moving a set of files that have // dependent file names. For example, foo.lzo and foo.lzo_index. UUID uuid = UUID.randomUUID(); // Enumerate all the files in the source int numFilesMoved = 0; for (FileStatus fStatus : fs.listStatus(sourceDir)) { if (fStatus.isDirectory()) { LOG.debug("Skipping copy of directory: " + fStatus.getPath()); continue; } else if (isHiddenFile(fStatus.getPath().getName())) { continue; } Path destFile = new Path(destDir, fStatus.getPath().getName()); if (fs.exists(destFile)) { destFile = new Path(destDir, appendToBaseFileName(destFile.getName(), uuid.toString())); } FileSystemUtil.moveFile(fStatus.getPath(), destFile, false); ++numFilesMoved; } return numFilesMoved; }
From source file:com.cloudera.impala.util.LoadMetadataUtil.java
License:Apache License
/** * Load and return a list of file descriptors for the files in 'dirPath', using the * listStatus HDFS API in filesystem to load filestatus. It will not load the file * descriptor if the file is a directory, or hidden file starting with . or _, or LZO * index files. If the file can be found in the old File description map and not * modified, and not 'isMarkedCached' - partition marked as cached, just reuse the one * in cache. Otherwise it will create a new File description with filename, file length * and modification time.//ww w . j a va2s.com * * Must be threadsafe. Access to 'oldFileDescMap', 'perFsFileBlocks', 'hostIndex' and * 'fileDescMap' must be protected. */ public static List<FileDescriptor> loadFileDescriptors(FileSystem fs, Path dirPath, Map<String, List<FileDescriptor>> oldFileDescMap, HdfsFileFormat fileFormat, Map<FsKey, FileBlocksInfo> perFsFileBlocks, boolean isMarkedCached, String tblName, ListMap<TNetworkAddress> hostIndex, Map<String, List<FileDescriptor>> fileDescMap) throws FileNotFoundException, IOException { List<FileDescriptor> fileDescriptors = Lists.newArrayList(); for (FileStatus fileStatus : fs.listStatus(dirPath)) { FileDescriptor fd = getFileDescriptor(fs, fileStatus, fileFormat, oldFileDescMap, isMarkedCached, perFsFileBlocks, tblName, hostIndex); if (fd == null) continue; // Add partition dir to fileDescMap if it does not exist. String partitionDir = fileStatus.getPath().getParent().toString(); synchronized (fileDescMap) { if (!fileDescMap.containsKey(partitionDir)) { fileDescMap.put(partitionDir, new ArrayList<FileDescriptor>()); } fileDescMap.get(partitionDir).add(fd); } // Add to the list of FileDescriptors for this partition. fileDescriptors.add(fd); } return fileDescriptors; }
From source file:com.cloudera.impala.util.LoadMetadataUtil.java
License:Apache License
/** * Identical to loadFileDescriptors, except using the ListStatusIterator HDFS API to * load file status./*ww w . jav a 2 s . c om*/ */ public static List<FileDescriptor> loadViaListStatusIterator(FileSystem fs, Path partDirPath, Map<String, List<FileDescriptor>> oldFileDescMap, HdfsFileFormat fileFormat, Map<FsKey, FileBlocksInfo> perFsFileBlocks, boolean isMarkedCached, String tblName, ListMap<TNetworkAddress> hostIndex, Map<String, List<FileDescriptor>> fileDescMap) throws FileNotFoundException, IOException { List<FileDescriptor> fileDescriptors = Lists.newArrayList(); AbstractFileSystem abstractFs = AbstractFileSystem.createFileSystem(partDirPath.toUri(), CONF); RemoteIterator<FileStatus> fileStatusItor = abstractFs.listStatusIterator(partDirPath); while (fileStatusItor.hasNext()) { FileStatus fileStatus = fileStatusItor.next(); FileDescriptor fd = getFileDescriptor(fs, fileStatus, fileFormat, oldFileDescMap, isMarkedCached, perFsFileBlocks, tblName, hostIndex); if (fd == null) continue; // Add partition dir to fileDescMap if it does not exist. String partitionDir = fileStatus.getPath().getParent().toString(); if (!fileDescMap.containsKey(partitionDir)) { fileDescMap.put(partitionDir, new ArrayList<FileDescriptor>()); } fileDescMap.get(partitionDir).add(fd); // Add to the list of FileDescriptors for this partition. fileDescriptors.add(fd); } return fileDescriptors; }
From source file:com.cloudera.impala.util.LoadMetadataUtil.java
License:Apache License
/** * Get file descriptor according to fileStatus and oldFileDescMap. It will return null * if the file is a directory, or hidden file starting with . or _, or LZO index files. * If the file can be found in the old File description map and not modified, and not * 'isMarkedCached' - partition marked as cached, just reuse the one in cache. Otherwise * it will create a new File description with filename, file length and modification * time.//w w w. jav a 2 s.c o m * * Must be thread safe. Access to 'oldFileDescMap', 'perFsFileBlocks' and 'hostIndex' * must be protected. */ private static FileDescriptor getFileDescriptor(FileSystem fs, FileStatus fileStatus, HdfsFileFormat fileFormat, Map<String, List<FileDescriptor>> oldFileDescMap, boolean isMarkedCached, Map<FsKey, FileBlocksInfo> perFsFileBlocks, String tblName, ListMap<TNetworkAddress> hostIndex) { String fileName = fileStatus.getPath().getName().toString(); if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName) || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) { // Ignore directory, hidden file starting with . or _, and LZO index files // If a directory is erroneously created as a subdirectory of a partition dir // we should ignore it and move on. Hive will not recurse into directories. // Skip index files, these are read by the LZO scanner directly. return null; } String partitionDir = fileStatus.getPath().getParent().toString(); FileDescriptor fd = null; // Search for a FileDescriptor with the same partition dir and file name. If one // is found, it will be chosen as a candidate to reuse. if (oldFileDescMap != null) { synchronized (oldFileDescMap) { if (oldFileDescMap.get(partitionDir) != null) { for (FileDescriptor oldFileDesc : oldFileDescMap.get(partitionDir)) { // TODO: This doesn't seem like the right data structure if a directory has // a lot of files. 
if (oldFileDesc.getFileName().equals(fileName)) { fd = oldFileDesc; break; } } } } } // Check if this FileDescriptor has been modified since last loading its block // location information. If it has not been changed, the previously loaded value can // be reused. if (fd == null || isMarkedCached || fd.getFileLength() != fileStatus.getLen() || fd.getModificationTime() != fileStatus.getModificationTime()) { // Create a new file descriptor and load the file block metadata, // collecting the block metadata into perFsFileBlocks. The disk IDs for // all the blocks of each filesystem will be loaded by loadDiskIds(). fd = new FileDescriptor(fileName, fileStatus.getLen(), fileStatus.getModificationTime()); loadBlockMetadata(fs, fileStatus, fd, fileFormat, perFsFileBlocks, tblName, hostIndex); } return fd; }