Usage examples for org.apache.hadoop.fs.FileStatus.isDirectory()
public boolean isDirectory()
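FileStatus.isDirectory() reports whether the entry described by a FileStatus is a directory; it replaces the deprecated FileStatus.isDir(). Before the full examples below, here is a minimal sketch of the typical pattern; the path /tmp and the default Configuration are placeholders, not taken from any of the examples that follow.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws IOException {
        // Placeholder configuration and path; substitute your own cluster settings.
        FileSystem fs = FileSystem.get(new Configuration());
        for (FileStatus status : fs.listStatus(new Path("/tmp"))) {
            if (status.isDirectory()) {
                System.out.println("dir:  " + status.getPath());
            } else {
                System.out.println("file: " + status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}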
From source file:com.blackberry.logdriver.admin.LogMaintenance.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); // If run by Oozie, then load the Oozie conf too if (System.getProperty("oozie.action.conf.xml") != null) { conf.addResource(new URL("file://" + System.getProperty("oozie.action.conf.xml"))); }//from w w w . j av a 2s .c om // For some reason, Oozie needs some options to be set in system instead of // in the confiuration. So copy the configs over. { Iterator<Entry<String, String>> i = conf.iterator(); while (i.hasNext()) { Entry<String, String> next = i.next(); System.setProperty(next.getKey(), next.getValue()); } } if (args.length < 3) { printUsage(); return 1; } String userName = args[0]; String dcNumber = args[1]; String service = args[2]; String date = null; String hour = null; if (args.length >= 4) { date = args[3]; } if (args.length >= 5) { hour = args[4]; } // Set from environment variables String mergeJobPropertiesFile = getConfOrEnv(conf, "MERGEJOB_CONF"); String filterJobPropertiesFile = getConfOrEnv(conf, "FILTERJOB_CONF"); String daysBeforeArchive = getConfOrEnv(conf, "DAYS_BEFORE_ARCHIVE"); String daysBeforeDelete = getConfOrEnv(conf, "DAYS_BEFORE_DELETE"); String maxConcurrentMR = getConfOrEnv(conf, "MAX_CONCURRENT_MR", "-1"); String zkConnectString = getConfOrEnv(conf, "ZK_CONNECT_STRING"); String logdir = getConfOrEnv(conf, "logdriver.logdir.name"); boolean resetOrphanedJobs = Boolean.parseBoolean(getConfOrEnv(conf, "reset.orphaned.jobs", "true")); String rootDir = getConfOrEnv(conf, "service.root.dir"); String maxTotalMR = getConfOrEnv(conf, "MAX_TOTAL_MR", "-1"); boolean doMerge = true; boolean doArchive = true; boolean doDelete = true; if (zkConnectString == null) { LOG.error("ZK_CONNECT_STRING is not set. Exiting."); return 1; } if (mergeJobPropertiesFile == null) { LOG.info("MERGEJOB_CONF is not set. Not merging."); doMerge = false; } if (filterJobPropertiesFile == null) { LOG.info("FILTERJOB_CONF is not set. Not archiving."); doArchive = false; } if (daysBeforeArchive == null) { LOG.info("DAYS_BEFORE_ARCHIVE is not set. Not archiving."); doArchive = false; } if (doArchive && Integer.parseInt(daysBeforeArchive) < 0) { LOG.info("DAYS_BEFORE_ARCHIVE is negative. Not archiving."); doArchive = false; } if (daysBeforeDelete == null) { LOG.info("DAYS_BEFORE_DELETE is not set. Not deleting."); doDelete = false; } if (doDelete && Integer.parseInt(daysBeforeDelete) < 0) { LOG.info("DAYS_BEFORE_DELETE is negative. Not deleting."); doDelete = false; } if (logdir == null) { LOG.info("LOGDRIVER_LOGDIR_NAME is not set. Using default value of 'logs'."); logdir = "logs"; } if (rootDir == null) { LOG.info("SERVICE_ROOT_DIR is not set. Using default value of 'service'."); rootDir = "/service"; } // We can hang if this fails. So make sure we abort if it fails. fs = null; try { fs = FileSystem.get(conf); fs.exists(new Path("/")); // Test if it works. } catch (IOException e) { LOG.error("Error getting filesystem.", e); return 1; } // Create the LockUtil instance lockUtil = new LockUtil(zkConnectString); // Now it's safe to create our Job Runner JobRunner jobRunner = new JobRunner(Integer.parseInt(maxConcurrentMR), Integer.parseInt(maxTotalMR)); Thread jobRunnerThread = new Thread(jobRunner); jobRunnerThread.setName("JobRunner"); jobRunnerThread.setDaemon(false); jobRunnerThread.start(); // Figure out what date we start filters on. 
String filterCutoffDate = ""; if (doArchive) { Calendar cal = Calendar.getInstance(); cal.add(Calendar.DAY_OF_MONTH, Integer.parseInt("-" + daysBeforeArchive)); filterCutoffDate = String.format("%04d%02d%02d%02d", cal.get(Calendar.YEAR), (cal.get(Calendar.MONTH) + 1), cal.get(Calendar.DAY_OF_MONTH), cal.get(Calendar.HOUR_OF_DAY)); LOG.info("Archiving logs from before {}", filterCutoffDate); } String deleteCutoffDate = ""; if (doDelete) { Calendar cal = Calendar.getInstance(); cal.add(Calendar.DAY_OF_MONTH, Integer.parseInt("-" + daysBeforeDelete)); deleteCutoffDate = String.format("%04d%02d%02d%02d", cal.get(Calendar.YEAR), (cal.get(Calendar.MONTH) + 1), cal.get(Calendar.DAY_OF_MONTH), cal.get(Calendar.HOUR_OF_DAY)); LOG.info("Deleting logs from before {}", deleteCutoffDate); } long now = System.currentTimeMillis(); // Various exceptions have been popping up here. So make sure I catch them // all. try { // Patterns to recognize hour, day and incoming directories, so that they // can be processed. Pattern datePathPattern; Pattern hourPathPattern; Pattern incomingPathPattern; Pattern dataPathPattern; Pattern archivePathPattern; Pattern workingPathPattern; if (hour != null) { datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")"); hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")"); incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/incoming"); dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/data"); archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/archive"); workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/working/([^/]+)_(\\d+)"); } else if (date != null) { datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")"); hourPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})"); incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/incoming"); dataPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/data"); archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/archive"); workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + 
Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/working/([^/]+)_(\\d+)"); } else { datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})"); hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})"); incomingPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/incoming"); dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/data"); archivePathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/archive"); workingPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/working/([^/]+)_(\\d+)"); } // Do a depth first search of the directory, processing anything that // looks // interesting along the way Deque<Path> paths = new ArrayDeque<Path>(); Path rootPath = new Path(rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/"); paths.push(rootPath); while (paths.size() > 0) { Path p = paths.pop(); LOG.debug("{}", p.toString()); if (!fs.exists(p)) { continue; } FileStatus dirStatus = fs.getFileStatus(p); FileStatus[] children = fs.listStatus(p); boolean addChildren = true; boolean old = dirStatus.getModificationTime() < now - WAIT_TIME; LOG.debug(" Was last modified {}ms ago", now - dirStatus.getModificationTime()); if (!old) { LOG.debug(" Skipping, since it's not old enough."); } else if ((!rootPath.equals(p)) && (children.length == 0 || (children.length == 1 && children[0].getPath().getName().equals(READY_MARKER)))) { // old and no children? Delete! LOG.info(" Deleting empty directory {}", p.toString()); fs.delete(p, true); } else { Matcher matcher = datePathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.debug("Checking date directory"); // If this is already done, then skip it. So only process if it // doesn't exist. if (fs.exists(new Path(p, READY_MARKER)) == false) { // Check each subdirectory. If they all have ready markers, then I // guess we're ready. boolean ready = true; for (FileStatus c : children) { if (c.isDirectory() && fs.exists(new Path(c.getPath(), READY_MARKER)) == false) { ready = false; break; } } if (ready) { fs.createNewFile(new Path(p, READY_MARKER)); } } } matcher = hourPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.debug("Checking hour directory"); // If this is already done, then skip it. So only process if it // doesn't exist. if (fs.exists(new Path(p, READY_MARKER)) == false) { // Check each subdirectory. If they all have ready markers, then I // guess we're ready. 
boolean ready = true; for (FileStatus c : children) { if (c.isDirectory() && fs.exists(new Path(c.getPath(), READY_MARKER)) == false) { ready = false; break; } } if (ready) { fs.createNewFile(new Path(p, READY_MARKER)); } } } // Check to see if we have to run a merge matcher = incomingPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.debug("Checking incoming directory"); String matchDate = matcher.group(1); String matchHour = matcher.group(2); String matchComponent = matcher.group(3); String timestamp = matchDate + matchHour; if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) { LOG.info("Deleting old directory: {}", p); fs.delete(p, true); addChildren = false; } else if (doMerge) { // old, looks right, and has children? Run it! boolean hasMatchingChildren = false; boolean subdirTooYoung = false; for (FileStatus child : children) { if (!hasMatchingChildren) { FileStatus[] grandchildren = fs.listStatus(child.getPath()); for (FileStatus gc : grandchildren) { if (VALID_FILE.matcher(gc.getPath().getName()).matches()) { hasMatchingChildren = true; break; } } } if (!subdirTooYoung) { if (child.getModificationTime() >= now - WAIT_TIME) { subdirTooYoung = true; LOG.debug(" Subdir {} is too young.", child.getPath()); } } } if (!hasMatchingChildren) { LOG.debug(" No files match the expected pattern ({})", VALID_FILE.pattern()); } if (hasMatchingChildren && !subdirTooYoung) { LOG.info(" Run Merge job {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); Properties jobProps = new Properties(); jobProps.load(new FileInputStream(mergeJobPropertiesFile)); jobProps.setProperty("jobType", "merge"); jobProps.setProperty("rootDir", rootDir); jobProps.setProperty("dcNumber", dcNumber); jobProps.setProperty("service", service); jobProps.setProperty("date", matchDate); jobProps.setProperty("hour", matchHour); jobProps.setProperty("component", matchComponent); jobProps.setProperty("user.name", userName); jobProps.setProperty("logdir", logdir); jobRunner.submit(jobProps); addChildren = false; } } } // Check to see if we need to run a filter and archive matcher = dataPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { String matchDate = matcher.group(1); String matchHour = matcher.group(2); String matchComponent = matcher.group(3); String timestamp = matchDate + matchHour; if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) { LOG.info("Deleting old directory: {}", p); fs.delete(p, true); addChildren = false; } else if (doArchive && timestamp.compareTo(filterCutoffDate) < 0) { Properties jobProps = new Properties(); jobProps.load(new FileInputStream(filterJobPropertiesFile)); jobProps.setProperty("jobType", "filter"); jobProps.setProperty("rootDir", rootDir); jobProps.setProperty("dcNumber", dcNumber); jobProps.setProperty("service", service); jobProps.setProperty("date", matchDate); jobProps.setProperty("hour", matchHour); jobProps.setProperty("component", matchComponent); jobProps.setProperty("user.name", userName); jobProps.setProperty("logdir", logdir); // Check to see if we should just keep all or delete all here. 
// The filter file should be here String appPath = jobProps.getProperty("oozie.wf.application.path"); appPath = appPath.replaceFirst("\\$\\{.*?\\}", ""); Path filterFile = new Path( appPath + "/" + conf.get("filter.definition.file", service + ".yaml")); LOG.info("Filter file is {}", filterFile); if (fs.exists(filterFile)) { List<BoomFilterMapper.Filter> filters = BoomFilterMapper.loadFilters(matchComponent, fs.open(filterFile)); if (filters == null) { LOG.warn( " Got null when getting filters. Not processing. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); } else if (filters.size() == 0) { LOG.warn(" Got no filters. Not processing. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); } else if (filters.size() == 1 && filters.get(0) instanceof BoomFilterMapper.KeepAllFilter) { LOG.info(" Keeping everything. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); // Move files from data to archive // delete it all! String destination = rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/" + matchDate + "/" + matchHour + "/" + matchComponent + "/archive/"; PathInfo pathInfo = new PathInfo(); pathInfo.setDcNumber(dcNumber); pathInfo.setService(service); pathInfo.setLogdir(logdir); pathInfo.setDate(matchDate); pathInfo.setHour(matchHour); pathInfo.setComponent(matchComponent); try { lockUtil.acquireWriteLock(lockUtil.getLockPath(pathInfo)); fs.mkdirs(new Path(destination)); for (FileStatus f : fs.listStatus(p)) { fs.rename(f.getPath(), new Path(destination)); } } finally { lockUtil.releaseWriteLock(lockUtil.getLockPath(pathInfo)); } } else if (filters.size() == 1 && filters.get(0) instanceof BoomFilterMapper.DropAllFilter) { LOG.info(" Dropping everything. 
{} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); PathInfo pathInfo = new PathInfo(); pathInfo.setDcNumber(dcNumber); pathInfo.setService(service); pathInfo.setLogdir(logdir); pathInfo.setDate(matchDate); pathInfo.setHour(matchHour); pathInfo.setComponent(matchComponent); try { lockUtil.acquireWriteLock(lockUtil.getLockPath(pathInfo)); fs.delete(p, true); } finally { lockUtil.releaseWriteLock(lockUtil.getLockPath(pathInfo)); } } else { LOG.info(" Run Filter/Archive job {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); jobRunner.submit(jobProps); } } else { LOG.warn("Skipping filter job, since no filter file exists"); } addChildren = false; } } matcher = archivePathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { String matchDate = matcher.group(1); String matchHour = matcher.group(2); String timestamp = matchDate + matchHour; if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) { LOG.info("Deleting old directory: {}", p); fs.delete(p, true); addChildren = false; } } matcher = workingPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.info(" Matches working pattern ({})", p); if (resetOrphanedJobs) { String matchDate = matcher.group(1); String matchHour = matcher.group(2); String matchComponent = matcher.group(3); // Move everything from working/xxx/incoming/ to incoming/ PathInfo lockPathInfo = new PathInfo(logdir, rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/" + matchDate + "/" + matchHour + "/" + matchComponent); lockUtil.acquireWriteLock(lockUtil.getLockPath(lockPathInfo)); FileStatus[] fileStatuses = fs.listStatus(new Path(p.toUri().getPath() + "/incoming/")); if (fileStatuses != null) { for (FileStatus fileStatus : fileStatuses) { Path toPath = new Path( fileStatus.getPath().getParent().getParent().getParent().getParent(), "incoming/" + fileStatus.getPath().getName()); LOG.info(" Moving data from {} to {}", fileStatus.getPath(), toPath); LOG.info(" mkdir {}", toPath); fs.mkdirs(toPath); Path fromDir = new Path(p.toUri().getPath(), "incoming/" + fileStatus.getPath().getName()); LOG.info(" moving from {}", fromDir); FileStatus[] files = fs.listStatus(fromDir); if (files == null || files.length == 0) { LOG.info(" Nothing to move from {}", fromDir); } else { for (FileStatus f : files) { LOG.info(" rename {} {}", f.getPath(), new Path(toPath, f.getPath().getName())); fs.rename(f.getPath(), new Path(toPath, f.getPath().getName())); } } LOG.info(" rm {}", fileStatus.getPath()); fs.delete(fileStatus.getPath(), true); } lockUtil.releaseWriteLock(lockUtil.getLockPath(lockPathInfo)); fs.delete(new Path(p.toUri().getPath()), true); } } addChildren = false; } } // Add any children which are directories to the stack. if (addChildren) { for (int i = children.length - 1; i >= 0; i--) { FileStatus child = children[i]; if (child.isDirectory()) { paths.push(child.getPath()); } } } } // Since we may have deleted a bunch of directories, delete any unused // locks // from ZooKeeper. { LOG.info("Checking for unused locks in ZooKeeper"); String scanPath = rootDir + "/" + dcNumber + "/" + service + "/" + logdir; if (date != null) { scanPath += "/" + date; if (hour != null) { scanPath += "/" + hour; } } List<LockInfo> lockInfo = lockUtil.scan(scanPath); for (LockInfo li : lockInfo) { // Check if the lock path still exists in HDFS. If it doesn't, then // delete it from ZooKeeper. 
String path = li.getPath(); String hdfsPath = path.substring(LockUtil.ROOT.length()); if (!fs.exists(new Path(hdfsPath))) { ZooKeeper zk = lockUtil.getZkClient(); while (!path.equals(LockUtil.ROOT)) { try { zk.delete(path, -1); } catch (KeeperException.NotEmptyException e) { // That's fine. just stop trying then. break; } catch (Exception e) { LOG.error("Caught exception trying to delete from ZooKeeper.", e); break; } LOG.info("Deleted from ZooKeeper: {}", path); path = path.substring(0, path.lastIndexOf('/')); } } } } // Now that we're done, wait for the Oozie Runner to stop, and print the // results. LOG.info("Waiting for Oozie jobs to complete."); jobRunner.shutdown(); jobRunnerThread.join(); LOG.info("Job Stats : Started={} Succeeded={} failed={} errors={}", new Object[] { jobRunner.getStarted(), jobRunner.getSucceeded(), jobRunner.getFailed(), jobRunner.getErrors() }); lockUtil.close(); } catch (Exception e) { LOG.error("Unexpected exception caught.", e); return 1; } return 0; }
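The isDirectory() calls in the long example above serve two purposes: checking that every child directory carries a READY_MARKER, and deciding which children to push onto the stack for the depth-first walk. Stripped of the job-specific logic, the traversal skeleton is roughly the following sketch; the commented visit hook is a placeholder, not part of the original code.

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DepthFirstWalk {
    /** Iterative depth-first traversal; only directories are pushed back onto the stack. */
    public static void walk(FileSystem fs, Path root) throws IOException {
        Deque<Path> paths = new ArrayDeque<>();
        paths.push(root);
        while (!paths.isEmpty()) {
            Path p = paths.pop();
            if (!fs.exists(p)) {
                continue; // the directory may have been deleted since it was queued
            }
            FileStatus[] children = fs.listStatus(p);
            // visit(p, children);  // placeholder for the per-directory processing
            for (int i = children.length - 1; i >= 0; i--) {
                if (children[i].isDirectory()) {
                    paths.push(children[i].getPath());
                }
            }
        }
    }
}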
From source file:com.cloudera.cdk.data.filesystem.FileSystemDataset.java
License:Apache License
@Deprecated
void accumulateDatafilePaths(Path directory, List<Path> paths) throws IOException {
    for (FileStatus status : fileSystem.listStatus(directory, PathFilters.notHidden())) {
        if (status.isDirectory()) {
            accumulateDatafilePaths(status.getPath(), paths);
        } else {
            paths.add(status.getPath());
        }
    }
}
From source file:com.cloudera.cdk.data.filesystem.FileSystemMetadataProvider.java
License:Apache License
@Override
public List<String> list() {
    List<String> datasets = Lists.newArrayList();
    try {
        FileStatus[] entries = rootFileSystem.listStatus(rootDirectory, PathFilters.notHidden());
        for (FileStatus entry : entries) {
            // assumes that all unhidden directories under the root are data sets
            if (entry.isDirectory() && rootFileSystem.exists(new Path(entry.getPath(), ".metadata"))) {
                // may want to add a check: !RESERVED_NAMES.contains(name)
                datasets.add(entry.getPath().getName());
            } else {
                continue;
            }
        }
    } catch (FileNotFoundException ex) {
        // the repo hasn't created any files yet
        return datasets;
    } catch (IOException ex) {
        throw new MetadataProviderException("Could not list data sets", ex);
    }
    return datasets;
}
From source file:com.cloudera.data.filesystem.FileSystemDataset.java
License:Apache License
private void accumulateDatafilePaths(Path directory, List<Path> paths) throws IOException {
    for (FileStatus status : fileSystem.listStatus(directory, PathFilters.notHidden())) {
        if (status.isDirectory()) {
            accumulateDatafilePaths(status.getPath(), paths);
        } else {
            paths.add(status.getPath());
        }
    }
}
From source file:com.cloudera.impala.catalog.HdfsTable.java
License:Apache License
/** * Loads the file block metadata for the given collection of FileDescriptors. The * FileDescriptors are passed as a tree, where the first level is indexed by * filesystem, the second level is indexed by partition location, and the leaves are * the list of files that exist under each directory. */// w w w.j av a2s. co m private void loadBlockMd(Map<FsKey, Map<String, List<FileDescriptor>>> perFsFileDescs) throws RuntimeException { Preconditions.checkNotNull(perFsFileDescs); LOG.debug("load block md for " + name_); for (FsKey fsEntry : perFsFileDescs.keySet()) { FileSystem fs = fsEntry.filesystem; // Store all BlockLocations so they can be reused when loading the disk IDs. List<BlockLocation> blockLocations = Lists.newArrayList(); int numCachedBlocks = 0; Map<String, List<FileDescriptor>> partitionToFds = perFsFileDescs.get(fsEntry); Preconditions.checkNotNull(partitionToFds); // loop over all files and record their block metadata, minus volume ids for (String partitionDir : partitionToFds.keySet()) { Path partDirPath = new Path(partitionDir); for (FileDescriptor fileDescriptor : partitionToFds.get(partitionDir)) { Path p = new Path(partDirPath, fileDescriptor.getFileName()); try { FileStatus fileStatus = fs.getFileStatus(p); // fileDescriptors should not contain directories. Preconditions.checkArgument(!fileStatus.isDirectory()); BlockLocation[] locations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen()); Preconditions.checkNotNull(locations); blockLocations.addAll(Arrays.asList(locations)); // Loop over all blocks in the file. for (BlockLocation block : locations) { String[] blockHostPorts = block.getNames(); try { blockHostPorts = block.getNames(); } catch (IOException e) { // this shouldn't happen, getNames() doesn't throw anything String errorMsg = "BlockLocation.getNames() failed:\n" + e.getMessage(); LOG.error(errorMsg); throw new IllegalStateException(errorMsg); } // Now enumerate all replicas of the block, adding any unknown hosts to // hostIndex_ and the index for that host to replicaHostIdxs. List<Integer> replicaHostIdxs = new ArrayList<Integer>(blockHostPorts.length); for (int i = 0; i < blockHostPorts.length; ++i) { String[] ip_port = blockHostPorts[i].split(":"); Preconditions.checkState(ip_port.length == 2); TNetworkAddress network_address = new TNetworkAddress(ip_port[0], Integer.parseInt(ip_port[1])); replicaHostIdxs.add(hostIndex_.getIndex(network_address)); } fileDescriptor.addFileBlock( new FileBlock(block.getOffset(), block.getLength(), replicaHostIdxs)); } } catch (IOException e) { throw new RuntimeException( "couldn't determine block locations for path '" + p + "':\n" + e.getMessage(), e); } } } if (SUPPORTS_VOLUME_ID && fs instanceof DistributedFileSystem) { LOG.trace("loading disk ids for: " + getFullName() + ". nodes: " + getNumNodes() + ". file system: " + fsEntry); loadDiskIds((DistributedFileSystem) fs, blockLocations, partitionToFds); LOG.trace("completed load of disk ids for: " + getFullName()); } } }
From source file:com.cloudera.impala.catalog.HdfsTable.java
License:Apache License
/** * Creates a new HdfsPartition object to be added to the internal partition list. * Populates with file format information and file locations. Partitions may be empty, * or may not even exist on the file system (a partition's location may have been * changed to a new path that is about to be created by an INSERT). For unchanged * files (indicated by unchanged mtime), reuses the FileDescriptor from the * oldFileDescMap. The one exception is if the partition is marked as cached * in which case the block metadata cannot be reused. Otherwise, creates a new * FileDescriptor for each modified or new file and adds it to newFileDescMap. * Both old and newFileDescMap are Maps of parent directory (partition location) * to list of files (FileDescriptors) under that directory. * Returns new partition if successful or null if none was added. * Separated from addPartition to reduce the number of operations done * while holding the lock on the hdfs table. //from w ww. j av a 2 s . c o m * @throws CatalogException * if the supplied storage descriptor contains metadata that Impala can't * understand. */ private HdfsPartition createPartition(StorageDescriptor storageDescriptor, org.apache.hadoop.hive.metastore.api.Partition msPartition, Map<String, List<FileDescriptor>> oldFileDescMap, Map<FsKey, Map<String, List<FileDescriptor>>> perFsFileDescMap) throws CatalogException { HdfsStorageDescriptor fileFormatDescriptor = HdfsStorageDescriptor.fromStorageDescriptor(this.name_, storageDescriptor); Path partDirPath = new Path(storageDescriptor.getLocation()); List<FileDescriptor> fileDescriptors = Lists.newArrayList(); // If the partition is marked as cached, the block location metadata must be // reloaded, even if the file times have not changed. boolean isMarkedCached = isMarkedCached_; List<LiteralExpr> keyValues = Lists.newArrayList(); if (msPartition != null) { isMarkedCached = HdfsCachingUtil.getCacheDirIdFromParams(msPartition.getParameters()) != null; // Load key values for (String partitionKey : msPartition.getValues()) { Type type = getColumns().get(keyValues.size()).getType(); // Deal with Hive's special NULL partition key. if (partitionKey.equals(nullPartitionKeyValue_)) { keyValues.add(NullLiteral.create(type)); } else { try { keyValues.add(LiteralExpr.create(partitionKey, type)); } catch (Exception ex) { LOG.warn("Failed to create literal expression of type: " + type, ex); throw new CatalogException("Invalid partition key value of type: " + type, ex); } } } try { Expr.analyze(keyValues, null); } catch (AnalysisException e) { // should never happen throw new IllegalStateException(e); } } try { // Each partition could reside on a different filesystem. FileSystem fs = partDirPath.getFileSystem(CONF); multipleFileSystems_ = multipleFileSystems_ || !FileSystemUtil.isPathOnFileSystem(new Path(getLocation()), fs); if (fs.exists(partDirPath)) { // FileSystem does not have an API that takes in a timestamp and returns a list // of files that has been added/changed since. Therefore, we are calling // fs.listStatus() to list all the files. for (FileStatus fileStatus : fs.listStatus(partDirPath)) { String fileName = fileStatus.getPath().getName().toString(); if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName) || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) { // Ignore directory, hidden file starting with . or _, and LZO index files // If a directory is erroneously created as a subdirectory of a partition dir // we should ignore it and move on. 
Hive will not recurse into directories. // Skip index files, these are read by the LZO scanner directly. continue; } String partitionDir = fileStatus.getPath().getParent().toString(); FileDescriptor fd = null; // Search for a FileDescriptor with the same partition dir and file name. If one // is found, it will be chosen as a candidate to reuse. if (oldFileDescMap != null && oldFileDescMap.get(partitionDir) != null) { for (FileDescriptor oldFileDesc : oldFileDescMap.get(partitionDir)) { if (oldFileDesc.getFileName().equals(fileName)) { fd = oldFileDesc; break; } } } // Check if this FileDescriptor has been modified since last loading its block // location information. If it has not been changed, the previously loaded // value can be reused. if (fd == null || isMarkedCached || fd.getFileLength() != fileStatus.getLen() || fd.getModificationTime() != fileStatus.getModificationTime()) { // Create a new file descriptor, the block metadata will be populated by // loadBlockMd. fd = new FileDescriptor(fileName, fileStatus.getLen(), fileStatus.getModificationTime()); addPerFsFileDesc(perFsFileDescMap, fs, partitionDir, fd); } List<FileDescriptor> fds = fileDescMap_.get(partitionDir); if (fds == null) { fds = Lists.newArrayList(); fileDescMap_.put(partitionDir, fds); } fds.add(fd); // Add to the list of FileDescriptors for this partition. fileDescriptors.add(fd); } numHdfsFiles_ += fileDescriptors.size(); } HdfsPartition partition = new HdfsPartition(this, msPartition, keyValues, fileFormatDescriptor, fileDescriptors, getAvailableAccessLevel(fs, partDirPath)); partition.checkWellFormed(); return partition; } catch (Exception e) { throw new CatalogException("Failed to create partition: ", e); } }
From source file:com.cloudera.impala.common.FileSystemUtil.java
License:Apache License
/**
 * Moves all visible (non-hidden) files from a source directory to a destination
 * directory. Any sub-directories within the source directory are skipped.
 * Returns the number of files moved as part of this operation.
 */
public static int moveAllVisibleFiles(Path sourceDir, Path destDir) throws IOException {
    FileSystem fs = destDir.getFileSystem(CONF);
    Preconditions.checkState(fs.isDirectory(destDir));
    Preconditions.checkState(fs.isDirectory(sourceDir));
    // Use the same UUID to resolve all file name conflicts. This helps mitigate problems
    // that might happen if there is a conflict moving a set of files that have
    // dependent file names. For example, foo.lzo and foo.lzo_index.
    UUID uuid = UUID.randomUUID();
    // Enumerate all the files in the source
    int numFilesMoved = 0;
    for (FileStatus fStatus : fs.listStatus(sourceDir)) {
        if (fStatus.isDirectory()) {
            LOG.debug("Skipping copy of directory: " + fStatus.getPath());
            continue;
        } else if (isHiddenFile(fStatus.getPath().getName())) {
            continue;
        }
        Path destFile = new Path(destDir, fStatus.getPath().getName());
        if (fs.exists(destFile)) {
            destFile = new Path(destDir, appendToBaseFileName(destFile.getName(), uuid.toString()));
        }
        FileSystemUtil.moveFile(fStatus.getPath(), destFile, false);
        ++numFilesMoved;
    }
    return numFilesMoved;
}
From source file:com.cloudera.impala.common.FileSystemUtil.java
License:Apache License
/**
 * Returns true if the given Path contains any sub directories, otherwise false.
 */
public static boolean containsSubdirectory(Path directory) throws FileNotFoundException, IOException {
    FileSystem fs = directory.getFileSystem(CONF);
    // Enumerate all the files in the source
    for (FileStatus fStatus : fs.listStatus(directory)) {
        if (fStatus.isDirectory()) {
            return true;
        }
    }
    return false;
}
From source file:com.cloudera.impala.util.LoadMetadataUtil.java
License:Apache License
/**
 * Gets a file descriptor according to fileStatus and oldFileDescMap. Returns null
 * if the file is a directory, a hidden file starting with . or _, or an LZO index file.
 * If the file can be found in the old file description map, has not been modified, and
 * the partition is not marked as cached ('isMarkedCached'), the cached entry is reused.
 * Otherwise a new FileDescriptor is created from the file name, file length and
 * modification time.
 *
 * Must be thread safe. Access to 'oldFileDescMap', 'perFsFileBlocks' and 'hostIndex'
 * must be protected.
 */
private static FileDescriptor getFileDescriptor(FileSystem fs, FileStatus fileStatus,
        HdfsFileFormat fileFormat, Map<String, List<FileDescriptor>> oldFileDescMap,
        boolean isMarkedCached, Map<FsKey, FileBlocksInfo> perFsFileBlocks, String tblName,
        ListMap<TNetworkAddress> hostIndex) {
    String fileName = fileStatus.getPath().getName().toString();
    if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName)
            || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) {
        // Ignore directories, hidden files starting with . or _, and LZO index files.
        // If a directory is erroneously created as a subdirectory of a partition dir
        // we should ignore it and move on. Hive will not recurse into directories.
        // Skip index files, these are read by the LZO scanner directly.
        return null;
    }
    String partitionDir = fileStatus.getPath().getParent().toString();
    FileDescriptor fd = null;
    // Search for a FileDescriptor with the same partition dir and file name. If one
    // is found, it will be chosen as a candidate to reuse.
    if (oldFileDescMap != null) {
        synchronized (oldFileDescMap) {
            if (oldFileDescMap.get(partitionDir) != null) {
                for (FileDescriptor oldFileDesc : oldFileDescMap.get(partitionDir)) {
                    // TODO: This doesn't seem like the right data structure if a directory has
                    // a lot of files.
                    if (oldFileDesc.getFileName().equals(fileName)) {
                        fd = oldFileDesc;
                        break;
                    }
                }
            }
        }
    }
    // Check if this FileDescriptor has been modified since last loading its block
    // location information. If it has not been changed, the previously loaded value can
    // be reused.
    if (fd == null || isMarkedCached || fd.getFileLength() != fileStatus.getLen()
            || fd.getModificationTime() != fileStatus.getModificationTime()) {
        // Create a new file descriptor and load the file block metadata,
        // collecting the block metadata into perFsFileBlocks. The disk IDs for
        // all the blocks of each filesystem will be loaded by loadDiskIds().
        fd = new FileDescriptor(fileName, fileStatus.getLen(), fileStatus.getModificationTime());
        loadBlockMetadata(fs, fileStatus, fd, fileFormat, perFsFileBlocks, tblName, hostIndex);
    }
    return fd;
}
From source file:com.cloudera.impala.util.LoadMetadataUtil.java
License:Apache License
/**
 * Queries the filesystem to load the file block metadata (e.g. DFS blocks) for the
 * given file. Adds the newly created block metadata and block location to the
 * perFsFileBlocks, so that the storage IDs for each block can be retrieved from
 * BlockLocation.
 *
 * Must be threadsafe. Access to 'perFsFileBlocks' and 'hostIndex' must be protected.
 */
private static void loadBlockMetadata(FileSystem fs, FileStatus file, FileDescriptor fd,
        HdfsFileFormat fileFormat, Map<FsKey, FileBlocksInfo> perFsFileBlocks, String tblName,
        ListMap<TNetworkAddress> hostIndex) {
    Preconditions.checkNotNull(fd);
    Preconditions.checkNotNull(perFsFileBlocks);
    Preconditions.checkArgument(!file.isDirectory());
    LOG.debug("load block md for " + tblName + " file " + fd.getFileName());
    if (!FileSystemUtil.hasGetFileBlockLocations(fs)) {
        synthesizeBlockMetadata(file, fd, fileFormat, hostIndex);
        return;
    }
    try {
        BlockLocation[] locations = null;
        if (file instanceof LocatedFileStatus) {
            locations = ((LocatedFileStatus) file).getBlockLocations();
        } else {
            locations = fs.getFileBlockLocations(file, 0, file.getLen());
        }
        Preconditions.checkNotNull(locations);
        // Loop over all blocks in the file.
        for (BlockLocation loc : locations) {
            Preconditions.checkNotNull(loc);
            fd.addFileBlock(createFileBlock(loc, hostIndex));
        }
        // Remember the THdfsFileBlocks and corresponding BlockLocations. Once all the
        // blocks are collected, the disk IDs will be queried in one batch per filesystem.
        FsKey fsKey = new FsKey(fs);
        synchronized (perFsFileBlocks) {
            FileBlocksInfo infos = perFsFileBlocks.get(fsKey);
            if (infos == null) {
                infos = new FileBlocksInfo();
                perFsFileBlocks.put(fsKey, infos);
            }
            infos.addBlocks(fd.getFileBlocks(), Arrays.asList(locations));
        }
    } catch (IOException e) {
        throw new RuntimeException("couldn't determine block locations for path '" + file.getPath()
                + "':\n" + e.getMessage(), e);
    }
}
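As an aside not taken from the sources above: when block locations are needed for every file, FileSystem.listFiles(path, recursive) returns LocatedFileStatus objects that already carry their block locations and never describe directories, so the isDirectory() guard and the separate getFileBlockLocations() call can be avoided on that path. A minimal sketch:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListWithLocations {
    // listFiles(dir, true) recurses and yields files only (never directories),
    // each already populated with its block locations.
    public static void list(Path dir) throws IOException {
        FileSystem fs = dir.getFileSystem(new Configuration());
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(dir, true);
        while (it.hasNext()) {
            LocatedFileStatus status = it.next();
            System.out.println(status.getPath() + " blocks=" + status.getBlockLocations().length);
        }
    }
}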