List of usage examples for org.apache.hadoop.fs.FileSystem#exists
public boolean exists(Path f) throws IOException
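A minimal sketch of the call in isolation (the path and configuration are illustrative assumptions, not taken from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ExistsExample {
    public static void main(String[] args) throws Exception {
        Path path = new Path("/tmp/example.txt"); // hypothetical path
        FileSystem fs = path.getFileSystem(new Configuration());
        if (fs.exists(path)) {
            System.out.println(path + " exists");
        } else {
            System.out.println(path + " does not exist");
        }
    }
}

The default implementation of exists() calls getFileStatus() and maps FileNotFoundException to false, so check-then-act sequences built on it are subject to races; several of the examples below illustrate this.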
From source file: com.cloudera.impala.catalog.HdfsTable.java
License: Apache License

/**
 * Creates a new HdfsPartition object to be added to the internal partition list.
 * Populates with file format information and file locations. Partitions may be empty,
 * or may not even exist on the file system (a partition's location may have been
 * changed to a new path that is about to be created by an INSERT). For unchanged
 * files (indicated by unchanged mtime), reuses the FileDescriptor from the
 * oldFileDescMap. The one exception is if the partition is marked as cached,
 * in which case the block metadata cannot be reused. Otherwise, creates a new
 * FileDescriptor for each modified or new file and adds it to newFileDescMap.
 * Both old and newFileDescMap are Maps of parent directory (partition location)
 * to list of files (FileDescriptors) under that directory.
 * Returns the new partition if successful or null if none was added.
 * Separated from addPartition to reduce the number of operations done
 * while holding the lock on the hdfs table.
 * @throws CatalogException
 *           if the supplied storage descriptor contains metadata that Impala can't
 *           understand.
 */
private HdfsPartition createPartition(StorageDescriptor storageDescriptor,
        org.apache.hadoop.hive.metastore.api.Partition msPartition,
        Map<String, List<FileDescriptor>> oldFileDescMap,
        Map<FsKey, Map<String, List<FileDescriptor>>> perFsFileDescMap) throws CatalogException {
    HdfsStorageDescriptor fileFormatDescriptor =
            HdfsStorageDescriptor.fromStorageDescriptor(this.name_, storageDescriptor);
    Path partDirPath = new Path(storageDescriptor.getLocation());
    List<FileDescriptor> fileDescriptors = Lists.newArrayList();
    // If the partition is marked as cached, the block location metadata must be
    // reloaded, even if the file times have not changed.
    boolean isMarkedCached = isMarkedCached_;
    List<LiteralExpr> keyValues = Lists.newArrayList();
    if (msPartition != null) {
        isMarkedCached =
                HdfsCachingUtil.getCacheDirIdFromParams(msPartition.getParameters()) != null;
        // Load key values
        for (String partitionKey : msPartition.getValues()) {
            Type type = getColumns().get(keyValues.size()).getType();
            // Deal with Hive's special NULL partition key.
            if (partitionKey.equals(nullPartitionKeyValue_)) {
                keyValues.add(NullLiteral.create(type));
            } else {
                try {
                    keyValues.add(LiteralExpr.create(partitionKey, type));
                } catch (Exception ex) {
                    LOG.warn("Failed to create literal expression of type: " + type, ex);
                    throw new CatalogException("Invalid partition key value of type: " + type, ex);
                }
            }
        }
        try {
            Expr.analyze(keyValues, null);
        } catch (AnalysisException e) {
            // should never happen
            throw new IllegalStateException(e);
        }
    }
    try {
        // Each partition could reside on a different filesystem.
        FileSystem fs = partDirPath.getFileSystem(CONF);
        multipleFileSystems_ = multipleFileSystems_
                || !FileSystemUtil.isPathOnFileSystem(new Path(getLocation()), fs);
        if (fs.exists(partDirPath)) {
            // FileSystem does not have an API that takes in a timestamp and returns a list
            // of files that have been added/changed since. Therefore, we are calling
            // fs.listStatus() to list all the files.
            for (FileStatus fileStatus : fs.listStatus(partDirPath)) {
                String fileName = fileStatus.getPath().getName().toString();
                if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName)
                        || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) {
                    // Ignore directories, hidden files starting with . or _, and LZO index
                    // files. If a directory is erroneously created as a subdirectory of a
                    // partition dir, we should ignore it and move on. Hive will not recurse
                    // into directories. Skip index files; these are read by the LZO scanner
                    // directly.
                    continue;
                }
                String partitionDir = fileStatus.getPath().getParent().toString();
                FileDescriptor fd = null;
                // Search for a FileDescriptor with the same partition dir and file name.
                // If one is found, it will be chosen as a candidate to reuse.
                if (oldFileDescMap != null && oldFileDescMap.get(partitionDir) != null) {
                    for (FileDescriptor oldFileDesc : oldFileDescMap.get(partitionDir)) {
                        if (oldFileDesc.getFileName().equals(fileName)) {
                            fd = oldFileDesc;
                            break;
                        }
                    }
                }
                // Check if this FileDescriptor has been modified since last loading its
                // block location information. If it has not been changed, the previously
                // loaded value can be reused.
                if (fd == null || isMarkedCached || fd.getFileLength() != fileStatus.getLen()
                        || fd.getModificationTime() != fileStatus.getModificationTime()) {
                    // Create a new file descriptor; the block metadata will be populated
                    // by loadBlockMd.
                    fd = new FileDescriptor(fileName, fileStatus.getLen(),
                            fileStatus.getModificationTime());
                    addPerFsFileDesc(perFsFileDescMap, fs, partitionDir, fd);
                }
                List<FileDescriptor> fds = fileDescMap_.get(partitionDir);
                if (fds == null) {
                    fds = Lists.newArrayList();
                    fileDescMap_.put(partitionDir, fds);
                }
                fds.add(fd);
                // Add to the list of FileDescriptors for this partition.
                fileDescriptors.add(fd);
            }
            numHdfsFiles_ += fileDescriptors.size();
        }
        HdfsPartition partition = new HdfsPartition(this, msPartition, keyValues,
                fileFormatDescriptor, fileDescriptors, getAvailableAccessLevel(fs, partDirPath));
        partition.checkWellFormed();
        return partition;
    } catch (Exception e) {
        throw new CatalogException("Failed to create partition: ", e);
    }
}
From source file: com.cloudera.impala.catalog.TestLoadHdfsMetadataPerf.java
License: Apache License

/**
 * List file status by calling fileSystem.listStatus.
 */
private static void listStatus(String dirPath) {
    Path path = new Path(dirPath);
    boolean exceptionThrown = false;
    try {
        FileSystem fs = path.getFileSystem(LoadMetadataUtil.getConf());
        // Check exists() before listing: listStatus() throws FileNotFoundException
        // if the path is missing.
        if (fs.exists(path)) {
            for (FileStatus status : fs.listStatus(path)) {
                BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());
                for (BlockLocation loc : locations) {
                    loc.getNames();
                    loc.getHosts();
                }
            }
        }
    } catch (IOException e) {
        exceptionThrown = true;
        LOG.error("Failed to list Status", e);
    }
    assertFalse(exceptionThrown);
}
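An exists() check followed by listStatus() makes two round trips and is racy: the directory can be deleted between the two calls. A common alternative is to call listStatus() directly and treat FileNotFoundException as the missing-path case. A sketch, with an illustrative path:

import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListWithoutExists {
    public static void main(String[] args) throws IOException {
        Path dir = new Path("/tmp/somedir"); // hypothetical path
        FileSystem fs = dir.getFileSystem(new Configuration());
        try {
            // One round trip instead of two; no window between check and use.
            for (FileStatus status : fs.listStatus(dir)) {
                System.out.println(status.getPath());
            }
        } catch (FileNotFoundException e) {
            // Equivalent to the fs.exists(dir) == false branch above.
            System.out.println(dir + " does not exist");
        }
    }
}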
From source file: com.cloudera.impala.catalog.TestLoadHdfsMetadataPerf.java
License: Apache License

/**
 * List file status by calling fileSystem.listLocatedStatus.
 */
private static void listLocatedStatus(String dirPath) {
    Path path = new Path(dirPath);
    boolean exceptionThrown = false;
    try {
        FileSystem fs = path.getFileSystem(LoadMetadataUtil.getConf());
        // Check exists() before listing: listLocatedStatus() throws
        // FileNotFoundException if the path is missing.
        if (fs.exists(path)) {
            RemoteIterator<LocatedFileStatus> iterator = fs.listLocatedStatus(path);
            while (iterator.hasNext()) {
                LocatedFileStatus fileStatus = iterator.next();
                BlockLocation[] locations = fileStatus.getBlockLocations();
                for (BlockLocation loc : locations) {
                    loc.getHosts();
                    loc.getNames();
                }
            }
        }
    } catch (IOException e) {
        exceptionThrown = true;
        LOG.error("Failed to list Located Status", e);
    }
    assertFalse(exceptionThrown);
}
From source file: com.cloudera.impala.common.FileSystemUtil.java
License: Apache License

/**
 * Moves all visible (non-hidden) files from a source directory to a destination
 * directory. Any sub-directories within the source directory are skipped.
 * Returns the number of files moved as part of this operation.
 */
public static int moveAllVisibleFiles(Path sourceDir, Path destDir) throws IOException {
    FileSystem fs = destDir.getFileSystem(CONF);
    Preconditions.checkState(fs.isDirectory(destDir));
    Preconditions.checkState(fs.isDirectory(sourceDir));

    // Use the same UUID to resolve all file name conflicts. This helps mitigate problems
    // that might happen if there is a conflict moving a set of files that have
    // dependent file names. For example, foo.lzo and foo.lzo_index.
    UUID uuid = UUID.randomUUID();

    // Enumerate all the files in the source
    int numFilesMoved = 0;
    for (FileStatus fStatus : fs.listStatus(sourceDir)) {
        if (fStatus.isDirectory()) {
            LOG.debug("Skipping copy of directory: " + fStatus.getPath());
            continue;
        } else if (isHiddenFile(fStatus.getPath().getName())) {
            continue;
        }

        Path destFile = new Path(destDir, fStatus.getPath().getName());
        if (fs.exists(destFile)) {
            destFile = new Path(destDir,
                    appendToBaseFileName(destFile.getName(), uuid.toString()));
        }
        FileSystemUtil.moveFile(fStatus.getPath(), destFile, false);
        ++numFilesMoved;
    }
    return numFilesMoved;
}
From source file: com.cloudera.impala.common.FileSystemUtil.java
License: Apache License

/**
 * Moves (renames) the given file to a new location (either another directory or a
 * file). If renameIfAlreadyExists is true, no error will be thrown if a file with the
 * same name already exists in the destination location. Instead, a UUID will be
 * appended to the base file name, preserving the existing file extension.
 * If renameIfAlreadyExists is false, an IOException will be thrown if there is a
 * file name conflict.
 */
public static void moveFile(Path sourceFile, Path dest, boolean renameIfAlreadyExists)
        throws IOException {
    FileSystem fs = dest.getFileSystem(CONF);
    Path destFile = fs.isDirectory(dest) ? new Path(dest, sourceFile.getName()) : dest;
    // If a file with the same name does not already exist in the destination location
    // then use the same file name. Otherwise, generate a unique file name.
    if (renameIfAlreadyExists && fs.exists(destFile)) {
        Path destDir = fs.isDirectory(dest) ? dest : dest.getParent();
        destFile = new Path(destDir,
                appendToBaseFileName(destFile.getName(), UUID.randomUUID().toString()));
    }
    LOG.debug(String.format("Moving '%s' to '%s'", sourceFile.toString(), destFile.toString()));
    // Move (rename) the file.
    fs.rename(sourceFile, destFile);
}
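One caveat about the final step above: FileSystem.rename() reports most failures by returning false rather than throwing, so the result of the rename in moveFile is silently discarded. A sketch of a checked variant (the helper is hypothetical, not part of FileSystemUtil):

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class RenameUtil {
    // Hypothetical helper: rename and surface failure as an IOException.
    static void renameOrThrow(FileSystem fs, Path src, Path dst) throws IOException {
        if (!fs.rename(src, dst)) {
            throw new IOException("Failed to rename " + src + " to " + dst);
        }
    }
}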
From source file: com.cloudera.nav.plugin.client.writer.MetadataWriterFactory.java
License: Apache License

private OutputStream createHdfsStream(PluginConfigurations config) {
    try {
        FileSystem fs = FileSystem.get(config.getHadoopConfigurations());
        Path path = new Path(getFilePath(config.getMetadataParentUriString()));
        if (fs.exists(path)) {
            return fs.append(path);
        }
        // TODO block sizes, replication counts etc
        return fs.create(path);
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}
From source file: com.cloudera.nav.sdk.client.writer.MetadataWriterFactory.java
License: Apache License

private OutputStream createHdfsStream() {
    try {
        FileSystem fs = FileSystem.get(config.getHadoopConfigurations());
        Path path = new Path(getFilePath(config.getMetadataParentUriString()));
        if (fs.exists(path)) {
            return fs.append(path);
        }
        // TODO block sizes, replication counts etc
        return fs.create(path);
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}
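In both versions of createHdfsStream, fs.exists() decides between append() and create(). Because the file can appear or vanish between the check and the call, an alternative is to attempt the append first and fall back when the file is missing. A sketch, assuming the target filesystem supports append() and raises FileNotFoundException for an absent file (as HDFS does); the helper is hypothetical:

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class AppendOrCreate {
    // Hypothetical helper: append if the file already exists, otherwise create it.
    static OutputStream openForAppendOrCreate(FileSystem fs, Path path) throws IOException {
        try {
            return fs.append(path);
        } catch (FileNotFoundException e) {
            // No existing file to append to; create a new one.
            return fs.create(path);
        }
    }
}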
From source file: com.cloudera.oryx.ml.MLUpdate.java
License: Open Source License

@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp,
        JavaPairRDD<Object, M> newKeyMessageData, JavaPairRDD<Object, M> pastKeyMessageData,
        String modelDirString, TopicProducer<String, String> modelUpdateTopic)
        throws IOException, InterruptedException {

    Objects.requireNonNull(newKeyMessageData);

    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {
        });
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {
        });
    }

    List<HyperParamValues<?>> hyperParamValues = getHyperParameterValues();
    int valuesPerHyperParam =
            HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues,
            candidates, valuesPerHyperParam);

    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);

    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData,
            hyperParameterCombos, candidatesPath);

    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);

    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required
                // because it's going to be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }
            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }
            if (modelNeededForUpdates) {
                publishAdditionalModelData(sparkContext, bestModel, newData, pastData, finalPath,
                        modelUpdateTopic);
            }
        }
    }

    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}
From source file: com.cloudera.oryx.ml.MLUpdate.java
License: Open Source License

private Path findBestCandidatePath(JavaSparkContext sparkContext, JavaRDD<M> newData,
        JavaRDD<M> pastData, List<List<?>> hyperParameterCombos, Path candidatesPath)
        throws IOException {
    Map<Path, Double> pathToEval = ExecUtils.collectInParallel(candidates,
            Math.min(evalParallelism, candidates), true,
            i -> buildAndEval(i, hyperParameterCombos, sparkContext, newData, pastData,
                    candidatesPath),
            Collectors.toMap(Pair::getFirst, Pair::getSecond));

    FileSystem fs = null;
    Path bestCandidatePath = null;
    double bestEval = Double.NEGATIVE_INFINITY;
    for (Map.Entry<Path, Double> pathEval : pathToEval.entrySet()) {
        Path path = pathEval.getKey();
        if (fs == null) {
            fs = FileSystem.get(path.toUri(), sparkContext.hadoopConfiguration());
        }
        if (path != null && fs.exists(path)) {
            Double eval = pathEval.getValue();
            if (!Double.isNaN(eval)) {
                // Valid evaluation; if it's the best so far, keep it
                if (eval > bestEval) {
                    log.info("Best eval / model path is now {} / {}", eval, path);
                    bestEval = eval;
                    bestCandidatePath = path;
                }
            } else if (bestCandidatePath == null && testFraction == 0.0) {
                // Normal case when eval is disabled; no eval is possible, but keep the one
                // model that was built
                bestCandidatePath = path;
            }
        } // else can't do anything; no model at all
    }
    if (threshold != null && bestEval < threshold) {
        log.info("Best model at {} had eval {}, but did not exceed threshold {}; discarding model",
                bestCandidatePath, bestEval, threshold);
        bestCandidatePath = null;
    }
    return bestCandidatePath;
}
From source file: com.cloudera.recordbreaker.analyzer.DataQuery.java
License: Open Source License

String grabTable(DataDescriptor desc) throws SQLException, IOException {
    // Set up Hive table
    Path p = desc.getFilename();
    String tablename = tableCache.get(p);
    if (tablename == null) {
        tablename = "datatable" + Math.abs(r.nextInt());
        Statement stmt = hiveCon.createStatement();
        try {
            String creatTxt = desc.getHiveCreateTableStatement(tablename);
            LOG.info("Create: " + creatTxt);
            stmt.execute(creatTxt);
            tables.put(p, tablename);
        } finally {
            stmt.close();
        }

        // Copy avro version of data into secret location prior to Hive import
        FileSystem fs = FileSystem.get(conf);
        Path tmpTables = new Path(tmpTablesDir);
        if (!fs.exists(tmpTables)) {
            fs.mkdirs(tmpTables, new FsPermission("-rwxrwxrwx"));
        }
        Path secretDst = new Path(tmpTables, "r" + r.nextInt());
        LOG.info("Preparing Avro data at " + secretDst);
        desc.prepareAvroFile(fs, fs, secretDst, conf);
        fs.setPermission(secretDst, new FsPermission("-rwxrwxrwx"));

        // Import data
        stmt = hiveCon.createStatement();
        try {
            LOG.info("Import data into Hive: "
                    + desc.getHiveImportDataStatement(tablename, secretDst));
            stmt.execute(desc.getHiveImportDataStatement(tablename, secretDst));
            isLoaded.add(p);
        } finally {
            stmt.close();
        }

        // Refresh impala metadata
        stmt = impalaCon.createStatement();
        try {
            try {
                LOG.info("Rebuilding Impala metadata...");
                stmt.execute("INVALIDATE METADATA");
            } catch (Exception iex) {
                LOG.info("Impala metadata rebuild failed: " + iex.toString());
            }
        } finally {
            stmt.close();
        }

        // Insert into table cache
        tableCache.put(p, tablename);
    }
    return tablename;
}
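A closing note on the exists() + mkdirs() pairing seen in grabTable above: mkdirs() has mkdir -p semantics, so an existing directory is not an error and the pre-check can be dropped. A sketch under that assumption (the helper name and permissions are illustrative; the permission argument only takes effect when the directory is actually created):

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;

final class EnsureDir {
    // Hypothetical helper: create the directory if needed, without an exists() round trip.
    static void ensureDir(FileSystem fs, Path dir) throws IOException {
        if (!fs.mkdirs(dir, FsPermission.valueOf("-rwxrwxrwx"))) {
            throw new IOException("Could not create directory " + dir);
        }
    }
}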