Example usage for org.apache.hadoop.fs FileSystem exists

List of usage examples for org.apache.hadoop.fs FileSystem exists

Introduction

This page collects usage examples for the org.apache.hadoop.fs.FileSystem method exists(Path).

Prototype

public boolean exists(Path f) throws IOException 

Document

Check if a path exists.
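
For quick reference, here is a minimal, self-contained sketch of the basic call pattern shared by the examples below: resolve the FileSystem that owns a Path, then call exists(Path) before reading it. The class name and path are illustrative only, not taken from the sources listed under Usage.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemExistsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Illustrative path; substitute a real HDFS or local path.
        Path path = new Path("/tmp/example/data.txt");
        // Resolve the FileSystem that owns this path (HDFS, local, S3A, ...).
        FileSystem fs = path.getFileSystem(conf);
        if (fs.exists(path)) {
            System.out.println(path + " exists, length = " + fs.getFileStatus(path).getLen());
        } else {
            System.out.println(path + " does not exist");
        }
    }
}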

Usage

From source file:com.cloudera.impala.catalog.HdfsTable.java

License:Apache License

/**
 * Creates a new HdfsPartition object to be added to the internal partition list.
 * Populates with file format information and file locations. Partitions may be empty,
 * or may not even exist on the file system (a partition's location may have been
 * changed to a new path that is about to be created by an INSERT). For unchanged
 * files (indicated by unchanged mtime), reuses the FileDescriptor from the
 * oldFileDescMap. The one exception is if the partition is marked as cached
 * in which case the block metadata cannot be reused. Otherwise, creates a new
 * FileDescriptor for each modified or new file and adds it to newFileDescMap.
 * Both old and newFileDescMap are Maps of parent directory (partition location)
 * to list of files (FileDescriptors) under that directory.
 * Returns new partition if successful or null if none was added.
 * Separated from addPartition to reduce the number of operations done
 * while holding the lock on the hdfs table.
 *  @throws CatalogException
 *    if the supplied storage descriptor contains metadata that Impala can't
 *    understand.
 */
private HdfsPartition createPartition(StorageDescriptor storageDescriptor,
        org.apache.hadoop.hive.metastore.api.Partition msPartition,
        Map<String, List<FileDescriptor>> oldFileDescMap,
        Map<FsKey, Map<String, List<FileDescriptor>>> perFsFileDescMap) throws CatalogException {
    HdfsStorageDescriptor fileFormatDescriptor = HdfsStorageDescriptor.fromStorageDescriptor(this.name_,
            storageDescriptor);
    Path partDirPath = new Path(storageDescriptor.getLocation());
    List<FileDescriptor> fileDescriptors = Lists.newArrayList();
    // If the partition is marked as cached, the block location metadata must be
    // reloaded, even if the file times have not changed.
    boolean isMarkedCached = isMarkedCached_;
    List<LiteralExpr> keyValues = Lists.newArrayList();
    if (msPartition != null) {
        isMarkedCached = HdfsCachingUtil.getCacheDirIdFromParams(msPartition.getParameters()) != null;
        // Load key values
        for (String partitionKey : msPartition.getValues()) {
            Type type = getColumns().get(keyValues.size()).getType();
            // Deal with Hive's special NULL partition key.
            if (partitionKey.equals(nullPartitionKeyValue_)) {
                keyValues.add(NullLiteral.create(type));
            } else {
                try {
                    keyValues.add(LiteralExpr.create(partitionKey, type));
                } catch (Exception ex) {
                    LOG.warn("Failed to create literal expression of type: " + type, ex);
                    throw new CatalogException("Invalid partition key value of type: " + type, ex);
                }
            }
        }
        try {
            Expr.analyze(keyValues, null);
        } catch (AnalysisException e) {
            // should never happen
            throw new IllegalStateException(e);
        }
    }
    try {
        // Each partition could reside on a different filesystem.
        FileSystem fs = partDirPath.getFileSystem(CONF);
        multipleFileSystems_ = multipleFileSystems_
                || !FileSystemUtil.isPathOnFileSystem(new Path(getLocation()), fs);
        if (fs.exists(partDirPath)) {
            // FileSystem does not have an API that takes in a timestamp and returns a list
            // of files that has been added/changed since. Therefore, we are calling
            // fs.listStatus() to list all the files.
            for (FileStatus fileStatus : fs.listStatus(partDirPath)) {
                String fileName = fileStatus.getPath().getName().toString();
                if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName)
                        || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) {
                    // Ignore directory, hidden file starting with . or _, and LZO index files
                    // If a directory is erroneously created as a subdirectory of a partition dir
                    // we should ignore it and move on. Hive will not recurse into directories.
                    // Skip index files, these are read by the LZO scanner directly.
                    continue;
                }

                String partitionDir = fileStatus.getPath().getParent().toString();
                FileDescriptor fd = null;
                // Search for a FileDescriptor with the same partition dir and file name. If one
                // is found, it will be chosen as a candidate to reuse.
                if (oldFileDescMap != null && oldFileDescMap.get(partitionDir) != null) {
                    for (FileDescriptor oldFileDesc : oldFileDescMap.get(partitionDir)) {
                        if (oldFileDesc.getFileName().equals(fileName)) {
                            fd = oldFileDesc;
                            break;
                        }
                    }
                }

                // Check if this FileDescriptor has been modified since last loading its block
                // location information. If it has not been changed, the previously loaded
                // value can be reused.
                if (fd == null || isMarkedCached || fd.getFileLength() != fileStatus.getLen()
                        || fd.getModificationTime() != fileStatus.getModificationTime()) {
                    // Create a new file descriptor, the block metadata will be populated by
                    // loadBlockMd.
                    fd = new FileDescriptor(fileName, fileStatus.getLen(), fileStatus.getModificationTime());
                    addPerFsFileDesc(perFsFileDescMap, fs, partitionDir, fd);
                }

                List<FileDescriptor> fds = fileDescMap_.get(partitionDir);
                if (fds == null) {
                    fds = Lists.newArrayList();
                    fileDescMap_.put(partitionDir, fds);
                }
                fds.add(fd);

                // Add to the list of FileDescriptors for this partition.
                fileDescriptors.add(fd);
            }
            numHdfsFiles_ += fileDescriptors.size();
        }
        HdfsPartition partition = new HdfsPartition(this, msPartition, keyValues, fileFormatDescriptor,
                fileDescriptors, getAvailableAccessLevel(fs, partDirPath));
        partition.checkWellFormed();
        return partition;
    } catch (Exception e) {
        throw new CatalogException("Failed to create partition: ", e);
    }
}

From source file:com.cloudera.impala.catalog.TestLoadHdfsMetadataPerf.java

License:Apache License

/**
 * List file status by calling fileSystem.listStatus.
 */
private static void listStatus(String dirPath) {
    Path path = new Path(dirPath);
    boolean exceptionThrown = false;
    try {
        FileSystem fs = path.getFileSystem(LoadMetadataUtil.getConf());
        FileStatus[] fileStatus = fs.listStatus(path);
        if (fs.exists(path)) {
            for (FileStatus status : fileStatus) {
                BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());
                for (BlockLocation loc : locations) {
                    loc.getNames();
                    loc.getHosts();
                }
            }
        }
    } catch (IOException e) {
        exceptionThrown = true;
        LOG.error("Failed to list Status", e);
    }
    assertFalse(exceptionThrown);
}

From source file:com.cloudera.impala.catalog.TestLoadHdfsMetadataPerf.java

License:Apache License

/**
 * List file status by calling fileSystem.listLocatedStatus.
 */
private static void listLocatedStatus(String dirPath) {
    Path path = new Path(dirPath);
    boolean exceptionThrown = false;
    try {
        FileSystem fs = path.getFileSystem(LoadMetadataUtil.getConf());
        RemoteIterator<LocatedFileStatus> iterator = fs.listLocatedStatus(path);
        if (fs.exists(path)) {
            while (iterator.hasNext()) {
                LocatedFileStatus fileStatus = iterator.next();
                BlockLocation[] locations = fileStatus.getBlockLocations();
                for (BlockLocation loc : locations) {
                    loc.getHosts();
                    loc.getNames();
                }
            }
        }
    } catch (IOException e) {
        exceptionThrown = true;
        LOG.error("Failed to list Located Status", e);
    }
    assertFalse(exceptionThrown);
}

From source file:com.cloudera.impala.common.FileSystemUtil.java

License:Apache License

/**
 * Moves all visible (non-hidden) files from a source directory to a destination
 * directory. Any sub-directories within the source directory are skipped.
 * Returns the number of files moved as part of this operation.
 */
public static int moveAllVisibleFiles(Path sourceDir, Path destDir) throws IOException {
    FileSystem fs = destDir.getFileSystem(CONF);
    Preconditions.checkState(fs.isDirectory(destDir));
    Preconditions.checkState(fs.isDirectory(sourceDir));

    // Use the same UUID to resolve all file name conflicts. This helps mitigate problems
    // that might happen if there is a conflict moving a set of files that have
    // dependent file names. For example, foo.lzo and foo.lzo_index.
    UUID uuid = UUID.randomUUID();

    // Enumerate all the files in the source
    int numFilesMoved = 0;
    for (FileStatus fStatus : fs.listStatus(sourceDir)) {
        if (fStatus.isDirectory()) {
            LOG.debug("Skipping copy of directory: " + fStatus.getPath());
            continue;
        } else if (isHiddenFile(fStatus.getPath().getName())) {
            continue;
        }

        Path destFile = new Path(destDir, fStatus.getPath().getName());
        if (fs.exists(destFile)) {
            destFile = new Path(destDir, appendToBaseFileName(destFile.getName(), uuid.toString()));
        }
        FileSystemUtil.moveFile(fStatus.getPath(), destFile, false);
        ++numFilesMoved;
    }
    return numFilesMoved;
}

From source file:com.cloudera.impala.common.FileSystemUtil.java

License:Apache License

/**
 * Moves (renames) the given file to a new location (either another directory or a
 * file). If renameIfAlreadyExists is true, no error will be thrown if a file with the
 * same name already exists in the destination location. Instead, a UUID will be
 * appended to the base file name, preserving the existing file extension.
 * If renameIfAlreadyExists is false, an IOException will be thrown if there is a
 * file name conflict.
 */
public static void moveFile(Path sourceFile, Path dest, boolean renameIfAlreadyExists) throws IOException {
    FileSystem fs = dest.getFileSystem(CONF);

    Path destFile = fs.isDirectory(dest) ? new Path(dest, sourceFile.getName()) : dest;
    // If a file with the same name does not already exist in the destination location
    // then use the same file name. Otherwise, generate a unique file name.
    if (renameIfAlreadyExists && fs.exists(destFile)) {
        Path destDir = fs.isDirectory(dest) ? dest : dest.getParent();
        destFile = new Path(destDir, appendToBaseFileName(destFile.getName(), UUID.randomUUID().toString()));
    }
    LOG.debug(String.format("Moving '%s' to '%s'", sourceFile.toString(), destFile.toString()));
    // Move (rename) the file.
    fs.rename(sourceFile, destFile);
}

From source file:com.cloudera.nav.plugin.client.writer.MetadataWriterFactory.java

License:Apache License

private OutputStream createHdfsStream(PluginConfigurations config) {
    try {
        FileSystem fs = FileSystem.get(config.getHadoopConfigurations());
        Path path = new Path(getFilePath(config.getMetadataParentUriString()));
        if (fs.exists(path)) {
            return fs.append(path);
        }
        // TODO block sizes, replication counts etc
        return fs.create(path);
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}

From source file:com.cloudera.nav.sdk.client.writer.MetadataWriterFactory.java

License:Apache License

private OutputStream createHdfsStream() {
    try {
        FileSystem fs = FileSystem.get(config.getHadoopConfigurations());
        Path path = new Path(getFilePath(config.getMetadataParentUriString()));
        if (fs.exists(path)) {
            return fs.append(path);
        }
        // TODO block sizes, replication counts etc
        return fs.create(path);
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}
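
Both MetadataWriterFactory variants above follow the same exists-then-append-or-create pattern. A minimal standalone sketch of that pattern is shown next; the class name and path are hypothetical, and the block-size/replication tuning left as a TODO in the originals is omitted.

import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class AppendOrCreateSketch {
    // Open path for appending if it already exists, otherwise create it.
    // Note: not every FileSystem implementation supports append(), so this
    // sketch assumes one that does (such as HDFS).
    static OutputStream appendOrCreate(FileSystem fs, Path path) throws IOException {
        if (fs.exists(path)) {
            return fs.append(path);
        }
        return fs.create(path);
    }

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example/metadata.log"); // hypothetical path
        FileSystem fs = path.getFileSystem(conf);
        try (OutputStream out = appendOrCreate(fs, path)) {
            out.write("example record\n".getBytes(StandardCharsets.UTF_8));
        }
    }
}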

From source file:com.cloudera.oryx.ml.MLUpdate.java

License:Open Source License

@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp, JavaPairRDD<Object, M> newKeyMessageData,
        JavaPairRDD<Object, M> pastKeyMessageData, String modelDirString,
        TopicProducer<String, String> modelUpdateTopic) throws IOException, InterruptedException {

    Objects.requireNonNull(newKeyMessageData);

    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {
        });
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {
        });
    }

    List<HyperParamValues<?>> hyperParamValues = getHyperParameterValues();
    int valuesPerHyperParam = HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues, candidates,
            valuesPerHyperParam);

    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);

    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData, hyperParameterCombos,
            candidatesPath);

    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);

    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required because it's going to
                // be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }

            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }

            if (modelNeededForUpdates) {
                publishAdditionalModelData(sparkContext, bestModel, newData, pastData, finalPath,
                        modelUpdateTopic);
            }
        }
    }

    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}

From source file:com.cloudera.oryx.ml.MLUpdate.java

License:Open Source License

private Path findBestCandidatePath(JavaSparkContext sparkContext, JavaRDD<M> newData, JavaRDD<M> pastData,
        List<List<?>> hyperParameterCombos, Path candidatesPath) throws IOException {
    Map<Path, Double> pathToEval = ExecUtils.collectInParallel(candidates,
            Math.min(evalParallelism, candidates), true,
            i -> buildAndEval(i, hyperParameterCombos, sparkContext, newData, pastData, candidatesPath),
            Collectors.toMap(Pair::getFirst, Pair::getSecond));

    FileSystem fs = null;
    Path bestCandidatePath = null;
    double bestEval = Double.NEGATIVE_INFINITY;
    for (Map.Entry<Path, Double> pathEval : pathToEval.entrySet()) {
        Path path = pathEval.getKey();
        if (fs == null) {
            fs = FileSystem.get(path.toUri(), sparkContext.hadoopConfiguration());
        }
        if (path != null && fs.exists(path)) {
            Double eval = pathEval.getValue();
            if (!Double.isNaN(eval)) {
                // Valid evaluation; if it's the best so far, keep it
                if (eval > bestEval) {
                    log.info("Best eval / model path is now {} / {}", eval, path);
                    bestEval = eval;
                    bestCandidatePath = path;
                }
            } else if (bestCandidatePath == null && testFraction == 0.0) {
                // Normal case when eval is disabled; no eval is possible, but keep the one model
                // that was built
                bestCandidatePath = path;
            }
        } // else can't do anything; no model at all
    }
    if (threshold != null && bestEval < threshold) {
        log.info("Best model at {} had eval {}, but did not exceed threshold {}; discarding model",
                bestCandidatePath, bestEval, threshold);
        bestCandidatePath = null;
    }
    return bestCandidatePath;
}

From source file:com.cloudera.recordbreaker.analyzer.DataQuery.java

License:Open Source License

String grabTable(DataDescriptor desc) throws SQLException, IOException {
    // Set up Hive table
    Path p = desc.getFilename();
    String tablename = tableCache.get(p);
    if (tablename == null) {
        tablename = "datatable" + Math.abs(r.nextInt());
        Statement stmt = hiveCon.createStatement();
        try {
            String creatTxt = desc.getHiveCreateTableStatement(tablename);
            LOG.info("Create: " + creatTxt);
            stmt.execute(creatTxt);
            tables.put(p, tablename);
        } finally {
            stmt.close();
        }

        // Copy avro version of data into secret location prior to Hive import
        FileSystem fs = FileSystem.get(conf);
        Path tmpTables = new Path(tmpTablesDir);
        if (!fs.exists(tmpTables)) {
            fs.mkdirs(tmpTables, new FsPermission("-rwxrwxrwx"));
        }
        Path secretDst = new Path(tmpTables, "r" + r.nextInt());
        LOG.info("Preparing Avro data at " + secretDst);
        desc.prepareAvroFile(fs, fs, secretDst, conf);
        fs.setPermission(secretDst, new FsPermission("-rwxrwxrwx"));

        // Import data
        stmt = hiveCon.createStatement();
        try {
            LOG.info("Import data into Hive: " + desc.getHiveImportDataStatement(tablename, secretDst));
            stmt.execute(desc.getHiveImportDataStatement(tablename, secretDst));
            isLoaded.add(p);
        } finally {
            stmt.close();
        }

        // Refresh impala metadata
        stmt = impalaCon.createStatement();
        try {
            try {
                LOG.info("Rebuilding Impala metadata...");
                stmt.execute("INVALIDATE METADATA");
            } catch (Exception iex) {
                LOG.info("Impala metadata rebuild failed: " + iex.toString());
            }
        } finally {
            stmt.close();
        }

        // Insert into table cache
        tableCache.put(p, tablename);
    }
    return tablename;
}