List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match the given pathPattern and are not checksum files.
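Before the project examples below, here is a minimal self-contained sketch of the call itself (the glob pattern and paths are hypothetical, not taken from any of the source files). Depending on the input, globStatus may return null (a non-glob path that does not exist) or an empty array (a glob that matches nothing), so callers should guard before iterating, as most of the examples below do.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical pattern: match all part files under any 2023 dated directory.
        Path pattern = new Path("/data/logs/2023-*/part-*");
        FileSystem fs = pattern.getFileSystem(conf);

        // globStatus may return null or an empty array, so guard before iterating.
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}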
From source file:org.apache.mahout.utils.vectors.tfidf.TFIDFConverter.java
License:Apache License
/**
 * Read the document frequency List which is built at the end of the DF Count Job. This will use constant
 * memory and will run at the speed of your disk read.
 *
 * @param featureCountPath
 * @param dictionaryPathBase
 * @throws IOException
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = new ArrayList<Path>();
    IntWritable key = new IntWritable();
    LongWritable value = new LongWritable();
    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(featureCountPath, OUTPUT_FILES_PATTERN));

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);

    long currentChunkSize = 0;
    long featureCount = 0;
    long vectorCount = Long.MAX_VALUE;
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is feature, value is count
        while (reader.next(key, value)) {
            if (currentChunkSize > chunkSizeLimit) {
                freqWriter.close();
                chunkIndex++;
                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);
                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);
        }
    }
    featureCount++;
    freqWriter.close();
    Long[] counts = { featureCount, vectorCount };
    return new Pair<Long[], List<Path>>(counts, chunkPaths);
}
From source file:org.apache.mahout.utils.vectors.VectorDumper.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    /**
     Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
             abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
             "The Sequence File containing the Vectors").withShortName("s").create();
     Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
             abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
             .withDescription("The directory containing Sequence File of Vectors")
             .withShortName("d").create();
     */
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c",
            "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
            + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude "
            + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
            + " conjunction with -sort", false);
    addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
            + " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false,
            null));
    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
    } else {
        FileStatus[] inputPaths = fs.globStatus(input);
        pathArr = new Path[inputPaths.length];
        int i = 0;
        for (FileStatus fstatus : inputPaths) {
            pathArr[i++] = fstatus.getPath();
        }
    }

    String dictionaryType = getOption("dictionaryType", "text");
    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
        log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
        String dictFile = getOption("dictionary");
        if ("text".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
        } else if ("sequencefile".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
        } else {
            //TODO: support Lucene's FST as a dictionary type
            throw new IOException("Invalid dictionary type: " + dictionaryType);
        }
    }

    Set<String> filters;
    if (hasOption("filter")) {
        filters = Sets.newHashSet(getOptions("filter"));
    } else {
        filters = null;
    }

    boolean useCSV = hasOption("csv");
    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    boolean transposeKeyValue = hasOption("vectorAsKey");

    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
        shouldClose = true;
        log.info("Output file: {}", output);
        Files.createParentDirs(output);
        writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        boolean printKey = hasOption("printKey");
        if (useCSV && dictionary != null) {
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
                writer.write(dictionary[j]);
                if (j < dictionary.length - 1) {
                    writer.write(',');
                }
            }
            writer.write('\n');
        }
        Long numItems = null;
        if (hasOption("numItems")) {
            numItems = Long.parseLong(getOption("numItems"));
            if (quiet) {
                writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
            }
        }
        int maxIndexesPerVector = hasOption("vectorSize") ? Integer.parseInt(getOption("vectorSize"))
                : Integer.MAX_VALUE;
        long itemCount = 0;
        int fileCount = 0;
        for (Path path : pathArr) {
            if (numItems != null && numItems <= itemCount) {
                break;
            }
            if (quiet) {
                log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
            }
            SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<Writable, Writable>(
                    path, true, conf);
            Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
            long i = 0;
            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
                Pair<Writable, Writable> record = iterator.next();
                Writable keyWritable = record.getFirst();
                Writable valueWritable = record.getSecond();
                if (printKey) {
                    Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
                    writer.write(notTheVectorWritable.toString());
                    writer.write('\t');
                }
                Vector vector;
                try {
                    vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
                } catch (ClassCastException e) {
                    if ((transposeKeyValue ? keyWritable
                            : valueWritable) instanceof WeightedPropertyVectorWritable) {
                        vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable
                                : valueWritable)).getVector();
                    } else {
                        throw e;
                    }
                }
                if (filters != null && vector instanceof NamedVector
                        && !filters.contains(((NamedVector) vector).getName())) {
                    //we are filtering out this item, skip
                    continue;
                }
                if (sizeOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write(":");
                    } else {
                        writer.write(String.valueOf(i++));
                        writer.write(":");
                    }
                    writer.write(String.valueOf(vector.size()));
                    writer.write('\n');
                } else if (nameOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write('\n');
                    }
                } else {
                    String fmtStr;
                    if (useCSV) {
                        fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
                    } else {
                        fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, sortVectors);
                    }
                    writer.write(fmtStr);
                    writer.write('\n');
                }
                itemCount++;
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }
    return 0;
}
From source file:org.apache.nifi.processors.hadoop.DeleteHDFS.java
License:Apache License
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile originalFlowFile = session.get();

    // If this processor has an incoming connection, then do not run unless a
    // FlowFile is actually sent through
    if (originalFlowFile == null && context.hasIncomingConnection()) {
        context.yield();
        return;
    }

    // We need a FlowFile to report provenance correctly.
    FlowFile flowFile = originalFlowFile != null ? originalFlowFile : session.create();

    final String fileOrDirectoryName = context.getProperty(FILE_OR_DIRECTORY)
            .evaluateAttributeExpressions(flowFile).getValue();

    final FileSystem fileSystem = getFileSystem();
    try {
        // Check if the user has supplied a file or directory pattern
        List<Path> pathList = Lists.newArrayList();
        if (GLOB_MATCHER.reset(fileOrDirectoryName).find()) {
            FileStatus[] fileStatuses = fileSystem.globStatus(new Path(fileOrDirectoryName));
            if (fileStatuses != null) {
                for (FileStatus fileStatus : fileStatuses) {
                    pathList.add(fileStatus.getPath());
                }
            }
        } else {
            pathList.add(new Path(fileOrDirectoryName));
        }

        int failedPath = 0;
        for (Path path : pathList) {
            if (fileSystem.exists(path)) {
                try {
                    Map<String, String> attributes = Maps.newHashMapWithExpectedSize(2);
                    attributes.put("hdfs.filename", path.getName());
                    attributes.put("hdfs.path", path.getParent().toString());
                    flowFile = session.putAllAttributes(flowFile, attributes);

                    fileSystem.delete(path, context.getProperty(RECURSIVE).asBoolean());
                    getLogger().debug("For flowfile {} Deleted file at path {} with name {}",
                            new Object[] { originalFlowFile, path.getParent().toString(), path.getName() });

                    final Path qualifiedPath = path.makeQualified(fileSystem.getUri(),
                            fileSystem.getWorkingDirectory());
                    session.getProvenanceReporter().invokeRemoteProcess(flowFile, qualifiedPath.toString());
                } catch (IOException ioe) {
                    // One possible scenario is that the IOException is permissions based, however it would be
                    // impractical to check every possible external HDFS authorization tool (Ranger, Sentry, etc).
                    // Local ACLs could be checked but the operation would be expensive.
                    getLogger().warn("Failed to delete file or directory", ioe);

                    Map<String, String> attributes = Maps.newHashMapWithExpectedSize(1);
                    // The error message is helpful in understanding at a flowfile level what caused the
                    // IOException (which ACL is denying the operation, e.g.)
                    attributes.put("hdfs.error.message", ioe.getMessage());

                    session.transfer(session.putAllAttributes(session.clone(flowFile), attributes), REL_FAILURE);
                    failedPath++;
                }
            }
        }

        if (failedPath == 0) {
            session.transfer(flowFile, DeleteHDFS.REL_SUCCESS);
        } else {
            // If any path has been failed to be deleted, remove the FlowFile as it's been cloned and sent to failure.
            session.remove(flowFile);
        }
    } catch (IOException e) {
        getLogger().error("Error processing delete for flowfile {} due to {}",
                new Object[] { flowFile, e.getMessage() }, e);
        session.transfer(flowFile, DeleteHDFS.REL_FAILURE);
    }
}
From source file:org.apache.oozie.action.hadoop.FsActionExecutor.java
License:Apache License
void chgrp(Context context, XConfiguration fsConf, Path nameNodePath, Path path, String user, String group,
        boolean dirFiles, boolean recursive) throws ActionExecutorException {
    HashMap<String, String> argsMap = new HashMap<String, String>();
    argsMap.put("user", user);
    argsMap.put("group", group);
    try {
        FileSystem fs = getFileSystemFor(path, context, fsConf);
        path = resolveToFullPath(nameNodePath, path, true);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
        if (pathArr == null || pathArr.length == 0) {
            throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS009",
                    "chgrp" + ", path(s) that matches [{0}] does not exist", path);
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            recursiveFsOperation("chgrp", fs, nameNodePath, p, argsMap, dirFiles, recursive, true);
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}
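Several of the Oozie examples in this listing (chgrp, delete, move, chmod, FSLauncherURIHandler) share one idiom: feed the globStatus result through FileUtil.stat2Paths to get plain Path objects, then treat a null or empty array as "nothing matched". A stripped-down sketch of that idiom follows; the class name, helper name, and path pattern are illustrative only, and the null pass-through behaviour of stat2Paths is assumed from the Hadoop versions these projects target.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class GlobExpansionIdiom {
    // Hypothetical helper mirroring the Oozie pattern: expand a glob, then treat
    // a null or empty result as "nothing matched".
    static Path[] expandGlob(FileSystem fs, Path pattern) throws IOException {
        // FileUtil.stat2Paths maps each FileStatus to its Path; a null FileStatus[]
        // (non-glob path that does not exist) is assumed to pass through as null.
        Path[] matched = FileUtil.stat2Paths(fs.globStatus(pattern));
        if (matched == null || matched.length == 0) {
            throw new IOException("No path matches pattern " + pattern);
        }
        return matched;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path pattern = new Path("/user/oozie/output/part-*"); // hypothetical path
        FileSystem fs = pattern.getFileSystem(conf);
        for (Path p : expandGlob(fs, pattern)) {
            System.out.println(p);
        }
    }
}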
From source file:org.apache.oozie.action.hadoop.FsActionExecutor.java
License:Apache License
/**
 * Delete path
 *
 * @param context
 * @param fsConf
 * @param nameNodePath
 * @param path
 * @throws ActionExecutorException
 */
public void delete(Context context, XConfiguration fsConf, Path nameNodePath, Path path, boolean skipTrash)
        throws ActionExecutorException {
    URI uri = path.toUri();
    URIHandler handler;
    try {
        handler = Services.get().get(URIHandlerService.class).getURIHandler(uri);
        if (handler instanceof FSURIHandler) {
            // Use legacy code to handle hdfs partition deletion
            path = resolveToFullPath(nameNodePath, path, true);
            final FileSystem fs = getFileSystemFor(path, context, fsConf);
            Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
            if (pathArr != null && pathArr.length > 0) {
                checkGlobMax(pathArr);
                for (final Path p : pathArr) {
                    if (fs.exists(p)) {
                        if (!skipTrash) {
                            // Moving directory/file to trash of user.
                            UserGroupInformationService ugiService = Services.get()
                                    .get(UserGroupInformationService.class);
                            UserGroupInformation ugi = ugiService
                                    .getProxyUser(fs.getConf().get(OozieClient.USER_NAME));
                            ugi.doAs(new PrivilegedExceptionAction<FileSystem>() {
                                @Override
                                public FileSystem run() throws Exception {
                                    Trash trash = new Trash(fs.getConf());
                                    if (!trash.moveToTrash(p)) {
                                        throw new ActionExecutorException(
                                                ActionExecutorException.ErrorType.ERROR, "FS005",
                                                "Could not move path [{0}] to trash on delete", p);
                                    }
                                    return null;
                                }
                            });
                        } else if (!fs.delete(p, true)) {
                            throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS005",
                                    "delete, path [{0}] could not delete path", p);
                        }
                    }
                }
            }
        } else {
            handler.delete(uri, handler.getContext(uri, fsConf, context.getWorkflow().getUser(), false));
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}
From source file:org.apache.oozie.action.hadoop.FsActionExecutor.java
License:Apache License
/**
 * Move source to target
 *
 * @param context
 * @param fsConf
 * @param nameNodePath
 * @param source
 * @param target
 * @param recovery
 * @throws ActionExecutorException
 */
public void move(Context context, XConfiguration fsConf, Path nameNodePath, Path source, Path target,
        boolean recovery) throws ActionExecutorException {
    try {
        source = resolveToFullPath(nameNodePath, source, true);
        validateSameNN(source, target);
        FileSystem fs = getFileSystemFor(source, context, fsConf);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(source));
        if ((pathArr == null || pathArr.length == 0)) {
            if (!recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS006",
                        "move, source path [{0}] does not exist", source);
            } else {
                return;
            }
        }
        if (pathArr.length > 1 && (!fs.exists(target) || fs.isFile(target))) {
            if (!recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS012",
                        "move, could not rename multiple sources to the same target name");
            } else {
                return;
            }
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            if (!fs.rename(p, target) && !recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS008",
                        "move, could not move [{0}] to [{1}]", p, target);
            }
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}
From source file:org.apache.oozie.action.hadoop.FsActionExecutor.java
License:Apache License
void chmod(Context context, XConfiguration fsConf, Path nameNodePath, Path path, String permissions,
        boolean dirFiles, boolean recursive) throws ActionExecutorException {
    HashMap<String, String> argsMap = new HashMap<String, String>();
    argsMap.put("permissions", permissions);
    try {
        FileSystem fs = getFileSystemFor(path, context, fsConf);
        path = resolveToFullPath(nameNodePath, path, true);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
        if (pathArr == null || pathArr.length == 0) {
            throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS009",
                    "chmod" + ", path(s) that matches [{0}] does not exist", path);
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            recursiveFsOperation("chmod", fs, nameNodePath, p, argsMap, dirFiles, recursive, true);
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}
From source file:org.apache.oozie.action.hadoop.FSLauncherURIHandler.java
License:Apache License
@Override
public boolean delete(URI uri, Configuration conf) throws LauncherException {
    boolean status = false;
    try {
        FileSystem fs = FileSystem.get(uri, conf);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(getNormalizedPath(uri)));
        if (pathArr != null && pathArr.length > 0) {
            int fsGlobMax = conf.getInt(LauncherMapper.CONF_OOZIE_ACTION_FS_GLOB_MAX, 1000);
            if (pathArr.length > fsGlobMax) {
                throw new LauncherException(
                        "exceeds max number (" + fsGlobMax + ") of files/dirs to delete in <prepare>");
            }
            for (Path path : pathArr) {
                if (fs.exists(path)) {
                    status = fs.delete(path, true);
                    if (status) {
                        System.out.println("Deletion of path " + path + " succeeded.");
                    } else {
                        System.out.println("Deletion of path " + path + " failed.");
                    }
                }
            }
        }
    } catch (IOException e) {
        throw new LauncherException("Deletion of path " + uri + " failed.", e);
    }
    return status;
}
From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.InputSizeReducerEstimator.java
License:Apache License
/**
 * Get the input size for as many inputs as possible. Inputs that do not report
 * their size nor can pig look that up itself are excluded from this size.
 */
static long getTotalInputFileSize(Configuration conf, List<POLoad> lds, Job job) throws IOException {
    long totalInputFileSize = 0;
    for (POLoad ld : lds) {
        long size = getInputSizeFromLoader(ld, job);
        if (size > -1) {
            totalInputFileSize += size;
            continue;
        } else {
            // the input file location might be a list of comma separated files,
            // separate them out
            for (String location : LoadFunc.getPathStrings(ld.getLFile().getFileName())) {
                if (UriUtil.isHDFSFileOrLocalOrS3N(location, conf)) {
                    Path path = new Path(location);
                    FileSystem fs = path.getFileSystem(conf);
                    FileStatus[] status = fs.globStatus(path);
                    if (status != null) {
                        for (FileStatus s : status) {
                            totalInputFileSize += MapRedUtil.getPathLength(fs, s);
                        }
                    }
                } else {
                    // If we cannot estimate size of a location, we should report -1
                    return -1;
                }
            }
        }
    }
    return totalInputFileSize;
}
From source file:org.apache.pig.builtin.AvroStorage.java
License:Apache License
/**
 * Reads the avro schemas at the specified location.
 * @param p Location of file
 * @param job Hadoop job object
 * @return an Avro Schema object derived from the specified file
 * @throws IOException
 */
public Schema getAvroSchema(final Path[] p, final Job job) throws IOException {
    GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
    ArrayList<FileStatus> statusList = new ArrayList<FileStatus>();
    FileSystem fs = FileSystem.get(p[0].toUri(), job.getConfiguration());
    for (Path temp : p) {
        for (FileStatus tempf : fs.globStatus(temp)) {
            statusList.add(tempf);
        }
    }
    FileStatus[] statusArray = (FileStatus[]) statusList.toArray(new FileStatus[statusList.size()]);

    if (statusArray == null) {
        throw new IOException("Path " + p.toString() + " does not exist.");
    }

    if (statusArray.length == 0) {
        throw new IOException("No path matches pattern " + p.toString());
    }

    Path filePath = Utils.depthFirstSearchForFile(statusArray, fs);

    if (filePath == null) {
        throw new IOException("No path matches pattern " + p.toString());
    }

    InputStream hdfsInputStream = fs.open(filePath);
    DataFileStream<Object> avroDataStream = new DataFileStream<Object>(hdfsInputStream, avroReader);
    Schema s = avroDataStream.getSchema();
    avroDataStream.close();

    return s;
}