Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find usage examples for org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Returns all the files that match pathPattern and are not checksum files.
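
The usage examples below share one pattern: call globStatus with a (possibly glob) Path, then null-check the result before iterating, because globStatus can return null when nothing matches the given path. As a minimal, self-contained sketch of that pattern (the GlobStatusExample class name and the path are hypothetical placeholders, not taken from any of the projects below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical glob pattern; adjust to your own directory layout.
        Path pattern = new Path("/data/logs/2024-*/part-*");

        // globStatus may return null when nothing matches, so guard before iterating.
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}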

Usage

From source file: org.apache.mahout.utils.vectors.tfidf.TFIDFConverter.java

License: Apache License

/**
 * Read the document frequency List which is built at the end of the DF Count Job. This will use constant
 * memory and will run at the speed of your disk read
 *
 * @param featureCountPath path to the output of the document-frequency count job
 * @param dictionaryPathBase base path under which the frequency chunk files are written
 * @param chunkSizeInMegabytes maximum size of each frequency chunk, in megabytes
 * @return a pair of {featureCount, vectorCount} and the list of chunk paths
 * @throws IOException
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = new ArrayList<Path>();

    IntWritable key = new IntWritable();
    LongWritable value = new LongWritable();
    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(featureCountPath, OUTPUT_FILES_PATTERN));

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);

    long currentChunkSize = 0;
    long featureCount = 0;
    long vectorCount = Long.MAX_VALUE;
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is the feature id, value is its count
        while (reader.next(key, value)) {
            if (currentChunkSize > chunkSizeLimit) {
                freqWriter.close();
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
                        LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);

        }
    }
    featureCount++;
    freqWriter.close();
    Long[] counts = { featureCount, vectorCount };
    return new Pair<Long[], List<Path>>(counts, chunkPaths);
}

From source file: org.apache.mahout.utils.vectors.VectorDumper.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    /*
     Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
     abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
     "The Sequence File containing the Vectors").withShortName("s").create();
     Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
     abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
     .withDescription("The directory containing Sequence File of Vectors")
     .withShortName("d").create();
     */
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c",
            "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
            + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort",
            "Sort output key/value pairs of the vector entries in abs magnitude " + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs",
            "Truncate vectors to <vs> length when dumping (most useful when in" + " conjunction with -sort",
            false);
    addOption(buildOption("filter", "fi",
            "Only dump out those vectors whose name matches the filter."
                    + "  Multiple items may be specified by repeating the argument.",
            true, 1, Integer.MAX_VALUE, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
    } else {
        FileStatus[] inputPaths = fs.globStatus(input);
        pathArr = new Path[inputPaths.length];
        int i = 0;
        for (FileStatus fstatus : inputPaths) {
            pathArr[i++] = fstatus.getPath();
        }
    }

    String dictionaryType = getOption("dictionaryType", "text");

    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
        log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
        String dictFile = getOption("dictionary");
        if ("text".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
        } else if ("sequencefile".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
        } else {
            //TODO: support Lucene's FST as a dictionary type
            throw new IOException("Invalid dictionary type: " + dictionaryType);
        }
    }

    Set<String> filters;
    if (hasOption("filter")) {
        filters = Sets.newHashSet(getOptions("filter"));
    } else {
        filters = null;
    }

    boolean useCSV = hasOption("csv");

    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    boolean transposeKeyValue = hasOption("vectorAsKey");
    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
        shouldClose = true;
        log.info("Output file: {}", output);
        Files.createParentDirs(output);
        writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        boolean printKey = hasOption("printKey");
        if (useCSV && dictionary != null) {
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
                writer.write(dictionary[j]);
                if (j < dictionary.length - 1) {
                    writer.write(',');
                }
            }
            writer.write('\n');
        }
        Long numItems = null;
        if (hasOption("numItems")) {
            numItems = Long.parseLong(getOption("numItems"));
            if (quiet) {
                writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
            }
        }
        int maxIndexesPerVector = hasOption("vectorSize") ? Integer.parseInt(getOption("vectorSize"))
                : Integer.MAX_VALUE;
        long itemCount = 0;
        int fileCount = 0;
        for (Path path : pathArr) {
            if (numItems != null && numItems <= itemCount) {
                break;
            }
            if (quiet) {
                log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
            }
            SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<Writable, Writable>(
                    path, true, conf);
            Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
            long i = 0;
            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
                Pair<Writable, Writable> record = iterator.next();
                Writable keyWritable = record.getFirst();
                Writable valueWritable = record.getSecond();
                if (printKey) {
                    Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
                    writer.write(notTheVectorWritable.toString());
                    writer.write('\t');
                }
                Vector vector;
                try {
                    vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
                } catch (ClassCastException e) {
                    if ((transposeKeyValue ? keyWritable
                            : valueWritable) instanceof WeightedPropertyVectorWritable) {
                        vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable
                                : valueWritable)).getVector();
                    } else {
                        throw e;
                    }
                }
                if (filters != null && vector instanceof NamedVector
                        && !filters.contains(((NamedVector) vector).getName())) {
                    //we are filtering out this item, skip
                    continue;
                }
                if (sizeOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write(":");
                    } else {
                        writer.write(String.valueOf(i++));
                        writer.write(":");
                    }
                    writer.write(String.valueOf(vector.size()));
                    writer.write('\n');
                } else if (nameOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write('\n');
                    }
                } else {
                    String fmtStr;
                    if (useCSV) {
                        fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
                    } else {
                        fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
                                sortVectors);
                    }
                    writer.write(fmtStr);
                    writer.write('\n');
                }
                itemCount++;
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }

    return 0;
}

From source file: org.apache.nifi.processors.hadoop.DeleteHDFS.java

License: Apache License

@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile originalFlowFile = session.get();

    // If this processor has an incoming connection, then do not run unless a
    // FlowFile is actually sent through
    if (originalFlowFile == null && context.hasIncomingConnection()) {
        context.yield();
        return;
    }

    // We need a FlowFile to report provenance correctly.
    FlowFile flowFile = originalFlowFile != null ? originalFlowFile : session.create();

    final String fileOrDirectoryName = context.getProperty(FILE_OR_DIRECTORY)
            .evaluateAttributeExpressions(flowFile).getValue();

    final FileSystem fileSystem = getFileSystem();
    try {
        // Check if the user has supplied a file or directory pattern
        List<Path> pathList = Lists.newArrayList();
        if (GLOB_MATCHER.reset(fileOrDirectoryName).find()) {
            FileStatus[] fileStatuses = fileSystem.globStatus(new Path(fileOrDirectoryName));
            if (fileStatuses != null) {
                for (FileStatus fileStatus : fileStatuses) {
                    pathList.add(fileStatus.getPath());
                }
            }
        } else {
            pathList.add(new Path(fileOrDirectoryName));
        }

        int failedPath = 0;
        for (Path path : pathList) {
            if (fileSystem.exists(path)) {
                try {
                    Map<String, String> attributes = Maps.newHashMapWithExpectedSize(2);
                    attributes.put("hdfs.filename", path.getName());
                    attributes.put("hdfs.path", path.getParent().toString());
                    flowFile = session.putAllAttributes(flowFile, attributes);

                    fileSystem.delete(path, context.getProperty(RECURSIVE).asBoolean());
                    getLogger().debug("For flowfile {} Deleted file at path {} with name {}",
                            new Object[] { originalFlowFile, path.getParent().toString(), path.getName() });
                    final Path qualifiedPath = path.makeQualified(fileSystem.getUri(),
                            fileSystem.getWorkingDirectory());
                    session.getProvenanceReporter().invokeRemoteProcess(flowFile, qualifiedPath.toString());
                } catch (IOException ioe) {
                    // One possible scenario is that the IOException is permissions based, however it would be impractical to check every possible
                    // external HDFS authorization tool (Ranger, Sentry, etc). Local ACLs could be checked but the operation would be expensive.
                    getLogger().warn("Failed to delete file or directory", ioe);

                    Map<String, String> attributes = Maps.newHashMapWithExpectedSize(1);
                    // The error message is helpful in understanding at a flowfile level what caused the IOException (which ACL is denying the operation, e.g.)
                    attributes.put("hdfs.error.message", ioe.getMessage());

                    session.transfer(session.putAllAttributes(session.clone(flowFile), attributes),
                            REL_FAILURE);
                    failedPath++;
                }
            }
        }

        if (failedPath == 0) {
            session.transfer(flowFile, DeleteHDFS.REL_SUCCESS);
        } else {
            // If any path has been failed to be deleted, remove the FlowFile as it's been cloned and sent to failure.
            session.remove(flowFile);
        }
    } catch (IOException e) {
        getLogger().error("Error processing delete for flowfile {} due to {}",
                new Object[] { flowFile, e.getMessage() }, e);
        session.transfer(flowFile, DeleteHDFS.REL_FAILURE);
    }

}

From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java

License: Apache License

void chgrp(Context context, XConfiguration fsConf, Path nameNodePath, Path path, String user, String group,
        boolean dirFiles, boolean recursive) throws ActionExecutorException {

    HashMap<String, String> argsMap = new HashMap<String, String>();
    argsMap.put("user", user);
    argsMap.put("group", group);
    try {
        FileSystem fs = getFileSystemFor(path, context, fsConf);
        path = resolveToFullPath(nameNodePath, path, true);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
        if (pathArr == null || pathArr.length == 0) {
            throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS009",
                    "chgrp" + ", path(s) that matches [{0}] does not exist", path);
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            recursiveFsOperation("chgrp", fs, nameNodePath, p, argsMap, dirFiles, recursive, true);
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}

From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java

License: Apache License

/**
 * Delete path
 *
 * @param context executor context
 * @param fsConf action file system configuration
 * @param nameNodePath name node path
 * @param path path to delete; may contain a glob pattern
 * @param skipTrash if true, delete permanently instead of moving the path to the trash
 * @throws ActionExecutorException
 */
public void delete(Context context, XConfiguration fsConf, Path nameNodePath, Path path, boolean skipTrash)
        throws ActionExecutorException {
    URI uri = path.toUri();
    URIHandler handler;
    try {
        handler = Services.get().get(URIHandlerService.class).getURIHandler(uri);
        if (handler instanceof FSURIHandler) {
            // Use legacy code to handle hdfs partition deletion
            path = resolveToFullPath(nameNodePath, path, true);
            final FileSystem fs = getFileSystemFor(path, context, fsConf);
            Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
            if (pathArr != null && pathArr.length > 0) {
                checkGlobMax(pathArr);
                for (final Path p : pathArr) {
                    if (fs.exists(p)) {
                        if (!skipTrash) {
                            // Moving directory/file to trash of user.
                            UserGroupInformationService ugiService = Services.get()
                                    .get(UserGroupInformationService.class);
                            UserGroupInformation ugi = ugiService
                                    .getProxyUser(fs.getConf().get(OozieClient.USER_NAME));
                            ugi.doAs(new PrivilegedExceptionAction<FileSystem>() {
                                @Override
                                public FileSystem run() throws Exception {
                                    Trash trash = new Trash(fs.getConf());
                                    if (!trash.moveToTrash(p)) {
                                        throw new ActionExecutorException(
                                                ActionExecutorException.ErrorType.ERROR, "FS005",
                                                "Could not move path [{0}] to trash on delete", p);
                                    }
                                    return null;
                                }
                            });
                        } else if (!fs.delete(p, true)) {
                            throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS005",
                                    "delete, path [{0}] could not delete path", p);
                        }
                    }
                }
            }
        } else {
            handler.delete(uri, handler.getContext(uri, fsConf, context.getWorkflow().getUser(), false));
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}

From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java

License: Apache License

/**
 * Move source to target
 *
 * @param context executor context
 * @param fsConf action file system configuration
 * @param nameNodePath name node path
 * @param source source path; may contain a glob pattern
 * @param target target path
 * @param recovery if true, tolerate a missing source or a failed rename instead of failing the action
 * @throws ActionExecutorException
 */
public void move(Context context, XConfiguration fsConf, Path nameNodePath, Path source, Path target,
        boolean recovery) throws ActionExecutorException {
    try {
        source = resolveToFullPath(nameNodePath, source, true);
        validateSameNN(source, target);
        FileSystem fs = getFileSystemFor(source, context, fsConf);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(source));
        if ((pathArr == null || pathArr.length == 0)) {
            if (!recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS006",
                        "move, source path [{0}] does not exist", source);
            } else {
                return;
            }
        }
        if (pathArr.length > 1 && (!fs.exists(target) || fs.isFile(target))) {
            if (!recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS012",
                        "move, could not rename multiple sources to the same target name");
            } else {
                return;
            }
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            if (!fs.rename(p, target) && !recovery) {
                throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS008",
                        "move, could not move [{0}] to [{1}]", p, target);
            }
        }
    } catch (Exception ex) {
        throw convertException(ex);
    }
}

From source file: org.apache.oozie.action.hadoop.FsActionExecutor.java

License: Apache License

void chmod(Context context, XConfiguration fsConf, Path nameNodePath, Path path, String permissions,
        boolean dirFiles, boolean recursive) throws ActionExecutorException {

    HashMap<String, String> argsMap = new HashMap<String, String>();
    argsMap.put("permissions", permissions);
    try {
        FileSystem fs = getFileSystemFor(path, context, fsConf);
        path = resolveToFullPath(nameNodePath, path, true);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(path));
        if (pathArr == null || pathArr.length == 0) {
            throw new ActionExecutorException(ActionExecutorException.ErrorType.ERROR, "FS009",
                    "chmod" + ", path(s) that matches [{0}] does not exist", path);
        }
        checkGlobMax(pathArr);
        for (Path p : pathArr) {
            recursiveFsOperation("chmod", fs, nameNodePath, p, argsMap, dirFiles, recursive, true);
        }

    } catch (Exception ex) {
        throw convertException(ex);
    }
}

From source file: org.apache.oozie.action.hadoop.FSLauncherURIHandler.java

License: Apache License

@Override
public boolean delete(URI uri, Configuration conf) throws LauncherException {
    boolean status = false;
    try {
        FileSystem fs = FileSystem.get(uri, conf);
        Path[] pathArr = FileUtil.stat2Paths(fs.globStatus(getNormalizedPath(uri)));
        if (pathArr != null && pathArr.length > 0) {
            int fsGlobMax = conf.getInt(LauncherMapper.CONF_OOZIE_ACTION_FS_GLOB_MAX, 1000);
            if (pathArr.length > fsGlobMax) {
                throw new LauncherException(
                        "exceeds max number (" + fsGlobMax + ") of files/dirs to delete in <prepare>");
            }
            for (Path path : pathArr) {
                if (fs.exists(path)) {
                    status = fs.delete(path, true);
                    if (status) {
                        System.out.println("Deletion of path " + path + " succeeded.");
                    } else {
                        System.out.println("Deletion of path " + path + " failed.");
                    }
                }
            }
        }
    } catch (IOException e) {
        throw new LauncherException("Deletion of path " + uri + " failed.", e);
    }
    return status;
}

From source file: org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.InputSizeReducerEstimator.java

License: Apache License

/**
 * Get the input size for as many inputs as possible. Inputs whose size is neither
 * reported by the loader nor discoverable by Pig are excluded from this total.
 */
static long getTotalInputFileSize(Configuration conf, List<POLoad> lds, Job job) throws IOException {
    long totalInputFileSize = 0;
    for (POLoad ld : lds) {
        long size = getInputSizeFromLoader(ld, job);
        if (size > -1) {
            totalInputFileSize += size;
            continue;
        } else {

            // the input file location might be a list of comma separated files,
            // separate them out
            for (String location : LoadFunc.getPathStrings(ld.getLFile().getFileName())) {
                if (UriUtil.isHDFSFileOrLocalOrS3N(location, conf)) {
                    Path path = new Path(location);
                    FileSystem fs = path.getFileSystem(conf);
                    FileStatus[] status = fs.globStatus(path);
                    if (status != null) {
                        for (FileStatus s : status) {
                            totalInputFileSize += MapRedUtil.getPathLength(fs, s);
                        }
                    }
                } else {
                    // If we cannot estimate size of a location, we should report -1
                    return -1;
                }
            }
        }
    }
    return totalInputFileSize;
}

From source file: org.apache.pig.builtin.AvroStorage.java

License: Apache License

/**
 * Reads the Avro schema at the specified location(s).
 * @param p locations of the Avro files
 * @param job Hadoop job object
 * @return an Avro Schema object derived from the specified file
 * @throws IOException
 *
 */
public Schema getAvroSchema(final Path[] p, final Job job) throws IOException {
    GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
    ArrayList<FileStatus> statusList = new ArrayList<FileStatus>();
    FileSystem fs = FileSystem.get(p[0].toUri(), job.getConfiguration());
    for (Path temp : p) {
        for (FileStatus tempf : fs.globStatus(temp)) {
            statusList.add(tempf);
        }
    }
    FileStatus[] statusArray = (FileStatus[]) statusList.toArray(new FileStatus[statusList.size()]);

    if (statusArray == null) {
        throw new IOException("Path " + p.toString() + " does not exist.");
    }

    if (statusArray.length == 0) {
        throw new IOException("No path matches pattern " + p.toString());
    }

    Path filePath = Utils.depthFirstSearchForFile(statusArray, fs);

    if (filePath == null) {
        throw new IOException("No path matches pattern " + p.toString());
    }

    InputStream hdfsInputStream = fs.open(filePath);
    DataFileStream<Object> avroDataStream = new DataFileStream<Object>(hdfsInputStream, avroReader);
    Schema s = avroDataStream.getSchema();
    avroDataStream.close();
    return s;
}