Example usage for org.apache.hadoop.fs FileSystem globStatus

Introduction

In this page you can find the example usage for org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException

Source Link

Document

Returns an array of FileStatus objects whose path names match pathPattern and are accepted by the user-supplied path filter.

Usage

From source file:gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java

License:Open Source License

/** List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression. /*from ww w.  j a  v  a  2  s  . c o  m*/
 * 
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}

From source file:hitune.analysis.mapreduce.processor.AnalysisProcessor.java

License:Apache License

/**
 * Move the TEMP output folder to final one(user defined one);
 * If there are multiple files under one job's output folder, it should merge the output into one file.
 * Then rename the folder to the final one.
 * @param job/*from   w w  w .  j ava 2s.  c  o m*/
 * @param output
 * @param result
 */
protected void moveResults(JobConf job, String output, String result) {
    try {
        FileSystem fs = FileSystem.get(job);
        log.debug("move results: " + result);
        Path src = new Path(result + "/" + "*.csv*");
        Path dst = new Path(output);
        if (!fs.exists(dst)) {
            fs.mkdirs(dst);
        }
        FileStatus[] matches = fs.globStatus(src, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                // TODO Auto-generated method stub
                return true;

            }
        });
        if (matches != null && matches.length != 0) {
            if (matches.length > 1) {
                //multiple output files
                String[] args = new String[2];
                args[0] = result;
                args[1] = "_" + result;
                fs.delete(new Path("_" + result));
                //merge multiple output files into one file
                ToolRunner.run(new MergeOutput(this.conf), args);
                fs.delete(new Path(result));
                fs.rename(new Path("_" + result), new Path(result));
            }

            matches = fs.globStatus(src, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    // TODO Auto-generated method stub
                    return true;
                }
            });

            for (FileStatus file : matches) {
                String filename = file.getPath().getName();
                filename = filename.substring(0, filename.indexOf("-"));
                log.debug("move file:" + filename);
                Path toFile = new Path(output + "/" + filename);
                if (fs.exists(toFile)) {
                    fs.delete(toFile);
                }
                fs.rename(file.getPath(), toFile);
                fs.delete(file.getPath().getParent(), true);
                FileStatus[] tmpDirs = fs.listStatus(file.getPath().getParent().getParent());
                if (tmpDirs == null || tmpDirs.length == 0) {
                    fs.delete(file.getPath().getParent().getParent(), true);
                }
                break;
            }
        } else {
            MOVE_DONE = false;
        }
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        MOVE_DONE = false;
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    MOVE_DONE = true;
}

From source file:hitune.analysis.mapreduce.processor.FileFilter.ChukwaFileFilter.java

License:Apache License

/**
 * Checks whether {@code dir} resolves to at least one path on its file system.
 *
 * @param job    configuration used to obtain the file system of {@code dir}
 * @param dir    path or glob pattern to validate
 * @param filter filter applied to the glob matches; {@code null} accepts everything
 * @return {@code true} if the glob yields one or more matches; {@code false} if it
 *         yields none or an IOException occurs (the stack trace is printed)
 */
protected boolean inputValidation(Configuration job, String dir, PathFilter filter) {
    PathFilter effectiveFilter = filter;
    if (effectiveFilter == null) {
        // No filter supplied: let every path through.
        effectiveFilter = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return true;
            }
        };
    }

    Path target = StringUtils.stringToPath(new String[] { dir })[0];
    try {
        FileSystem fs = target.getFileSystem(job);
        FileStatus[] found = fs.globStatus(target, effectiveFilter);
        return found != null && found.length != 0;
    } catch (IOException e) {
        e.printStackTrace();
        return false;
    }
}

From source file:hydrograph.engine.cascading.scheme.avro.CustomAvroScheme.java

License:Apache License

/**
 * Peeks at the source data to obtain a schema when none has been provided.
 * <p>
 * The tap's identifier is globbed; matched directories are expanded up to two
 * levels down. The schema is read from the first entry that the file system
 * reports as a file, so not every file is opened.
 *
 * @param flowProcess
 *            The cascading FlowProcess object for this flow.
 * @param tap
 *            The cascading Tap object. For a CompositeTap only the first child
 *            tap is inspected.
 * @return Schema The schema of the peeked at data, or a schema of type NULL if
 *         the path matches nothing or no file is found.
 * @throws IOException
 *             if the file system cannot be accessed or the file cannot be read.
 */
private Schema getSourceSchema(FlowProcess<? extends JobConf> flowProcess, Tap tap) throws IOException {

    if (tap instanceof CompositeTap) {
        tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
    }
    final String path = tap.getIdentifier();
    Path p = new Path(path);
    final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());

    // globStatus may return null (e.g. a non-glob path that does not exist);
    // wrapping that in Arrays.asList would throw a NullPointerException.
    FileStatus[] globbed = fs.globStatus(p, filter);
    if (globbed == null) {
        return Schema.create(Schema.Type.NULL);
    }

    // Get all the input dirs
    List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(globbed));
    // Now get all the things that are one level down. Iterate over a snapshot
    // because the loop appends to statuses.
    for (FileStatus status : new LinkedList<FileStatus>(statuses)) {
        if (status.isDir()) {
            for (FileStatus child : Arrays.asList(fs.listStatus(status.getPath(), filter))) {
                if (child.isDir()) {
                    statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter)));
                } else if (fs.isFile(child.getPath())) {
                    statuses.add(child);
                }
            }
        }
    }
    for (FileStatus status : statuses) {
        Path statusPath = status.getPath();
        if (fs.isFile(statusPath)) {
            // no need to open them all
            InputStream stream = null;
            DataFileStream reader = null;
            try {
                stream = new BufferedInputStream(fs.open(statusPath));
                reader = new DataFileStream(stream, new GenericDatumReader());
                return reader.getSchema();
            } finally {
                // If the reader was never created, the stream must be closed
                // directly; otherwise closing the reader is enough.
                if (reader == null) {
                    if (stream != null) {
                        stream.close();
                    }
                } else {
                    reader.close();
                }
            }

        }
    }
    // couldn't find any Avro files, return null schema
    return Schema.create(Schema.Type.NULL);
}

From source file:it.crs4.seal.read_sort.MergeAlignments.java

License:Open Source License

/**
 * Resolves {@code userInput} into the source paths to merge.
 * If the input names an existing directory, it is turned into the glob
 * {@code <dir>/*} first. The glob is evaluated with a SourcePathFilter
 * (defined elsewhere; per the original note it drops paths starting with '_',
 * and the glob result is already sorted).
 *
 * @return the matched source paths, never empty
 * @throws IllegalArgumentException if the path does not exist or nothing is selected
 * @throws Exception on any file system failure
 */
private Path[] getSourcePaths() throws Exception {
    Path globPath = new Path(userInput);
    FileSystem fileSystem = globPath.getFileSystem(getConf());
    if (fileSystem.exists(globPath) && fileSystem.getFileStatus(globPath).isDir()) {
        String prefix = "source path " + globPath + " is a directory.  Globbing with ";
        globPath = new Path(globPath, "*");
        log.info(prefix + globPath);
    }

    // For a pattern-free path that does not exist, globStatus yields null and so
    // does stat2Paths, which is what the null check below relies on.
    FileStatus[] globbed = fileSystem.globStatus(globPath, new SourcePathFilter());
    Path[] sources = FileUtil.stat2Paths(globbed);
    if (sources == null) {
        throw new IllegalArgumentException("Source path " + globPath.makeQualified(fileSystem) + " doesn't exist");
    }

    if (log.isDebugEnabled()) {
        log.debug("Sources:");
        for (Path source : sources) {
            log.debug(source);
        }
    }

    if (sources.length == 0) {
        throw new IllegalArgumentException("no source files selected");
    }

    log.info("Merging " + sources.length + " files.");
    return sources;
}

From source file:ml.shifu.guagua.hadoop.GuaguaMRUnitDriver.java

License:Apache License

/**
 * Lists the input files described by {@code input}.
 * Each path obtained from {@code getInputPaths(input)} is globbed with the
 * {@code hiddenFileFilter}; a matched directory contributes its direct children
 * (same filter), any other match is returned as is.
 * Subclasses may override to, e.g., select only files matching a regular expression.
 *
 * @param conf
 *            configuration used to resolve the file system of each path
 * @param input
 *            the input specification handed to {@code getInputPaths}
 * @return list of FileStatus objects collected from all input paths
 * @throws IOException
 *             if there are zero input paths, or if any path does not exist or
 *             matches nothing (the message then lists every collected error).
 */
protected List<FileStatus> listStatus(Configuration conf, String input) throws IOException {
    Path[] inputPaths = getInputPaths(input);
    if (inputPaths.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Only the hiddenFileFilter is registered here; there is no user provided filter.
    List<PathFilter> filterChain = new ArrayList<PathFilter>();
    filterChain.add(hiddenFileFilter);
    PathFilter combinedFilter = new MultiPathFilter(filterChain);

    List<FileStatus> collected = new ArrayList<FileStatus>();
    List<IOException> problems = new ArrayList<IOException>();
    for (Path inputPath : inputPaths) {
        FileSystem fs = inputPath.getFileSystem(conf);
        FileStatus[] globbed = fs.globStatus(inputPath, combinedFilter);
        if (globbed == null) {
            problems.add(new IOException("Input path does not exist: " + inputPath));
            continue;
        }
        if (globbed.length == 0) {
            problems.add(new IOException("Input Pattern " + inputPath + " matches 0 files"));
            continue;
        }
        for (FileStatus entry : globbed) {
            if (!entry.isDirectory()) {
                collected.add(entry);
                continue;
            }
            // Directory: list one level down with the same filter.
            FileStatus[] children = fs.listStatus(entry.getPath(), combinedFilter);
            for (FileStatus child : children) {
                collected.add(child);
            }
        }
    }

    // All problems are reported in a single exception.
    if (!problems.isEmpty()) {
        throw new IOException(problems.toString());
    }
    return collected;
}

From source file:ml.shifu.guagua.mapreduce.GuaguaMRUnitDriver.java

License:Apache License

/**
 * Collects the FileStatus of every input file described by {@code input}.
 * Paths come from {@code getInputPaths(input)} and are globbed through the
 * {@code hiddenFileFilter}. Matches that are directories are expanded to their
 * direct children using the same filter; other matches are kept unchanged.
 * Subclasses may override to, e.g., select only files matching a regular expression.
 *
 * @param conf
 *            configuration used to obtain the file system of each path
 * @param input
 *            the input specification handed to {@code getInputPaths}
 * @return list of FileStatus objects gathered from all input paths
 * @throws IOException
 *             if zero items or any IOException for input files.
 */
protected List<FileStatus> listStatus(Configuration conf, String input) throws IOException {
    final Path[] paths = getInputPaths(input);
    if (paths.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // The filter chain holds just the hiddenFileFilter; no user filter exists here.
    final List<PathFilter> chain = new ArrayList<PathFilter>();
    chain.add(hiddenFileFilter);
    final PathFilter pathFilter = new MultiPathFilter(chain);

    final List<FileStatus> statuses = new ArrayList<FileStatus>();
    final List<IOException> failures = new ArrayList<IOException>();
    for (int index = 0; index < paths.length; index++) {
        final Path current = paths[index];
        final FileSystem fs = current.getFileSystem(conf);
        final FileStatus[] hits = fs.globStatus(current, pathFilter);

        if (hits == null) {
            failures.add(new IOException("Input path does not exist: " + current));
        } else if (hits.length == 0) {
            failures.add(new IOException("Input Pattern " + current + " matches 0 files"));
        } else {
            for (int h = 0; h < hits.length; h++) {
                final FileStatus hit = hits[h];
                if (hit.isDir()) {
                    // Replace a directory by its filtered, first-level content.
                    final FileStatus[] content = fs.listStatus(hit.getPath(), pathFilter);
                    for (int c = 0; c < content.length; c++) {
                        statuses.add(content[c]);
                    }
                } else {
                    statuses.add(hit);
                }
            }
        }
    }

    // Fail once, with every recorded problem in the message.
    if (!failures.isEmpty()) {
        throw new IOException(failures.toString());
    }
    return statuses;
}

From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java

License:Apache License

/**
 * Lists the files under the job's input paths for cross validation.
 * Each path is globbed through the {@code hiddenFileFilter} plus the job's input
 * path filter (if any). Children of a matched directory are kept when the filter
 * accepts them; an accepted sub-directory is handed to
 * {@code addInputPathRecursive} when the
 * {@code mapreduce.input.fileinputformat.input.dir.recursive} setting is true.
 *
 * @param job the job whose input paths are listed
 * @return the collected FileStatus objects
 * @throws IOException if the job has no input paths; an InvalidInputException
 *         carrying every collected error is thrown when a path does not exist
 *         or a pattern matches nothing
 */
@SuppressWarnings("deprecation")
protected List<FileStatus> listCrossValidationStatus(JobContext job) throws IOException {
    Path[] inputPaths = getInputPaths(job);
    if (inputPaths.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), inputPaths, job.getConfiguration());

    // Whether sub-directories should be descended into.
    boolean descend = job.getConfiguration()
            .getBoolean("mapreduce.input.fileinputformat.input.dir.recursive", false);

    // Chain the hiddenFileFilter with the user provided filter (if any).
    List<PathFilter> filterChain = new ArrayList<PathFilter>();
    filterChain.add(hiddenFileFilter);
    PathFilter userFilter = getInputPathFilter(job);
    if (userFilter != null) {
        filterChain.add(userFilter);
    }
    PathFilter combinedFilter = new MultiPathFilter(filterChain);

    List<FileStatus> collected = new ArrayList<FileStatus>();
    List<IOException> problems = new ArrayList<IOException>();
    for (Path inputPath : inputPaths) {
        FileSystem fs = inputPath.getFileSystem(job.getConfiguration());
        FileStatus[] globbed = fs.globStatus(inputPath, combinedFilter);
        if (globbed == null) {
            problems.add(new IOException("Input path does not exist: " + inputPath));
            continue;
        }
        if (globbed.length == 0) {
            problems.add(new IOException("Input Pattern " + inputPath + " matches 0 files"));
            continue;
        }
        for (FileStatus entry : globbed) {
            if (!entry.isDir()) {
                collected.add(entry);
                continue;
            }
            // The directory is listed unfiltered; the filter is applied per child.
            for (FileStatus child : fs.listStatus(entry.getPath())) {
                if (!combinedFilter.accept(child.getPath())) {
                    continue;
                }
                if (descend && child.isDir()) {
                    addInputPathRecursive(collected, fs, child.getPath(), combinedFilter);
                } else {
                    collected.add(child);
                }
            }
        }
    }

    if (!problems.isEmpty()) {
        throw new InvalidInputException(problems);
    }
    LOG.info("Total validation paths to process : " + collected.size());
    return collected;
}

From source file:org.apache.falcon.catalog.CatalogPartitionHandler.java

License:Apache License

/**
 * Synchronises the partitions found under {@code staticPath} with those
 * returned by {@code listPartitions}.
 * One glob level is searched for each partition column not covered by
 * {@code staticPartition}. Partitions only returned by {@code listPartitions}
 * are dropped, partitions only found on the file system are added, and
 * partitions present in both are updated. Nothing happens when
 * {@code staticPath} does not exist.
 *
 * @param conf            configuration used for the file system and catalog calls
 * @param storage         catalog storage the partitions belong to
 * @param staticPath      base path below which the dynamic partitions live
 * @param staticPartition values of the static (already known) partition columns
 * @throws FalconException wrapping any IOException raised by the file system
 */
private void registerPartitions(Configuration conf, CatalogStorage storage, Path staticPath,
        List<String> staticPartition) throws FalconException {
    try {
        FileSystem fileSystem = HadoopClientFactory.get().createProxiedFileSystem(conf);
        if (!fileSystem.exists(staticPath)) {
            // Do nothing if the output path doesn't exist
            return;
        }

        List<String> columns = getPartitionColumns(conf, storage);
        int dynamicDepth = columns.size() - staticPartition.size();
        // One "*" path component per dynamic partition column.
        Path globPath = dynamicDepth > 0
                ? new Path(staticPath, StringUtils.repeat("*", "/", dynamicDepth))
                : staticPath;

        // Figure out the dynamic partitions from the directories on hdfs.
        // NOTE(review): globStatus can return null for a non-glob path; that case
        // is not handled here — confirm whether PATH_FILTER can cause it.
        FileStatus[] located = fileSystem.globStatus(globPath, PATH_FILTER);
        Map<List<String>, String> discovered = new HashMap<List<String>, String>();
        for (FileStatus entry : located) {
            List<String> dynamicValues = getDynamicPartitions(entry.getPath(), staticPath);
            List<String> values = new ArrayList<String>(staticPartition);
            values.addAll(dynamicValues);
            LOG.debug("Final partition - " + values);
            discovered.put(values, entry.getPath().toString());
        }

        List<List<String>> registered = listPartitions(conf, storage, staticPartition);
        Collection<List<String>> wanted = discovered.keySet();

        // Work out all three sets before touching the catalog.
        Collection<List<String>> toDrop = CollectionUtils.subtract(registered, wanted);
        Collection<List<String>> toAdd = CollectionUtils.subtract(wanted, registered);
        Collection<List<String>> toUpdate = CollectionUtils.intersection(registered, wanted);

        for (List<String> stale : toDrop) {
            dropPartitions(conf, storage, stale);
        }
        for (List<String> fresh : toAdd) {
            addPartition(conf, storage, fresh, discovered.get(fresh));
        }
        for (List<String> kept : toUpdate) {
            updatePartition(conf, storage, kept, discovered.get(kept));
        }
    } catch (IOException e) {
        throw new FalconException(e);
    }
}

From source file:org.apache.giraph.io.formats.GiraphFileInputFormat.java

License:Apache License

/**
 * Common method for listing vertex/edge input directories.
 *
 * @param job The job/*  w ww. ja va2  s.c o  m*/
 * @param dirs list of vertex/edge input paths
 * @return Array of FileStatus objects
 * @throws IOException
 */
private List<FileStatus> listStatus(JobContext job, Path[] dirs) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    /*if[HADOOP_NON_SECURE]
    else[HADOOP_NON_SECURE]
        // get tokens for all the required FileSystems..
        TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs,
            job.getConfiguration());
    end[HADOOP_NON_SECURE]*/

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the HIDDEN_FILE_FILTER and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(HIDDEN_FILE_FILTER);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    Collections.addAll(result, fs.listStatus(globStat.getPath(), inputFilter));
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}