Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException 

Document

Return an array of FileStatus objects whose path names match pathPattern and are accepted by the user-supplied path filter.
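
A minimal sketch of how this overload is typically called is shown below. The path pattern and the filter are illustrative only and do not come from any of the examples that follow.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical glob pattern; adjust to your own layout.
        Path pattern = new Path("/data/logs/2023-*/*.csv");
        FileSystem fs = pattern.getFileSystem(conf);

        // Skip Hadoop bookkeeping outputs such as _SUCCESS and dot files.
        PathFilter filter = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                String name = path.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        };

        // globStatus returns null when a non-glob path does not exist,
        // and an empty array when a glob pattern matches nothing.
        FileStatus[] matches = fs.globStatus(pattern, filter);
        if (matches == null) {
            System.err.println("Input path does not exist: " + pattern);
        } else {
            for (FileStatus status : matches) {
                System.out.println(status.getPath());
            }
        }
    }
}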

Usage

From source file:gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java

License:Open Source License

/** List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 * 
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}
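
Several snippets on this page reference a hiddenFileFilter and a MultiPathFilter without showing them. The sketch below is an assumption modeled on the private helpers in Hadoop's FileInputFormat; the actual implementations in each project may differ.

// Assumed helper (requires java.util.List plus the org.apache.hadoop.fs imports):
// skips Hadoop bookkeeping outputs such as _SUCCESS and dot files.
private static final PathFilter hiddenFileFilter = new PathFilter() {
    @Override
    public boolean accept(Path p) {
        String name = p.getName();
        return !name.startsWith("_") && !name.startsWith(".");
    }
};

// Assumed helper: accepts a path only if every wrapped filter accepts it.
private static class MultiPathFilter implements PathFilter {
    private final List<PathFilter> filters;

    public MultiPathFilter(List<PathFilter> filters) {
        this.filters = filters;
    }

    @Override
    public boolean accept(Path path) {
        for (PathFilter filter : filters) {
            if (!filter.accept(path)) {
                return false;
            }
        }
        return true;
    }
}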

From source file:hitune.analysis.mapreduce.processor.AnalysisProcessor.java

License:Apache License

/**
 * Move the TEMP output folder to the final (user-defined) one.
 * If there are multiple files under one job's output folder, merge the output into one file,
 * then rename the folder to the final one.
 * @param job
 * @param output
 * @param result
 */
protected void moveResults(JobConf job, String output, String result) {
    try {
        FileSystem fs = FileSystem.get(job);
        log.debug("move results: " + result);
        Path src = new Path(result + "/" + "*.csv*");
        Path dst = new Path(output);
        if (!fs.exists(dst)) {
            fs.mkdirs(dst);
        }
        FileStatus[] matches = fs.globStatus(src, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                // TODO Auto-generated method stub
                return true;
            }
        });
        if (matches != null && matches.length != 0) {
            if (matches.length > 1) {
                //multiple output files
                String[] args = new String[2];
                args[0] = result;
                args[1] = "_" + result;
                fs.delete(new Path("_" + result));
                //merge multiple output files into one file
                ToolRunner.run(new MergeOutput(this.conf), args);
                fs.delete(new Path(result));
                fs.rename(new Path("_" + result), new Path(result));
            }

            matches = fs.globStatus(src, new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    // TODO Auto-generated method stub
                    return true;
                }
            });

            for (FileStatus file : matches) {
                String filename = file.getPath().getName();
                filename = filename.substring(0, filename.indexOf("-"));
                log.debug("move file:" + filename);
                Path toFile = new Path(output + "/" + filename);
                if (fs.exists(toFile)) {
                    fs.delete(toFile);
                }
                fs.rename(file.getPath(), toFile);
                fs.delete(file.getPath().getParent(), true);
                FileStatus[] tmpDirs = fs.listStatus(file.getPath().getParent().getParent());
                if (tmpDirs == null || tmpDirs.length == 0) {
                    fs.delete(file.getPath().getParent().getParent(), true);
                }
                break;
            }
        } else {
            MOVE_DONE = false;
        }
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        MOVE_DONE = false;
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    MOVE_DONE = true;
}
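
Note that both anonymous PathFilters above accept every path, so each call is effectively equivalent to the single-argument overload:

    FileStatus[] matches = fs.globStatus(src);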

From source file:hitune.analysis.mapreduce.processor.FileFilter.ChukwaFileFilter.java

License:Apache License

protected boolean inputValidation(Configuration job, String dir, PathFilter filter) {
    boolean result = false;
    if (filter == null) {
        filter = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                // TODO Auto-generated method stub
                return true;
            }

        };
    }
    Path[] p = StringUtils.stringToPath(new String[] { dir });
    try {
        FileSystem fs = p[0].getFileSystem(job);
        FileStatus[] matches = fs.globStatus(p[0], filter);
        if (matches != null && matches.length != 0) {
            result = true;
        }
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    return result;
}

From source file:hydrograph.engine.cascading.scheme.avro.CustomAvroScheme.java

License:Apache License

/**
 * This method peeks at the source data to get a schema when none has been
 * provided.
 *
 * @param flowProcess
 *            The cascading FlowProcess object for this flow.
 * @param tap
 *            The cascading Tap object.
 * @return Schema The schema of the peeked at data, or Schema.NULL if none
 *         exists.
 */
private Schema getSourceSchema(FlowProcess<? extends JobConf> flowProcess, Tap tap) throws IOException {

    if (tap instanceof CompositeTap) {
        tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
    }
    final String path = tap.getIdentifier();
    Path p = new Path(path);
    final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());
    // Get all the input dirs
    List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(fs.globStatus(p, filter)));
    // Now get all the things that are one level down
    for (FileStatus status : new LinkedList<FileStatus>(statuses)) {
        if (status.isDir())
            for (FileStatus child : Arrays.asList(fs.listStatus(status.getPath(), filter))) {
                if (child.isDir()) {
                    statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter)));
                } else if (fs.isFile(child.getPath())) {
                    statuses.add(child);
                }
            }
    }
    for (FileStatus status : statuses) {
        Path statusPath = status.getPath();
        if (fs.isFile(statusPath)) {
            // no need to open them all
            InputStream stream = null;
            DataFileStream reader = null;
            try {
                stream = new BufferedInputStream(fs.open(statusPath));
                reader = new DataFileStream(stream, new GenericDatumReader());
                return reader.getSchema();
            } finally {
                if (reader == null) {
                    if (stream != null) {
                        stream.close();
                    }
                } else {
                    reader.close();
                }
            }

        }
    }
    // couldn't find any Avro files, return null schema
    return Schema.create(Schema.Type.NULL);
}

From source file:it.crs4.seal.read_sort.MergeAlignments.java

License:Open Source License

private Path[] getSourcePaths() throws Exception {
    Path srcPath = new Path(userInput);
    FileSystem srcFs = srcPath.getFileSystem(getConf());
    if (srcFs.exists(srcPath)) {
        FileStatus stat = srcFs.getFileStatus(srcPath);
        if (stat.isDir()) {
            String msg = "source path " + srcPath + " is a directory.  Globbing with ";
            srcPath = new Path(srcPath, "*");
            log.info(msg + srcPath);
        }
    }

    // Glob source path.  The returned paths are already sorted.  We filter out paths starting
    // with '_' (see SourcePathFilter).
    // If the path doesn't contain a glob pattern, and it doesn't exist, the function will return null.
    Path[] sources = FileUtil.stat2Paths(srcFs.globStatus(srcPath, new SourcePathFilter()));
    if (sources == null)
        throw new IllegalArgumentException("Source path " + srcPath.makeQualified(srcFs) + " doesn't exist");

    if (log.isDebugEnabled()) {
        log.debug("Sources:");
        for (int i = 0; i < sources.length; ++i)
            log.debug(sources[i]);
    }

    if (sources.length == 0)
        throw new IllegalArgumentException("no source files selected");

    log.info("Merging " + sources.length + " files.");

    return sources;
}
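
The SourcePathFilter used above is not included in this snippet. Based on the comment, it presumably skips Hadoop bookkeeping paths; a plausible sketch (an assumption, not necessarily the project's actual class) is:

// Assumed filter: skip paths whose names start with '_' (e.g. _SUCCESS, _logs),
// as described in the comment above.
public static class SourcePathFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        return !path.getName().startsWith("_");
    }
}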

From source file:ml.shifu.guagua.hadoop.GuaguaMRUnitDriver.java

License:Apache License

/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular expression.
 * 
 * @param conf
 *            the configuration used to resolve input paths
 * @param input
 *            the input path(s) to list
 * @return array of FileStatus objects
 * @throws IOException
 *             if zero items.
 * @throws InvalidInputException
 *             If any IOException for input files.
 */
protected List<FileStatus> listStatus(Configuration conf, String input) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(input);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(conf);
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new IOException(errors.toString());
    }
    return result;
}

From source file:ml.shifu.guagua.mapreduce.GuaguaMRUnitDriver.java

License:Apache License

/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular expression.
 * 
 * @param conf
 *            the configuration used to resolve input paths
 * @param input
 *            the input path(s) to list
 * @return array of FileStatus objects
 * @throws IOException
 *             if zero items or any IOException for input files.
 */
protected List<FileStatus> listStatus(Configuration conf, String input) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(input);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(conf);
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new IOException(errors.toString());
    }
    return result;
}

From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java

License:Apache License

@SuppressWarnings("deprecation")
protected List<FileStatus> listCrossValidationStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // Whether we need to recursively look into the directory structure
    boolean recursive = job.getConfiguration().getBoolean("mapreduce.input.fileinputformat.input.dir.recursive",
            false);

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    FileStatus[] fss = fs.listStatus(globStat.getPath());
                    for (FileStatus fileStatus : fss) {
                        if (inputFilter.accept(fileStatus.getPath())) {
                            if (recursive && fileStatus.isDir()) {
                                addInputPathRecursive(result, fs, fileStatus.getPath(), inputFilter);
                            } else {
                                result.add(fileStatus);
                            }
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total validation paths to process : " + result.size());
    return result;
}
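
The addInputPathRecursive helper referenced above is not shown. A plausible sketch, assuming it mirrors the recursive walk in Hadoop's FileInputFormat (the project's actual method may differ):

// Assumed helper: walk a directory tree and collect every file accepted by the filter.
protected void addInputPathRecursive(List<FileStatus> result, FileSystem fs, Path path, PathFilter inputFilter)
        throws IOException {
    for (FileStatus stat : fs.listStatus(path, inputFilter)) {
        if (stat.isDir()) {
            addInputPathRecursive(result, fs, stat.getPath(), inputFilter);
        } else {
            result.add(stat);
        }
    }
}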

From source file:org.apache.falcon.catalog.CatalogPartitionHandler.java

License:Apache License

private void registerPartitions(Configuration conf, CatalogStorage storage, Path staticPath,
        List<String> staticPartition) throws FalconException {
    try {
        FileSystem fs = HadoopClientFactory.get().createProxiedFileSystem(conf);
        if (!fs.exists(staticPath)) {
            //Do nothing if the output path doesn't exist
            return;
        }

        List<String> partitionColumns = getPartitionColumns(conf, storage);
        int dynamicPartCols = partitionColumns.size() - staticPartition.size();
        Path searchPath = staticPath;
        if (dynamicPartCols > 0) {
            searchPath = new Path(staticPath, StringUtils.repeat("*", "/", dynamicPartCols));
        }

        //Figure out the dynamic partitions from the directories on hdfs
        FileStatus[] files = fs.globStatus(searchPath, PATH_FILTER);
        Map<List<String>, String> partitions = new HashMap<List<String>, String>();
        for (FileStatus file : files) {
            List<String> dynamicParts = getDynamicPartitions(file.getPath(), staticPath);
            List<String> partitionValues = new ArrayList<String>(staticPartition);
            partitionValues.addAll(dynamicParts);
            LOG.debug("Final partition - " + partitionValues);
            partitions.put(partitionValues, file.getPath().toString());
        }

        List<List<String>> existPartitions = listPartitions(conf, storage, staticPartition);
        Collection<List<String>> targetPartitions = partitions.keySet();

        Collection<List<String>> partitionsForDrop = CollectionUtils.subtract(existPartitions,
                targetPartitions);
        Collection<List<String>> partitionsForAdd = CollectionUtils.subtract(targetPartitions, existPartitions);
        Collection<List<String>> partitionsForUpdate = CollectionUtils.intersection(existPartitions,
                targetPartitions);

        for (List<String> partition : partitionsForDrop) {
            dropPartitions(conf, storage, partition);
        }

        for (List<String> partition : partitionsForAdd) {
            addPartition(conf, storage, partition, partitions.get(partition));
        }

        for (List<String> partition : partitionsForUpdate) {
            updatePartition(conf, storage, partition, partitions.get(partition));
        }
    } catch (IOException e) {
        throw new FalconException(e);
    }
}
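
Note: assuming the commons-lang StringUtils (whose three-argument repeat joins the repeated string with a separator), StringUtils.repeat("*", "/", 3) yields "*/*/*", so the glob passed to globStatus matches exactly one directory level per dynamic partition column under staticPath.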

From source file:org.apache.giraph.io.formats.GiraphFileInputFormat.java

License:Apache License

/**
 * Common method for listing vertex/edge input directories.
 *
 * @param job The job
 * @param dirs list of vertex/edge input paths
 * @return Array of FileStatus objects
 * @throws IOException
 */
private List<FileStatus> listStatus(JobContext job, Path[] dirs) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    /*if[HADOOP_NON_SECURE]
    else[HADOOP_NON_SECURE]
        // get tokens for all the required FileSystems..
        TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs,
            job.getConfiguration());
    end[HADOOP_NON_SECURE]*/

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the HIDDEN_FILE_FILTER and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(HIDDEN_FILE_FILTER);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    Collections.addAll(result, fs.listStatus(globStat.getPath(), inputFilter));
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}