Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus


On this page you can find example usage for org.apache.hadoop.fs FileSystem globStatus.


public FileStatus[] globStatus(Path pathPattern) throws IOException 

Source Link


Return all the files that match pathPattern and are not checksum files.


From source file:com.linkedin.cubert.utils.CommonUtils.java

License:Open Source License

public static Path getAFileInPath(Configuration conf, Path path, String suffix) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (fs.getFileStatus(path).isDir()) {
        Path globPath = new Path(path, "*." + suffix);
        FileStatus[] allFiles = fs.globStatus(globPath);
        if (allFiles.length == 0) {
            throw new IOException("there are no files in " + path.toString());
        }//from   ww  w.  jav  a 2 s . co m

        path = allFiles[0].getPath();

    print.f("Obtaining schema of %s file %s", suffix, path.toString());

    return path;

From source file:com.linkedin.cubert.utils.FileSystemUtils.java

License:Open Source License

public static List<Path> getGlobPaths(FileSystem fs, Path path) throws IOException {
    List<Path> paths = new ArrayList<Path>();

    FileStatus[] fileStatus = fs.globStatus(path);

    if (fileStatus == null)
        throw new IOException("Cannot determine paths at " + path.toString());

    for (FileStatus status : fileStatus) {
        paths.add(status.getPath());/*  www.ja v  a2  s  .  co  m*/

    return paths;

From source file:com.linkedin.cubert.utils.FileSystemUtils.java

License:Open Source License

public static Path getLatestPath(FileSystem fs, Path path) throws IOException {
    String pathStr = path.toString();

    // Return the same path, if there is no "#LATEST" within it
    if (!pathStr.contains("#LATEST"))
        return path;

    // replace all #LATEST with glob "*"
    pathStr = pathStr.replaceAll("#LATEST", "*");

    FileStatus[] fileStatus = fs.globStatus(new Path(pathStr));

    if (fileStatus == null || fileStatus.length == 0)
        throw new IOException("Cannot determine paths at " + pathStr);

    String latestPath = null;/*from  www  . j av  a 2 s .c om*/
    for (FileStatus status : fileStatus) {
        String thisPath = status.getPath().toString();
        if (latestPath == null || thisPath.compareTo(latestPath) > 0)
            latestPath = thisPath;

    return new Path(latestPath);

From source file:com.linkedin.pinot.hadoop.job.SegmentCreationJob.java

License:Apache License

// Prepares the staging and output directories, writes one descriptor file per input data
// file, configures a Hadoop Job over those descriptors, and finally moves the produced
// segment tar files into the output directory.
// NOTE(review): this excerpt is missing several closing braces and appears to have
// statements elided (see the notes below); restore them from the upstream source
// before compiling.
public void run() throws Exception {
    LOGGER.info("Starting {}", getClass().getSimpleName());

    FileSystem fs = FileSystem.get(getConf());
    Path inputPathPattern = new Path(_inputSegmentDir);

    // Start from a clean staging directory with an "input" subdirectory.
    if (fs.exists(new Path(_stagingDir))) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(new Path(_stagingDir), true);
    }
    fs.mkdirs(new Path(_stagingDir));
    fs.mkdirs(new Path(_stagingDir + "/input/"));

    // Remove any previous output directory.
    // NOTE(review): the closing brace of this if-block is not present in the excerpt;
    // the indentation suggests mkdirs below is meant to run unconditionally.
    if (fs.exists(new Path(_outputDir))) {
        LOGGER.warn("Found the output folder, deleting it");
        fs.delete(new Path(_outputDir), true);
    fs.mkdirs(new Path(_outputDir));

    // Expand the input pattern and gather the data files under every match.
    // NOTE(review): globStatus may return null; the loop below does not guard for it.
    List<FileStatus> inputDataFiles = new ArrayList<FileStatus>();
    FileStatus[] fileStatusArr = fs.globStatus(inputPathPattern);
    for (FileStatus fileStatus : fileStatusArr) {
        inputDataFiles.addAll(getDataFilesFromPath(fs, fileStatus.getPath()));

    // One ".txt" file is created in the staging input directory per data file; its name is
    // the data file's path with '.', '/' and ':' replaced by '_'.
    // NOTE(review): in this excerpt completeFilePath is never written to the stream and
    // the stream is never closed -- the statements doing so appear to be elided.
    for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
        FileStatus file = inputDataFiles.get(seqId);
        String completeFilePath = " " + file.getPath().toString() + " " + seqId;
        Path newOutPutFile = new Path((_stagingDir + "/input/"
                + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt"));
        FSDataOutputStream stream = fs.create(newOutPutFile);

    Job job = Job.getInstance(getConf());



    // NOTE(review): the body of this if-block is not present in the excerpt.
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {



    // The job reads the descriptor files and writes under <staging>/output/.
    FileInputFormat.addInputPath(job, new Path(_stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(_stagingDir + "/output/"));

    // The map-count setting is the number of input data files; the schema is passed
    // to the job as a JSON string.
    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set("data.schema", new ObjectMapper().writeValueAsString(_dataSchema));

    // Copy every entry of _properties into the job configuration.
    for (Object key : _properties.keySet()) {
        job.getConfiguration().set(key.toString(), _properties.getProperty(key.toString()));

    if (_depsJarPath != null && _depsJarPath.length() > 0) {
        addDepsJarToDistributedCache(new Path(_depsJarPath), job);

    // Submit the job for execution.
    // NOTE(review): no submit/wait call is visible in this excerpt before the success check.
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);

    // Move each entry of <staging>/output/segmentTar into the output directory, keeping its name.
    LOGGER.info("Moving Segment Tar files from {} to: {}", _stagingDir + "/output/segmentTar", _outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(_stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        fs.rename(segment.getPath(), new Path(_outputDir, segment.getPath().getName()));

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", _stagingDir);
    fs.delete(new Path(_stagingDir), true);

From source file:com.linkedin.pinot.hadoop.job.SegmentTarPushJob.java

License:Apache License

public void run() throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path(_segmentPath);
    FileStatus[] fileStatusArr = fs.globStatus(path);
    for (FileStatus fileStatus : fileStatusArr) {
        if (fileStatus.isDirectory()) {
            pushDir(fs, fileStatus.getPath());
        } else {// www .  j  a  v a2  s .  co m
            pushOneTarFile(fs, fileStatus.getPath());


From source file:com.linkedin.thirdeye.hadoop.push.SegmentPushPhase.java

License:Apache License

// Reads the push settings, pushes every entry matched by the segment path, and, when the
// upload succeeded and a segment name is known, asks the controller to delete overlapping
// segments from the table.
// NOTE(review): this excerpt is missing its closing braces; restore them from the upstream
// source before compiling.
public void run() throws Exception {
    Configuration configuration = new Configuration();
    FileSystem fs = FileSystem.get(configuration);

    String segmentPath = getAndSetConfiguration(configuration, SEGMENT_PUSH_INPUT_PATH);
    LOGGER.info("Segment path : {}", segmentPath);
    // NOTE(review): the next statement has no terminator in this excerpt -- the rest of the
    // expression appears to have been cut off; confirm against the upstream source.
    hosts = getAndSetConfiguration(configuration, SEGMENT_PUSH_CONTROLLER_HOSTS)
    port = getAndSetConfiguration(configuration, SEGMENT_PUSH_CONTROLLER_PORT);
    tablename = getAndCheck(ThirdEyeConfigProperties.THIRDEYE_TABLE_NAME.toString());

    // Expand the segment path; directories go to pushDir, everything else to pushOneTarFile.
    // NOTE(review): globStatus may return null; the loop below does not guard for it.
    Path path = new Path(segmentPath);
    FileStatus[] fileStatusArr = fs.globStatus(path);
    for (FileStatus fileStatus : fileStatusArr) {
        if (fileStatus.isDirectory()) {
            pushDir(fs, fileStatus.getPath());
        } else {
            pushOneTarFile(fs, fileStatus.getPath());

    // uploadSuccess and segmentName are not assigned in this method; presumably they are
    // set by the push helpers above -- TODO confirm.
    if (uploadSuccess && segmentName != null) {
        segmentPushControllerAPIs = new SegmentPushControllerAPIs(hosts, port);
        LOGGER.info("Deleting segments overlapping to {} from table {}  ", segmentName, tablename);
        segmentPushControllerAPIs.deleteOverlappingSegments(tablename, segmentName);


From source file:com.linkedin.whiteelephant.util.JobStatsProcessing.java

License:Apache License

// Builds the list of per-day processing tasks for a cluster's logs.
//
// Walks backwards one day at a time, starting from yesterday, for
// max(numDays, numDaysForced) days. For each day it globs
// <logsRoot>/<clusterName>/daily/*/<yyyy>/<MMdd>/*.log and, when files exist, adds a task
// unless the run is incremental, the day's output path already exists, and the day is
// outside the first numDaysForced days.
//
// NOTE(review): this excerpt is missing its closing braces and appears to have statements
// elided (see the notes below); restore them from the upstream source before compiling.
public static List<ProcessingTask> getTasks(FileSystem fs, String logsRoot, String clusterName,
        String outputPathRoot, boolean incremental, int numDays, int numDaysForced) throws IOException {
    Calendar cal = Calendar.getInstance(timeZone);

    SimpleDateFormat yearFormat = new SimpleDateFormat("yyyy");
    SimpleDateFormat dayFormat = new SimpleDateFormat("MMdd");
    SimpleDateFormat idFormat = new SimpleDateFormat("yyyy-MM-dd");

    // NOTE(review): only yearFormat gets the time zone in this excerpt; dayFormat and
    // idFormat keep the default zone here -- confirm whether their setters were elided.
    yearFormat.setTimeZone(timeZone);

    List<ProcessingTask> processingTasks = new ArrayList<ProcessingTask>();

    // Forced days are always covered, even when numDays is smaller.
    numDays = Math.max(numDays, numDaysForced);

    // Start processing previous day of data since current day isn't yet finished.  Unless we are aggregating hourly data there is no point.
    cal.add(Calendar.DAY_OF_MONTH, -1);

    // NOTE(review): numPaths is never incremented in this excerpt, so the summary below
    // would always report 0 paths.
    int numPaths = 0;
    long totalLength = 0;
    for (int i = 0; i < numDays; i++, cal.add(Calendar.DAY_OF_MONTH, -1)) {
        Date date = cal.getTime();

        String pathFormat = String.format("%s/%s/daily/*/%s/%s/*.log", logsRoot, clusterName,
                yearFormat.format(date), dayFormat.format(date));
        FileStatus[] stats = fs.globStatus(new Path(pathFormat));

        // NOTE(review): msg is built and appended to but never printed in this excerpt.
        StringBuilder msg = new StringBuilder(pathFormat + " => " + stats.length + " files");

        String outputPathForDay = String.format("%s/%s/%s/%s", outputPathRoot, clusterName,
                yearFormat.format(date), dayFormat.format(date));

        if (stats.length > 0) {
            // Process the day when not incremental, when its output is missing, or when it
            // falls within the first numDaysForced days.
            if (!incremental || !fs.exists(new Path(outputPathForDay)) || i < numDaysForced) {
                for (FileStatus stat : stats) {
                    totalLength += stat.getLen();

                String id = clusterName + "-" + idFormat.format(date);


                // totalLength is declared outside the day loop, so the value passed here is
                // the running total over all days processed so far, not this day's size.
                processingTasks.add(new ProcessingTask(id, pathFormat, outputPathForDay, totalLength));
            } else if (incremental && fs.exists(new Path(outputPathForDay))) {
                msg.append(" (skipping)");

    // Integer division: the gigabyte figure is truncated toward zero.
    System.out.println("Found " + numPaths + " paths to process, totalling " + totalLength + " bytes ("
            + (totalLength / (1024 * 1024 * 1024)) + " gigabytes)");

    return processingTasks;

From source file:com.moz.fiji.mapreduce.kvstore.lib.FileStoreHelper.java

License:Apache License

/**
 * Deserializes file- and DistributedCache-specific properties associated
 * with the KeyValueStore that owns this FileStoreHelper from the specified configuration.
 * <p>This retains a reference to the KeyValueStoreConfiguration's backing Configuration
 * instance to use when opening files specified by this configuration.</p>
 * @param conf the configuration to read.
 * @throws IOException if there's an error deserializing the configuration.
 */
// NOTE(review): this excerpt is missing the closing braces/parentheses of the anonymous
// Lists.Func classes, of the if/else, and of the method itself; restore them from the
// upstream source before compiling.
public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {

    // An empty prefix (the default) selects the plain path-list branch below.
    mDCachePrefix = conf.get(CONF_DCACHE_PREFIX_KEY, "");
    LOG.debug("Input dCachePrefix: " + mDCachePrefix);
    if (mDCachePrefix.isEmpty()) {
        // Read an ordinary list of files from the Configuration.
        // These may include directories and globs to expand.
        // Each configured string is wrapped in a Path as-is; no expansion happens here.
        mInputPaths = Lists.map(Arrays.asList(conf.getStrings(CONF_PATHS_KEY, new String[0])),
                new Lists.Func<String, Path>() {
                    public Path eval(String in) {
                        LOG.debug("File input: " + in);
                        return new Path(in);
    } else {
        // Use the dcache prefix to get the names of the files for this store.
        // The symlinks are already present in the working dir of the task.
        final FileSystem localFs = FileSystem.getLocal(conf.getDelegate());
        // Match every local entry named "<prefix>-*".
        FileStatus[] statuses = localFs.globStatus(new Path(mDCachePrefix + "-*"));
        // Both a null result and an empty result are treated as "no files".
        if (null == statuses || statuses.length == 0) {
            throw new IOException("No files associated with the job in the DistributedCache");

        // Get the (absolute) input file paths to use.
        mInputPaths = Lists.map(Arrays.asList(statuses), new Lists.Func<FileStatus, Path>() {
            public Path eval(FileStatus status) {
                Path out = status.getPath().makeQualified(localFs);
                LOG.debug("Loaded from DistributedCache: " + out);
                return out;

    // If we are initializing a client-side instance to later serialize, the user may have
    // specified HDFS files, but also an intent to put the files in the DistributedCache. Set
    // this flag now, which will generate mDCachePrefix when addToConfiguration() is called
    // later.
    mUseDCache = conf.getBoolean(CONF_USE_DCACHE_KEY, USE_DCACHE_DEFAULT);

From source file:com.revolutionanalytics.hadoop.hdfs.DelayedExceptionThrowing.java

License:Apache License

final void globAndProcess(Path srcPattern, FileSystem srcFs) throws IOException {
    ArrayList<IOException> exceptions = new ArrayList<IOException>();
    for (Path p : FileUtil.stat2Paths(srcFs.globStatus(srcPattern), srcPattern))
        try {/*from  w w w  . j a  va2 s.  co m*/
            process(p, srcFs);
        } catch (IOException ioe) {

    if (!exceptions.isEmpty())
        if (exceptions.size() == 1)
            throw exceptions.get(0);
            throw new IOException("Multiple IOExceptions: " + exceptions);

From source file:com.revolutionanalytics.hadoop.hdfs.FileUtils.java

License:Apache License

private static void ls__(FileSystem srcFS, String path, ArrayList<String> lsco, boolean dorecurse)
        throws IOException, FileNotFoundException {
    Path spath = new Path(path);
    FileStatus[] srcs;/*from ww  w .  ja v  a2 s  .  c o m*/
    srcs = srcFS.globStatus(spath);
    if (srcs == null || srcs.length == 0) {
        throw new FileNotFoundException("Cannot access " + path + ": No such file or directory.");
    if (srcs.length == 1 && srcs[0].isDir())
        srcs = srcFS.listStatus(srcs[0].getPath());
    Calendar c = Calendar.getInstance();
    for (FileStatus status : srcs) {
        StringBuilder sb = new StringBuilder();
        boolean idir = status.isDir();
        String x = idir ? "d" : "-";
        if (dorecurse && idir)
            ls__(srcFS, status.getPath().toUri().getPath(), lsco, dorecurse);
        else {




            Date d = new Date(status.getModificationTime());
