Example usage for org.apache.hadoop.fs FileSystem globStatus

Introduction

This page lists example usages of org.apache.hadoop.fs.FileSystem.globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
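
Before the project-specific examples below, here is a minimal, self-contained sketch of calling globStatus directly. The pattern string and paths are hypothetical. Depending on the Hadoop version and on whether the pattern contains wildcards, globStatus may report "no match" either as null or as an empty array, so it is safest to check for both, as several of the examples below also do.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical pattern: all part files under any partition of /data/events.
        Path pattern = new Path("/data/events/*/part-*");

        // globStatus may return null or an empty array when nothing matches; handle both.
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches == null || matches.length == 0) {
            System.out.println("No files match " + pattern);
            return;
        }

        for (FileStatus status : matches) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}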

Usage

From source file:com.linkedin.cubert.utils.CommonUtils.java

License:Open Source License

public static Path getAFileInPath(Configuration conf, Path path, String suffix) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (fs.getFileStatus(path).isDir()) {
        Path globPath = new Path(path, "*." + suffix);
        FileStatus[] allFiles = fs.globStatus(globPath);
        if (allFiles.length == 0) {
            throw new IOException("there are no files in " + path.toString());
        }

        path = allFiles[0].getPath();
    }

    print.f("Obtaining schema of %s file %s", suffix, path.toString());

    return path;
}

From source file:com.linkedin.cubert.utils.FileSystemUtils.java

License:Open Source License

public static List<Path> getGlobPaths(FileSystem fs, Path path) throws IOException {
    List<Path> paths = new ArrayList<Path>();

    FileStatus[] fileStatus = fs.globStatus(path);

    if (fileStatus == null)
        throw new IOException("Cannot determine paths at " + path.toString());

    for (FileStatus status : fileStatus) {
        paths.add(status.getPath());
    }

    return paths;
}

From source file:com.linkedin.cubert.utils.FileSystemUtils.java

License:Open Source License

public static Path getLatestPath(FileSystem fs, Path path) throws IOException {
    String pathStr = path.toString();

    // Return the same path, if there is no "#LATEST" within it
    if (!pathStr.contains("#LATEST"))
        return path;

    // replace all #LATEST with glob "*"
    pathStr = pathStr.replaceAll("#LATEST", "*");

    FileStatus[] fileStatus = fs.globStatus(new Path(pathStr));

    if (fileStatus == null || fileStatus.length == 0)
        throw new IOException("Cannot determine paths at " + pathStr);

    String latestPath = null;
    for (FileStatus status : fileStatus) {
        String thisPath = status.getPath().toString();
        if (latestPath == null || thisPath.compareTo(latestPath) > 0)
            latestPath = thisPath;

    }
    return new Path(latestPath);
}

From source file:com.linkedin.pinot.hadoop.job.SegmentCreationJob.java

License:Apache License

public void run() throws Exception {
    LOGGER.info("Starting {}", getClass().getSimpleName());

    FileSystem fs = FileSystem.get(getConf());
    Path inputPathPattern = new Path(_inputSegmentDir);

    if (fs.exists(new Path(_stagingDir))) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(new Path(_stagingDir), true);
    }
    fs.mkdirs(new Path(_stagingDir));
    fs.mkdirs(new Path(_stagingDir + "/input/"));

    if (fs.exists(new Path(_outputDir))) {
        LOGGER.warn("Found the output folder, deleting it");
        fs.delete(new Path(_outputDir), true);
    }
    fs.mkdirs(new Path(_outputDir));

    List<FileStatus> inputDataFiles = new ArrayList<FileStatus>();
    FileStatus[] fileStatusArr = fs.globStatus(inputPathPattern);
    for (FileStatus fileStatus : fileStatusArr) {
        inputDataFiles.addAll(getDataFilesFromPath(fs, fileStatus.getPath()));
    }

    for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
        FileStatus file = inputDataFiles.get(seqId);
        String completeFilePath = " " + file.getPath().toString() + " " + seqId;
        Path newOutPutFile = new Path((_stagingDir + "/input/"
                + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt"));
        FSDataOutputStream stream = fs.create(newOutPutFile);
        stream.writeUTF(completeFilePath);
        stream.flush();
        stream.close();
    }

    Job job = Job.getInstance(getConf());

    job.setJarByClass(SegmentCreationJob.class);
    job.setJobName(_jobName);

    job.setMapperClass(HadoopSegmentCreationMapper.class);

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(_stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(_stagingDir + "/output/"));

    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set("data.schema", new ObjectMapper().writeValueAsString(_dataSchema));

    job.setMaxReduceAttempts(1);
    job.setMaxMapAttempts(0);
    job.setNumReduceTasks(0);
    for (Object key : _properties.keySet()) {
        job.getConfiguration().set(key.toString(), _properties.getProperty(key.toString()));
    }

    if (_depsJarPath != null && _depsJarPath.length() > 0) {
        addDepsJarToDistributedCache(new Path(_depsJarPath), job);
    }

    // Submit the job for execution.
    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Moving Segment Tar files from {} to: {}", _stagingDir + "/output/segmentTar", _outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(_stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        fs.rename(segment.getPath(), new Path(_outputDir, segment.getPath().getName()));
    }

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", _stagingDir);
    fs.delete(new Path(_stagingDir), true);
}

From source file:com.linkedin.pinot.hadoop.job.SegmentTarPushJob.java

License:Apache License

public void run() throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path(_segmentPath);
    FileStatus[] fileStatusArr = fs.globStatus(path);
    for (FileStatus fileStatus : fileStatusArr) {
        if (fileStatus.isDirectory()) {
            pushDir(fs, fileStatus.getPath());
        } else {
            pushOneTarFile(fs, fileStatus.getPath());
        }
    }

}

From source file:com.linkedin.thirdeye.hadoop.push.SegmentPushPhase.java

License:Apache License

public void run() throws Exception {
    Configuration configuration = new Configuration();
    FileSystem fs = FileSystem.get(configuration);

    String segmentPath = getAndSetConfiguration(configuration, SEGMENT_PUSH_INPUT_PATH);
    LOGGER.info("Segment path : {}", segmentPath);
    hosts = getAndSetConfiguration(configuration, SEGMENT_PUSH_CONTROLLER_HOSTS)
            .split(ThirdEyeConstants.FIELD_SEPARATOR);
    port = getAndSetConfiguration(configuration, SEGMENT_PUSH_CONTROLLER_PORT);
    tablename = getAndCheck(ThirdEyeConfigProperties.THIRDEYE_TABLE_NAME.toString());

    Path path = new Path(segmentPath);
    FileStatus[] fileStatusArr = fs.globStatus(path);
    for (FileStatus fileStatus : fileStatusArr) {
        if (fileStatus.isDirectory()) {
            pushDir(fs, fileStatus.getPath());
        } else {
            pushOneTarFile(fs, fileStatus.getPath());
        }
    }

    if (uploadSuccess && segmentName != null) {
        segmentPushControllerAPIs = new SegmentPushControllerAPIs(hosts, port);
        LOGGER.info("Deleting segments overlapping to {} from table {}  ", segmentName, tablename);
        segmentPushControllerAPIs.deleteOverlappingSegments(tablename, segmentName);
    }

}

From source file:com.linkedin.whiteelephant.util.JobStatsProcessing.java

License:Apache License

public static List<ProcessingTask> getTasks(FileSystem fs, String logsRoot, String clusterName,
        String outputPathRoot, boolean incremental, int numDays, int numDaysForced) throws IOException {
    Calendar cal = Calendar.getInstance(timeZone);

    SimpleDateFormat yearFormat = new SimpleDateFormat("yyyy");
    SimpleDateFormat dayFormat = new SimpleDateFormat("MMdd");
    SimpleDateFormat idFormat = new SimpleDateFormat("yyyy-MM-dd");

    yearFormat.setTimeZone(timeZone);
    dayFormat.setTimeZone(timeZone);
    idFormat.setTimeZone(timeZone);

    List<ProcessingTask> processingTasks = new ArrayList<ProcessingTask>();

    numDays = Math.max(numDays, numDaysForced);

    // Start processing previous day of data since current day isn't yet finished.  Unless we are aggregating hourly data there is no point.
    cal.add(Calendar.DAY_OF_MONTH, -1);

    int numPaths = 0;
    long totalLength = 0;
    for (int i = 0; i < numDays; i++, cal.add(Calendar.DAY_OF_MONTH, -1)) {
        Date date = cal.getTime();

        String pathFormat = String.format("%s/%s/daily/*/%s/%s/*.log", logsRoot, clusterName,
                yearFormat.format(date), dayFormat.format(date));
        FileStatus[] stats = fs.globStatus(new Path(pathFormat));

        StringBuilder msg = new StringBuilder(pathFormat + " => " + stats.length + " files");

        String outputPathForDay = String.format("%s/%s/%s/%s", outputPathRoot, clusterName,
                yearFormat.format(date), dayFormat.format(date));

        if (stats.length > 0) {
            if (!incremental || !fs.exists(new Path(outputPathForDay)) || i < numDaysForced) {
                for (FileStatus stat : stats) {
                    totalLength += stat.getLen();
                    numPaths++;
                }

                String id = clusterName + "-" + idFormat.format(date);

                System.out.println(msg);

                processingTasks.add(new ProcessingTask(id, pathFormat, outputPathForDay, totalLength));
            } else if (incremental && fs.exists(new Path(outputPathForDay))) {
                msg.append(" (skipping)");
                System.out.println(msg);
            }
        }
    }

    System.out.println("Found " + numPaths + " paths to process, totalling " + totalLength + " bytes ("
            + (totalLength / (1024 * 1024 * 1024)) + " gigabytes)");

    return processingTasks;
}

From source file:com.moz.fiji.mapreduce.kvstore.lib.FileStoreHelper.java

License:Apache License

/**
 * Deserializes file- and DistributedCache-specific properties associated
 * with the KeyValueStore that owns this FileStoreHelper from the specified configuration.
 *
 * <p>This retains a reference to the KeyValueStoreConfiguration's backing Configuration
 * instance to use when opening files specified by this configuration.</p>
 *
 * @param conf the configuration to read.
 * @throws IOException if there's an error deserializing the configuration.
 */
public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
    setConf(conf.getDelegate());

    mDCachePrefix = conf.get(CONF_DCACHE_PREFIX_KEY, "");
    LOG.debug("Input dCachePrefix: " + mDCachePrefix);
    if (mDCachePrefix.isEmpty()) {
        // Read an ordinary list of files from the Configuration.
        // These may include directories and globs to expand.
        mInputPaths = Lists.map(Arrays.asList(conf.getStrings(CONF_PATHS_KEY, new String[0])),
                new Lists.Func<String, Path>() {
                    @Override
                    public Path eval(String in) {
                        LOG.debug("File input: " + in);
                        return new Path(in);
                    }
                });
    } else {
        // Use the dcache prefix to get the names of the files for this store.
        // The symlinks are already present in the working dir of the task.
        final FileSystem localFs = FileSystem.getLocal(conf.getDelegate());
        FileStatus[] statuses = localFs.globStatus(new Path(mDCachePrefix + "-*"));
        if (null == statuses || statuses.length == 0) {
            throw new IOException("No files associated with the job in the DistributedCache");
        }

        // Get the (absolute) input file paths to use.
        mInputPaths = Lists.map(Arrays.asList(statuses), new Lists.Func<FileStatus, Path>() {
            @Override
            public Path eval(FileStatus status) {
                Path out = status.getPath().makeQualified(localFs);
                LOG.debug("Loaded from DistributedCache: " + out);
                return out;
            }
        });
    }

    // If we are initializing a client-side instance to later serialize, the user may have
    // specified HDFS files, but also an intent to put the files in the DistributedCache. Set
    // this flag now, which will generate mDCachePrefix when addToConfiguration() is called
    // later.
    mUseDCache = conf.getBoolean(CONF_USE_DCACHE_KEY, USE_DCACHE_DEFAULT);
}

From source file:com.revolutionanalytics.hadoop.hdfs.DelayedExceptionThrowing.java

License:Apache License

final void globAndProcess(Path srcPattern, FileSystem srcFs) throws IOException {
    ArrayList<IOException> exceptions = new ArrayList<IOException>();
    for (Path p : FileUtil.stat2Paths(srcFs.globStatus(srcPattern), srcPattern))
        try {
            process(p, srcFs);
        } catch (IOException ioe) {
            exceptions.add(ioe);
        }

    if (!exceptions.isEmpty())
        if (exceptions.size() == 1)
            throw exceptions.get(0);
        else
            throw new IOException("Multiple IOExceptions: " + exceptions);
}

From source file:com.revolutionanalytics.hadoop.hdfs.FileUtils.java

License:Apache License

private static void ls__(FileSystem srcFS, String path, ArrayList<String> lsco, boolean dorecurse)
        throws IOException, FileNotFoundException {
    Path spath = new Path(path);
    FileStatus[] srcs;
    srcs = srcFS.globStatus(spath);
    if (srcs == null || srcs.length == 0) {
        throw new FileNotFoundException("Cannot access " + path + ": No such file or directory.");
    }
    if (srcs.length == 1 && srcs[0].isDir())
        srcs = srcFS.listStatus(srcs[0].getPath());
    Calendar c = Calendar.getInstance();
    for (FileStatus status : srcs) {
        StringBuilder sb = new StringBuilder();
        boolean idir = status.isDir();
        String x = idir ? "d" : "-";
        if (dorecurse && idir)
            ls__(srcFS, status.getPath().toUri().getPath(), lsco, dorecurse);
        else {
            sb.append(x);
            sb.append(status.getPermission().toString());
            sb.append(fsep);

            sb.append(status.getOwner());
            sb.append(fsep);

            sb.append(status.getGroup());
            sb.append(fsep);

            sb.append(status.getLen());
            sb.append(fsep);

            Date d = new Date(status.getModificationTime());
            sb.append(formatter.format(d));
            sb.append(fsep);

            sb.append(status.getPath().toUri().getPath());
            lsco.add(sb.toString());
        }
    }
}