Example usage for org.apache.hadoop.fs FileUtil stat2Paths

List of usage examples for org.apache.hadoop.fs FileUtil stat2Paths

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileUtil stat2Paths.

Prototype

public static Path[] stat2Paths(FileStatus[] stats) 

Document

Converts an array of FileStatus objects to an array of Path objects.
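
A minimal, self-contained sketch of the typical calling pattern, before the real-world examples below: list a directory with FileSystem.listStatus and flatten the resulting FileStatus[] into a Path[] with FileUtil.stat2Paths. The class name and directory argument are illustrative assumptions, not taken from any of the projects listed under Usage.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class Stat2PathsSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Directory to list; "/tmp" is only a placeholder for this sketch.
        Path dir = new Path(args.length > 0 ? args[0] : "/tmp");
        FileSystem fs = dir.getFileSystem(conf);

        // listStatus returns FileStatus[]; stat2Paths keeps only the paths.
        FileStatus[] stats = fs.listStatus(dir);
        Path[] paths = FileUtil.stat2Paths(stats);

        for (Path p : paths) {
            System.out.println(p);
        }
    }
}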

Usage

From source file: com.cloudera.training.metrics.JobHistoryHelper.java

License: Apache License

public static JobHistory.JobInfo getJobInfoFromHdfsOutputDir(String outputDir, Configuration conf)
        throws IOException {
    Path output = new Path(outputDir);
    Path historyLogDir = new Path(output, "_logs/history");
    FileSystem fs = output.getFileSystem(conf);
    if (!fs.exists(historyLogDir)) {
        throw new IOException("History directory " + historyLogDir.toString() + " does not exist");
    }
    Path[] jobFiles = FileUtil.stat2Paths(fs.listStatus(historyLogDir, jobLogFileFilter));
    if (jobFiles.length == 0) {
        throw new IOException("Not a valid history directory " + historyLogDir.toString());
    }
    String[] jobDetails = JobHistory.JobInfo.decodeJobHistoryFileName(jobFiles[0].getName()).split("_");
    String jobId = jobDetails[2] + "_" + jobDetails[3] + "_" + jobDetails[4];
    JobHistory.JobInfo job = new JobHistory.JobInfo(jobId);
    DefaultJobHistoryParser.parseJobTasks(jobFiles[0].toString(), job, fs);
    return job;
}

From source file: com.hdfs.concat.crush.integration.CrushMapReduceTest.java

License: Apache License

@Override
@After
public void tearDown() throws Exception {
    /*
     * Make sure we have no scratch data left over.
     */
    FileStatus[] tmpFiles = getFileSystem().listStatus(new Path("tmp"));

    if (tmpFiles != null) {
        assertThat(Arrays.toString(FileUtil.stat2Paths(tmpFiles)), tmpFiles.length, equalTo(0));
    }

    super.tearDown();

    /*
     * Let the various services shut down so TemporaryFolder can delete things.
     */
    Thread.sleep(2000);
}

From source file: com.hp.hpit.cs.MyCombineFileInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {

    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setXxxSplitSize() take precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack " + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be smaller than minimum split size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation, so we do it
    // once up front rather than repeating it for every pool in the loop below.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        Path p = new Path(paths[i].toUri());
        newpaths.add(p);
    }
    paths = null;

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        // create splits for all files in this pool.
        getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack,
                splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(job, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}

From source file: com.mediav.hadoop.MVHistoryViewer.java

License: Apache License

public void printJobInfoLoop() throws IOException {

    Path[] jobFiles = FileUtil.stat2Paths(fs.listStatus(historyLogDir, jobLogFileFilter));
    for (int i = 0; i < jobFiles.length; i++) {

        if (fs.getFileStatus(jobFiles[i]).getLen() == 0) {
            continue;
        } else {
            String[] jobDetails = JobHistory.JobInfo.decodeJobHistoryFileName(jobFiles[i].getName()).split("_");

            String jobId = jobDetails[0] + "_" + jobDetails[1] + "_" + jobDetails[2];
            JobHistory.JobInfo job = new JobHistory.JobInfo(jobId);
            MVJobHistoryParser.parseJobTasks(jobFiles[i].toString(), job, fs);
            toJson(job);
        }
    }

}

From source file: com.netease.news.utils.SequenceFileDumper.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("substring", "b", "The number of chars to print out per value", false);
    addOption(buildOption("count", "c", "Report the count only", false, false, null));
    addOption("numItems", "n", "Output at most <n> key value pairs", false);
    addOption(
            buildOption("facets", "fa", "Output the counts per key.  Note, if there are a lot of unique keys, "
                    + "this can take up a fair amount of memory", false, false, null));
    addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    Path input = getInputPath();
    FileSystem fs = input.getFileSystem(conf);
    if (fs.getFileStatus(input).isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
    } else {
        pathArr = new Path[1];
        pathArr[0] = input;
    }

    Writer writer;
    boolean shouldClose;
    if (hasOption("output")) {
        shouldClose = true;
        writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        for (Path path : pathArr) {
            if (!hasOption("quiet")) {
                writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
            }

            int sub = Integer.MAX_VALUE;
            if (hasOption("substring")) {
                sub = Integer.parseInt(getOption("substring"));
            }
            boolean countOnly = hasOption("count");
            SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true,
                    conf);
            if (!hasOption("quiet")) {
                writer.append("Key class: ").append(iterator.getKeyClass().toString());
                writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
            }
            OpenObjectIntHashMap<String> facets = null;
            if (hasOption("facets")) {
                facets = new OpenObjectIntHashMap<String>();
            }
            long count = 0;
            if (countOnly) {
                while (iterator.hasNext()) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); //either insert or add 1
                    }
                    count++;
                }
                writer.append("Count: ").append(String.valueOf(count)).append('\n');
            } else {
                long numItems = Long.MAX_VALUE;
                if (hasOption("numItems")) {
                    numItems = Long.parseLong(getOption("numItems"));
                    if (!hasOption("quiet")) {
                        writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
                    }
                }
                while (iterator.hasNext() && count < numItems) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    writer.append("Key: ").append(key);
                    String str = record.getSecond().toString();
                    writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
                    writer.write('\n');
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); //either insert or add 1
                    }
                    count++;
                }
                if (!hasOption("quiet")) {
                    writer.append("Count: ").append(String.valueOf(count)).append('\n');
                }
            }
            if (facets != null) {
                List<String> keyList = Lists.newArrayListWithCapacity(facets.size());

                IntArrayList valueList = new IntArrayList(facets.size());
                facets.pairsSortedByKey(keyList, valueList);
                writer.append("-----Facets---\n");
                writer.append("Key\t\tCount\n");
                int i = 0;
                for (String key : keyList) {
                    writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
                }
            }
        }
        writer.flush();

    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }

    return 0;
}

From source file: com.skp.experiment.common.mapreduce.MapFileOutputFormat.java

License: Apache License

/** Open the output generated by this format. */
public static MapFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException {
    FileSystem fs = dir.getFileSystem(conf);
    Path[] names = FileUtil.stat2Paths(fs.listStatus(dir));

    // sort names, so that hash partitioning works
    Arrays.sort(names);

    MapFile.Reader[] parts = new MapFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
    }
    return parts;
}
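
The sort matters because readers[i] then lines up with reduce partition i. Below is a hedged sketch of how such readers are typically consulted, assuming Hadoop's HashPartitioner and Text keys and values; the lookup method itself is illustrative and not part of the project above.

// Assumed imports: org.apache.hadoop.io.MapFile, org.apache.hadoop.io.Text, org.apache.hadoop.io.Writable,
// org.apache.hadoop.mapreduce.Partitioner, org.apache.hadoop.mapreduce.lib.partition.HashPartitioner.
public static Writable lookup(Path outputDir, Configuration conf, Text key) throws IOException {
    MapFile.Reader[] readers = getReaders(outputDir, conf);
    // Route the key with the same partitioner the job used, then query only that part file.
    Partitioner<Text, Text> partitioner = new HashPartitioner<Text, Text>();
    Text value = new Text();
    MapFile.Reader reader = readers[partitioner.getPartition(key, value, readers.length)];
    return reader.get(key, value); // returns null if the key is absent
}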

From source file: com.transwarp.hbase.bulkload.combine.remote.CombineRemoteFileInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {

    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setXxxSplitSize() take precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapred.max.split.size", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack " + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be smaller than minimum split size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation, so we do it
    // once up front rather than repeating it for every pool in the loop below.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        //Path p = new Path(paths[i].toUri().getPath());
        Path p = paths[i];
        newpaths.add(p);
    }
    paths = null;

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        // create splits for all files in this pool.
        getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack,
                splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(job, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}

From source file: com.twitter.algebra.matrix.format.MapDir.java

License: Apache License

/**
 * [Re-]Initializes the MapDir with a new directory of MapFiles. {@link #close()}
 * must be called first if the MapDir is already initialized.
 *
 * @param dir directory containing the MapFiles
 * @param conf the Hadoop configuration
 * @return a MapFile.Reader for each MapFile in the directory
 * @throws IOException
 */
@SuppressWarnings("deprecation")
public MapFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException {
    FileSystem fs = dir.getFileSystem(conf);
    Path[] names = FileUtil.stat2Paths(fs.listStatus(dir, mapFilter));

    partitioner = new TitlePartitioner(names);

    MapFile.Reader[] parts = new MapFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
    }
    return parts;
}

From source file: com.twitter.elephanttwin.util.HdfsUtils.java

License: Apache License

public static Iterable<Path> getSubdirectories(final boolean recursive, final String baseDirectory,
        final FileSystem hdfs) throws IOException {

    FileStatus[] fileStat;
    Path basePath = new Path(baseDirectory);
    if (!hdfs.exists(basePath)) {
        throw new IOException(
                hdfs.getWorkingDirectory() + baseDirectory + " does not exist, cannot getSubdirectories");
    }
    FileStatus status = hdfs.getFileStatus(basePath);
    if (!status.isDir()) {
        LOG.warning("tried to find subdirectories of " + status.getPath() + ", but it is a file");
        return Lists.newArrayList(status.getPath());
    }
    // get the stat on all files in the source directory
    fileStat = hdfs.listStatus(basePath);

    if (fileStat == null) {
        throw new IOException(
                "FileSystem.listStatus(" + basePath + ") returned null, cannot getSubdirectories");
    }

    // get paths to the files in the source directory
    return Arrays.asList(FileUtil.stat2Paths(fileStat));
}

From source file: com.twitter.hraven.mapreduce.CombineFileInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {

    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setXxxSplitSize() take precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapred.max.split.size", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack " + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be smaller than minimum split size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation, so we do it
    // once up front rather than repeating it for every pool in the loop below.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        Path p = new Path(paths[i].toUri().getPath());
        newpaths.add(p);
    }
    paths = null;

    System.out.println("Getting splits for: " + newpaths.size() + " paths.");

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        System.out.println("Getting splits for a pool");

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        System.out.println("Getting splits. myPaths size: " + myPaths.size());
        // create splits for all files in this pool.
        getMoreSplits(conf, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack,
                splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(conf, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}