List of usage examples for org.apache.hadoop.fs.FileUtil.stat2Paths
public static Path[] stat2Paths(FileStatus[] stats)
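stat2Paths converts an array of FileStatus entries, typically returned by FileSystem.listStatus(), into the corresponding array of Path objects. A minimal sketch of that pattern follows; the directory name "/tmp/input" and the class name are illustrative placeholders only, not taken from the examples below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class Stat2PathsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path("/tmp/input"); // hypothetical input directory
        FileSystem fs = dir.getFileSystem(conf);

        // listStatus returns FileStatus[]; stat2Paths extracts each entry's Path
        FileStatus[] stats = fs.listStatus(dir);
        Path[] paths = FileUtil.stat2Paths(stats);
        for (Path p : paths) {
            System.out.println(p);
        }
    }
}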
From source file:com.cloudera.training.metrics.JobHistoryHelper.java
License:Apache License
public static JobHistory.JobInfo getJobInfoFromHdfsOutputDir(String outputDir, Configuration conf)
        throws IOException {
    Path output = new Path(outputDir);
    Path historyLogDir = new Path(output, "_logs/history");
    FileSystem fs = output.getFileSystem(conf);
    if (!fs.exists(output)) {
        throw new IOException("History directory " + historyLogDir.toString() + " does not exist");
    }
    Path[] jobFiles = FileUtil.stat2Paths(fs.listStatus(historyLogDir, jobLogFileFilter));
    if (jobFiles.length == 0) {
        throw new IOException("Not a valid history directory " + historyLogDir.toString());
    }
    String[] jobDetails = JobHistory.JobInfo.decodeJobHistoryFileName(jobFiles[0].getName()).split("_");
    String jobId = jobDetails[2] + "_" + jobDetails[3] + "_" + jobDetails[4];
    JobHistory.JobInfo job = new JobHistory.JobInfo(jobId);
    DefaultJobHistoryParser.parseJobTasks(jobFiles[0].toString(), job, fs);
    return job;
}
From source file:com.hdfs.concat.crush.integration.CrushMapReduceTest.java
License:Apache License
@Override
@After
public void tearDown() throws Exception {
    /*
     * Make sure we have no scratch data left over.
     */
    FileStatus[] tmpFiles = getFileSystem().listStatus(new Path("tmp"));
    if (tmpFiles != null) {
        assertThat(Arrays.toString(FileUtil.stat2Paths(tmpFiles)), tmpFiles.length, equalTo(0));
    }

    super.tearDown();

    /*
     * Let the various services shut down so TemporaryFolder can delete things.
     */
    Thread.sleep(2000);
}
From source file:com.hp.hpit.cs.MyCombineFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() take precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size pernode " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack" + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node" + minSizeNode
                + " cannot be smaller than minimum split " + "size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it first, otherwise we will incur doing it multiple
    // times, one time each for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        Path p = new Path(paths[i].toUri());
        newpaths.add(p);
    }
    paths = null;

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        // create splits for all files in this pool.
        getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(job, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}
From source file:com.mediav.hadoop.MVHistoryViewer.java
License:Apache License
public void printJobInfoLoop() throws IOException {
    Path[] jobFiles = FileUtil.stat2Paths(fs.listStatus(historyLogDir, jobLogFileFilter));
    for (int i = 0; i < jobFiles.length; i++) {
        // skip empty history files
        if (fs.getFileStatus(jobFiles[i]).getLen() == 0) {
            continue;
        } else {
            String[] jobDetails = JobHistory.JobInfo.decodeJobHistoryFileName(jobFiles[i].getName()).split("_");
            String jobId = jobDetails[0] + "_" + jobDetails[1] + "_" + jobDetails[2];
            JobHistory.JobInfo job = new JobHistory.JobInfo(jobId);
            MVJobHistoryParser.parseJobTasks(jobFiles[i].toString(), job, fs);
            toJson(job);
        }
    }
}
From source file:com.netease.news.utils.SequenceFileDumper.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("substring", "b", "The number of chars to print out per value", false);
    addOption(buildOption("count", "c", "Report the count only", false, false, null));
    addOption("numItems", "n", "Output at most <n> key value pairs", false);
    addOption(buildOption("facets", "fa",
            "Output the counts per key. Note, if there are a lot of unique keys, "
                    + "this can take up a fair amount of memory",
            false, false, null));
    addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    Path input = getInputPath();
    FileSystem fs = input.getFileSystem(conf);
    if (fs.getFileStatus(input).isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
    } else {
        pathArr = new Path[1];
        pathArr[0] = input;
    }

    Writer writer;
    boolean shouldClose;
    if (hasOption("output")) {
        shouldClose = true;
        writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        for (Path path : pathArr) {
            if (!hasOption("quiet")) {
                writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
            }
            int sub = Integer.MAX_VALUE;
            if (hasOption("substring")) {
                sub = Integer.parseInt(getOption("substring"));
            }
            boolean countOnly = hasOption("count");
            SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true, conf);
            if (!hasOption("quiet")) {
                writer.append("Key class: ").append(iterator.getKeyClass().toString());
                writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
            }
            OpenObjectIntHashMap<String> facets = null;
            if (hasOption("facets")) {
                facets = new OpenObjectIntHashMap<String>();
            }
            long count = 0;
            if (countOnly) {
                while (iterator.hasNext()) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); // either insert or add 1
                    }
                    count++;
                }
                writer.append("Count: ").append(String.valueOf(count)).append('\n');
            } else {
                long numItems = Long.MAX_VALUE;
                if (hasOption("numItems")) {
                    numItems = Long.parseLong(getOption("numItems"));
                    if (!hasOption("quiet")) {
                        writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
                    }
                }
                while (iterator.hasNext() && count < numItems) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    writer.append("Key: ").append(key);
                    String str = record.getSecond().toString();
                    writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
                    writer.write('\n');
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); // either insert or add 1
                    }
                    count++;
                }
                if (!hasOption("quiet")) {
                    writer.append("Count: ").append(String.valueOf(count)).append('\n');
                }
            }
            if (facets != null) {
                List<String> keyList = Lists.newArrayListWithCapacity(facets.size());
                IntArrayList valueList = new IntArrayList(facets.size());
                facets.pairsSortedByKey(keyList, valueList);
                writer.append("-----Facets---\n");
                writer.append("Key\t\tCount\n");
                int i = 0;
                for (String key : keyList) {
                    writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
                }
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }
    return 0;
}
From source file:com.skp.experiment.common.mapreduce.MapFileOutputFormat.java
License:Apache License
/** Open the output generated by this format. */
public static MapFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException {
    FileSystem fs = dir.getFileSystem(conf);
    Path[] names = FileUtil.stat2Paths(fs.listStatus(dir));

    // sort names, so that hash partitioning works
    Arrays.sort(names);

    MapFile.Reader[] parts = new MapFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
    }
    return parts;
}
From source file:com.transwarp.hbase.bulkload.combine.remote.CombineRemoteFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() take precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapred.max.split.size", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size pernode " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack" + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node" + minSizeNode
                + " cannot be smaller than minimum split " + "size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it first, otherwise we will incur doing it multiple
    // times, one time each for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        //Path p = new Path(paths[i].toUri().getPath());
        Path p = paths[i];
        newpaths.add(p);
    }
    paths = null;

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        // create splits for all files in this pool.
        getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(job, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}
From source file:com.twitter.algebra.matrix.format.MapDir.java
License:Apache License
/**
 * [Re-]Initialize the MapDir with new directory of MapFiles. {@link #close()}
 * must be called before if MapDir is already initialized.
 *
 * @param dir
 * @param conf
 * @return
 * @throws IOException
 */
@SuppressWarnings("deprecation")
public MapFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException {
    FileSystem fs = dir.getFileSystem(conf);
    Path[] names = FileUtil.stat2Paths(fs.listStatus(dir, mapFilter));
    partitioner = new TitlePartitioner(names);

    MapFile.Reader[] parts = new MapFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
    }
    return parts;
}
From source file:com.twitter.elephanttwin.util.HdfsUtils.java
License:Apache License
public static Iterable<Path> getSubdirectories(final boolean recursive, final String baseDirectory,
        final FileSystem hdfs) throws IOException {
    FileStatus[] fileStat;
    Path basePath = new Path(baseDirectory);
    if (!hdfs.exists(basePath)) {
        throw new IOException(
                hdfs.getWorkingDirectory() + baseDirectory + " does not exist, cannot getSubdirectories");
    }
    FileStatus status = hdfs.getFileStatus(basePath);
    if (!status.isDir()) {
        LOG.warning("tried to find subdirectories of " + status.getPath() + ", but it is a file");
        return Lists.newArrayList(status.getPath());
    }

    // get the stat on all files in the source directory
    fileStat = hdfs.listStatus(basePath);
    if (fileStat == null) {
        throw new IOException(
                "FileSystem.listStatus(" + basePath + ") returned null, cannot getSubdirectories");
    }

    // get paths to the files in the source directory
    return Arrays.asList(FileUtil.stat2Paths(fileStat));
}
From source file:com.twitter.hraven.mapreduce.CombineFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() take precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapred.max.split.size", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size pernode " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack" + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node" + minSizeNode
                + " cannot be smaller than minimum split " + "size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it first, otherwise we will incur doing it multiple
    // times, one time each for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        Path p = new Path(paths[i].toUri().getPath());
        newpaths.add(p);
    }
    paths = null;

    System.out.println("Getting splits for: " + newpaths.size() + " paths.");

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        System.out.println("Getting splits for a pool");

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }

        System.out.println("Getting splits. myPaths size: " + myPaths.size());

        // create splits for all files in this pool.
        getMoreSplits(conf, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(conf, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}