List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration
public Configuration getConfiguration();
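All of the examples below follow the same pattern: getConfiguration() hands back the job's Configuration, and typed getters read values from it. A minimal sketch of that pattern (the key "example.sample.size" is a made-up placeholder, not a real Hadoop property):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;

public class ConfigurationAccessExample {
    // Reads a typed value from the job's Configuration, falling back to a
    // default of 10; "example.sample.size" is a hypothetical key.
    static int sampleSize(JobContext context) {
        Configuration conf = context.getConfiguration();
        return conf.getInt("example.sample.size", 10);
    }
}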
From source file:com.hortonworks.pso.data.generator.mapreduce.DataGenInputFormat.java
License:Apache License
/**
 * Create the desired number of splits, dividing the number of rows
 * between the mappers.
 */
public List<InputSplit> getSplits(JobContext job) {
    long totalRows = getNumberOfRows(job);
    int numSplits = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
    LOG.info("Generating " + totalRows + " rows across " + numSplits + " splits");
    List<InputSplit> splits = new ArrayList<InputSplit>();
    long currentRow = 0;
    for (int split = 0; split < numSplits; ++split) {
        long goal = (long) Math.ceil(totalRows * (double) (split + 1) / numSplits);
        splits.add(new DataGenInputSplit(currentRow, goal - currentRow));
        currentRow = goal;
    }
    return splits;
}
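The split count above is read straight from the configuration, so a driver can steer it before submission. A minimal sketch of the driver side (the mapper count of 8 is illustrative):

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;

public class DataGenDriverSketch {
    // Requests eight mappers; DataGenInputFormat divides its rows evenly
    // across them via the NUM_MAPS value it reads in getSplits().
    static void configure(Job job) {
        job.getConfiguration().setInt(MRJobConfig.NUM_MAPS, 8);
    }
}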
From source file:com.hp.hpit.cs.MyCombineFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // The values specified by setXxxSplitSize() take precedence over the
    // values that might have been specified in the config.
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack " + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than minimum split size per rack " + minSizeRack);
    }

    // All the files in the input set.
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it first, otherwise we will incur doing it multiple
    // times, one time each for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        Path p = new Path(paths[i].toUri());
        newpaths.add(p);
    }
    paths = null;

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        // Pick one input path. If it matches all the filters in a pool,
        // add it to the output set.
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        // Create splits for all files in this pool.
        getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode,
                minSizeRack, splits);
    }

    // Create splits for all files that are not in any pool.
    getMoreSplits(job, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode,
            minSizeRack, splits);

    // Free up the rackToNodes map.
    rackToNodes.clear();
    return splits;
}
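The three thresholds read above are usually supplied from the driver. A hedged sketch using the standard CombineFileInputFormat property names (the byte values are illustrative only):

import org.apache.hadoop.conf.Configuration;

public class CombineSplitTuningSketch {
    // Caps combined splits at 256 MB and requires at least 64 MB per node
    // and 128 MB per rack before a split is emitted.
    static void tune(Configuration conf) {
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 256L * 1024 * 1024);
        conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 64L * 1024 * 1024);
        conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 128L * 1024 * 1024);
    }
}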
From source file:com.hp.hpit.cs.MyCombineFileInputFormat.java
License:Apache License
/**
 * Return all the splits in the specified set of paths.
 */
@SuppressWarnings("unused")
private void getMoreSplits(JobContext job, Path[] paths, long maxSize, long minSizeNode,
        long minSizeRack, List<InputSplit> splits) throws IOException {
    Configuration conf = job.getConfiguration();

    // All blocks for all the files in the input set.
    OneFileInfo[] files;

    // Mapping from a rack name to the list of blocks it has.
    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();

    // Mapping from a block to the nodes on which it has replicas.
    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();

    // Mapping from a node to the list of blocks that it contains.
    HashMap<String, List<OneBlockInfo>> nodeToBlocks = new HashMap<String, List<OneBlockInfo>>();

    files = new OneFileInfo[paths.length];
    if (paths.length == 0) {
        return;
    }

    // Populate all the blocks for all files.
    long totLength = 0;
    for (int i = 0; i < paths.length; i++) {
        files[i] = new OneFileInfo(paths[i], conf, isSplitable(job, paths[i]), rackToBlocks,
                blockToNodes, nodeToBlocks, rackToNodes, maxSize);
        totLength += files[i].getLength();
    }

    ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
    Set<String> nodes = new HashSet<String>();
    long curSplitSize = 0;

    // Process all nodes and create splits that are local to a node.
    for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = nodeToBlocks.entrySet()
            .iterator(); iter.hasNext();) {
        Map.Entry<String, List<OneBlockInfo>> one = iter.next();
        nodes.add(one.getKey());
        List<OneBlockInfo> blocksInNode = one.getValue();

        // For each block, copy it into validBlocks. Delete it from
        // blockToNodes so that the same block does not appear in
        // two different splits.
        for (OneBlockInfo oneblock : blocksInNode) {
            if (blockToNodes.containsKey(oneblock)) {
                validBlocks.add(oneblock);
                blockToNodes.remove(oneblock);
                curSplitSize += oneblock.length;

                // If the accumulated split size exceeds the maximum, then
                // create this split.
                if (maxSize != 0 && curSplitSize >= maxSize) {
                    // Create an input split and add it to the splits array.
                    addCreatedSplit(splits, nodes, validBlocks);
                    curSplitSize = 0;
                    validBlocks.clear();
                }
            }
        }
        // If there were any blocks left over and their combined size is
        // larger than minSplitNode, then combine them into one split.
        // Otherwise add them back to the unprocessed pool. It is likely
        // that they will be combined with other blocks from the
        // same rack later on.
        if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
            // Create an input split and add it to the splits array.
            addCreatedSplit(splits, nodes, validBlocks);
        } else {
            for (OneBlockInfo oneblock : validBlocks) {
                blockToNodes.put(oneblock, oneblock.hosts);
            }
        }
        validBlocks.clear();
        nodes.clear();
        curSplitSize = 0;
    }

    // If blocks in a rack are below the specified minimum size, then keep them
    // in 'overflow'. After the processing of all racks is complete, these
    // overflow blocks will be combined into splits.
    ArrayList<OneBlockInfo> overflowBlocks = new ArrayList<OneBlockInfo>();
    Set<String> racks = new HashSet<String>();

    // Process all racks over and over again until there is no more work to do.
    while (blockToNodes.size() > 0) {

        // Create one split for this rack before moving over to the next rack.
        // Come back to this rack after creating a single split for each of the
        // remaining racks.
        // Process one rack location at a time: combine all possible blocks that
        // reside on this rack as one split (constrained by minimum and maximum
        // split size).

        // Iterate over all racks.
        for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = rackToBlocks.entrySet()
                .iterator(); iter.hasNext();) {
            Map.Entry<String, List<OneBlockInfo>> one = iter.next();
            racks.add(one.getKey());
            List<OneBlockInfo> blocks = one.getValue();

            // For each block, copy it into validBlocks. Delete it from
            // blockToNodes so that the same block does not appear in
            // two different splits.
            boolean createdSplit = false;
            for (OneBlockInfo oneblock : blocks) {
                if (blockToNodes.containsKey(oneblock)) {
                    validBlocks.add(oneblock);
                    blockToNodes.remove(oneblock);
                    curSplitSize += oneblock.length;

                    // If the accumulated split size exceeds the maximum, then
                    // create this split.
                    if (maxSize != 0 && curSplitSize >= maxSize) {
                        // Create an input split and add it to the splits array.
                        addCreatedSplit(splits, getHosts(racks), validBlocks);
                        createdSplit = true;
                        break;
                    }
                }
            }

            // If we created a split, then just go to the next rack.
            if (createdSplit) {
                curSplitSize = 0;
                validBlocks.clear();
                racks.clear();
                continue;
            }

            if (!validBlocks.isEmpty()) {
                if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
                    // If there is a minimum size specified, then create a single split;
                    // otherwise, store these blocks in the overflow data structure.
                    addCreatedSplit(splits, getHosts(racks), validBlocks);
                } else {
                    // There were a few blocks in this rack that remained to be
                    // processed. Keep them in the 'overflow' block list.
                    // These will be combined later.
                    overflowBlocks.addAll(validBlocks);
                }
            }
            curSplitSize = 0;
            validBlocks.clear();
            racks.clear();
        }
    }

    assert blockToNodes.isEmpty();
    assert curSplitSize == 0;
    assert validBlocks.isEmpty();
    assert racks.isEmpty();

    // Process all overflow blocks.
    for (OneBlockInfo oneblock : overflowBlocks) {
        validBlocks.add(oneblock);
        curSplitSize += oneblock.length;

        // This might cause an existing rack location to be re-added,
        // but it should be ok.
        for (int i = 0; i < oneblock.racks.length; i++) {
            racks.add(oneblock.racks[i]);
        }

        // If the accumulated split size exceeds the maximum, then
        // create this split.
        if (maxSize != 0 && curSplitSize >= maxSize) {
            // Create an input split and add it to the splits array.
            addCreatedSplit(splits, getHosts(racks), validBlocks);
            curSplitSize = 0;
            validBlocks.clear();
            racks.clear();
        }
    }

    // Process any remaining blocks.
    if (!validBlocks.isEmpty()) {
        addCreatedSplit(splits, getHosts(racks), validBlocks);
    }
}
From source file:com.ikanow.aleph2.analytics.hadoop.assets.BeFileInputFormat.java
License:Open Source License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");

    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);
                // Dump 5x the requested number of splits into one mega split,
                // to strike a balance between limiting the data and making sure
                // that enough records are generated for tests.
                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths()))
                                .limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                .limit(5L * requested_records)
                                .<Long>toArray(size -> new Long[size]), 0L),
                        ArrayUtils.toPrimitive(tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                .limit(5L * requested_records)
                                .<Long>toArray(size -> new Long[size]), 0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records)
                                .<String>toArray(size -> new String[size]));

                return Arrays.<InputSplit>asList(combined);
            } else {
                return tmp;
            }
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;
    } catch (Throwable t) {
        logger.error(t);
        throw new IOException(t);
    }
}
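The debug cap is driven purely by a configuration key, so a test harness can turn it on before submitting the job. A minimal sketch, assuming BatchEnrichmentJob.BE_DEBUG_MAX_SIZE (the Aleph2 class referenced above) is importable and that the value "100" is illustrative:

import org.apache.hadoop.mapreduce.Job;

public class DebugCapSketch {
    // Caps a test run: BeFileInputFormat will then collapse at most
    // 5 * 100 file chunks into a single CombineFileSplit.
    static void enableDebugCap(Job job) {
        job.getConfiguration().set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, "100");
    }
}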
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedCombineFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // The values specified by setXxxSplitSize() take precedence over the
    // values that might have been specified in the config.
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
        // If maxSize is not configured, a single split will be generated per node.
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack " + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than minimum split size per rack " + minSizeRack);
    }

    // All the files in the input set.
    List<FileStatus> stats = listStatus(job);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (stats.size() == 0) {
        return splits;
    }

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<FileStatus> myPaths = new ArrayList<FileStatus>();

        // Pick one input path. If it matches all the filters in a pool,
        // add it to the output set.
        for (Iterator<FileStatus> iter = stats.iterator(); iter.hasNext();) {
            FileStatus p = iter.next();
            if (onepool.accept(p.getPath())) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        // Create splits for all files in this pool.
        getMoreSplits(job, myPaths, maxSize, minSizeNode, minSizeRack, splits);
    }

    // Create splits for all files that are not in any pool.
    getMoreSplits(job, stats, maxSize, minSizeNode, minSizeRack, splits);

    // Free up the rackToNodes map.
    rackToNodes.clear();
    return splits;
}
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedCombineFileInputFormat.java
License:Apache License
/**
 * Return all the splits in the specified set of paths.
 */
private void getMoreSplits(JobContext job, List<FileStatus> stats, long maxSize, long minSizeNode,
        long minSizeRack, List<InputSplit> splits) throws IOException {
    Configuration conf = job.getConfiguration();

    // All blocks for all the files in the input set.
    OneFileInfo[] files;

    // Mapping from a rack name to the list of blocks it has.
    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();

    // Mapping from a block to the nodes on which it has replicas.
    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();

    // Mapping from a node to the set of blocks that it contains.
    HashMap<String, Set<OneBlockInfo>> nodeToBlocks = new HashMap<String, Set<OneBlockInfo>>();

    files = new OneFileInfo[stats.size()];
    if (stats.size() == 0) {
        return;
    }

    // Populate all the blocks for all files.
    long totLength = 0;
    int i = 0;
    for (FileStatus stat : stats) {
        files[i] = new OneFileInfo(stat, conf, isSplitable(job, stat.getPath()), rackToBlocks,
                blockToNodes, nodeToBlocks, rackToNodes, maxSize);
        totLength += files[i].getLength();
        i++; // advance to the next slot in the files array
    }

    createSplits(nodeToBlocks, blockToNodes, rackToBlocks, totLength, maxSize, minSizeNode,
            minSizeRack, splits);
}
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java
License:Apache License
/**
 * @param job the job to look at.
 * @return whether the input files should be read recursively
 */
public static boolean getInputDirRecursive(JobContext job) {
    return job.getConfiguration().getBoolean(INPUT_DIR_RECURSIVE, false);
}
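On the driver side the flag is normally set through the matching helper; a minimal sketch (INPUT_DIR_RECURSIVE corresponds to the standard property mapreduce.input.fileinputformat.input.dir.recursive):

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class RecursiveInputSketch {
    // Asks the input format to descend into subdirectories when listing input.
    static void enableRecursion(Job job) {
        FileInputFormat.setInputDirRecursive(job, true);
    }
}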
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java
License:Apache License
/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Get tokens for all the required FileSystems.
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // Whether we need to look recursively into the directory structure.
    boolean recursive = getInputDirRecursive(job);

    // Creates a MultiPathFilter with the hiddenFileFilter and the
    // user-provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    List<FileStatus> result = null;

    int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS,
            DEFAULT_LIST_STATUS_NUM_THREADS);
    Stopwatch sw = Stopwatch.createStarted();
    if (numThreads == 1) {
        result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
    } else {
        Iterable<FileStatus> locatedFiles = null;
        try {
            LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(
                    job.getConfiguration(), dirs, recursive, inputFilter, true);
            locatedFiles = locatedFileStatusFetcher.getFileStatuses();
        } catch (InterruptedException e) {
            throw new IOException("Interrupted while getting file statuses");
        }
        result = Lists.newArrayList(locatedFiles);
    }
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Time taken to get FileStatuses: " + sw.elapsed(TimeUnit.MILLISECONDS));
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}
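Parallel listing is opt-in via the same configuration. A minimal sketch using the standard property behind LIST_STATUS_NUM_THREADS (the thread count of 8 is illustrative):

import org.apache.hadoop.conf.Configuration;

public class ParallelListingSketch {
    // With more than one thread, listStatus() switches from the
    // single-threaded scan to the LocatedFileStatusFetcher path above.
    static void useParallelListing(Configuration conf) {
        conf.setInt("mapreduce.input.fileinputformat.list-status.num-threads", 8);
    }
}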
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java
License:Apache License
private List<FileStatus> singleThreadedListStatus(JobContext job, Path[] dirs,
        PathFilter inputFilter, boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
                    while (iter.hasNext()) {
                        LocatedFileStatus stat = iter.next();
                        if (inputFilter.accept(stat.getPath())) {
                            if (recursive && stat.isDirectory()) {
                                addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                            } else {
                                result.add(stat);
                            }
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }
    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = Stopwatch.createStarted();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // Generate splits.
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(),
                            blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(),
                            blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            // Create an empty hosts array for zero-length files.
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen.
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size()
                + ", TimeTaken: " + sw.elapsed(TimeUnit.MILLISECONDS));
    }
    return splits;
}
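The minSize/maxSize pair feeding computeSplitSize() normally comes from the driver. A minimal sketch using the standard FileInputFormat helpers (the sizes are illustrative):

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitSizeTuningSketch {
    // Bounds each FileSplit between 32 MB and 256 MB; computeSplitSize()
    // clamps the file's block size into this range.
    static void tune(Job job) {
        FileInputFormat.setMinInputSplitSize(job, 32L * 1024 * 1024);
        FileInputFormat.setMaxInputSplitSize(job, 256L * 1024 * 1024);
    }
}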