List of usage examples for org.apache.hadoop.mapreduce.lib.input.CombineFileSplit
public CombineFileSplit(Path[] files, long[] start, long[] lengths, String[] locations)
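Before the per-project examples, here is a minimal self-contained sketch (not taken from the sources below) of how the four parallel arrays map onto a split and how the per-chunk accessors read them back; the paths and host names are made up for illustration:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CombineFileSplitSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical input: two file chunks packed into a single split.
        Path[] files = { new Path("/data/part-00000"), new Path("/data/part-00001") };
        long[] starts = { 0L, 0L };            // byte offset into each file
        long[] lengths = { 1024L, 2048L };     // bytes to read from each file
        String[] hosts = { "host1", "host2" }; // preferred hosts (illustrative)

        CombineFileSplit split = new CombineFileSplit(files, starts, lengths, hosts);

        // getLength() with no argument is the sum of the per-chunk lengths.
        System.out.println("total length = " + split.getLength()); // 3072
        for (int i = 0; i < split.getNumPaths(); i++) {
            System.out.println(split.getPath(i) + " from offset " + split.getOffset(i)
                    + ", " + split.getLength(i) + " bytes");
        }
    }
}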
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapreduce.HDFSSplitIterator.java
License:Apache License
public HDFSSplitIterator(FileSystem fs, Path[] paths, long[] offsets, long[] lengths, long startTime,
        long endTime) throws IOException {
    this.fs = fs;
    // The split is used here only as a local container for the parallel
    // path/offset/length arrays, so no locations are supplied.
    this.split = new CombineFileSplit(paths, offsets, lengths, null);
    // Skip any hoplog files that have already been cleaned up by the janitor.
    while (currentHopIndex < split.getNumPaths() && !fs.exists(split.getPath(currentHopIndex))) {
        logger.warn(LocalizedMessage.create(LocalizedStrings.HOPLOG_CLEANED_UP_BY_JANITOR,
                split.getPath(currentHopIndex)));
        currentHopIndex++;
    }
    if (currentHopIndex == split.getNumPaths()) {
        this.hoplog = null;
        iterator = null;
    } else {
        this.hoplog = getHoplog(fs, split.getPath(currentHopIndex));
        iterator = hoplog.getReader().scan(split.getOffset(currentHopIndex), split.getLength(currentHopIndex));
    }
    this.startTime = startTime;
    this.endTime = endTime;
}
From source file:com.hp.hpit.cs.MyCombineFileInputFormat.java
License:Apache License
/**
 * Create a single split from the list of blocks specified in validBlocks.
 * Add this new split into splitList.
 */
private void addCreatedSplit(List<InputSplit> splitList, Collection<String> locations,
        ArrayList<OneBlockInfo> validBlocks) {
    // create an input split
    Path[] fl = new Path[validBlocks.size()];
    long[] offset = new long[validBlocks.size()];
    long[] length = new long[validBlocks.size()];
    for (int i = 0; i < validBlocks.size(); i++) {
        fl[i] = validBlocks.get(i).onepath;
        offset[i] = validBlocks.get(i).offset;
        length[i] = validBlocks.get(i).length;
    }
    // add this split to the list that is returned
    CombineFileSplit thissplit = new CombineFileSplit(fl, offset, length, locations.toArray(new String[0]));
    splitList.add(thissplit);
}
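For context, a split built by an addCreatedSplit-style method is conventionally consumed through CombineFileRecordReader, which instantiates one wrapped reader per (path, offset, length) chunk. Below is a sketch assuming plain text input and a Hadoop 2.x-era API; MyCombineTextInputFormat and TextChunkReader are illustrative names, while the Hadoop classes are real:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReaderWrapper;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class MyCombineTextInputFormat extends CombineFileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException {
        // CombineFileRecordReader walks the split's chunks and builds one
        // TextChunkReader per chunk via the (split, context, index) constructor.
        return new CombineFileRecordReader<>((CombineFileSplit) split, context, TextChunkReader.class);
    }

    // Delegates each chunk of the combined split to a plain text line reader.
    public static class TextChunkReader extends CombineFileRecordReaderWrapper<LongWritable, Text> {
        public TextChunkReader(CombineFileSplit split, TaskAttemptContext context, Integer index)
                throws IOException, InterruptedException {
            super(new TextInputFormat(), split, context, index);
        }
    }
}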
From source file:com.ikanow.aleph2.analytics.hadoop.assets.BeFileInputFormat.java
License:Open Source License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");
    super.setMaxSplitSize(MAX_SPLIT_SIZE);
    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);
            String debug_max_str = context.getConfiguration().get(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);
                // Dump 5x the requested number of records' worth of chunks into one mega
                // split, to strike a balance between limiting the data and making sure
                // that tests generate enough records.
                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths()))
                                .limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                .limit(5L * requested_records)
                                .<Long>toArray(size -> new Long[size]), 0L),
                        ArrayUtils.toPrimitive(tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                .limit(5L * requested_records)
                                .<Long>toArray(size -> new Long[size]), 0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records)
                                .<String>toArray(size -> new String[size]));
                return Arrays.<InputSplit>asList(combined);
            } else {
                return tmp;
            }
        }));
        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;
    } catch (Throwable t) {
        logger.error(t);
        throw new IOException(t);
    }
}
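The stream pipeline above is dense. For readers who want the mega-split trick in isolation, here is a minimal hand-rolled sketch of the same idea, merging several CombineFileSplits into one and capping the result at a chunk budget, using only loops and the accessors shown above; SplitMerging, mergeSplits, and maxChunks are illustrative names, not part of the source:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class SplitMerging {
    // Merge several CombineFileSplits into one "mega split", keeping at most maxChunks chunks.
    static CombineFileSplit mergeSplits(List<CombineFileSplit> splits, int maxChunks) throws IOException {
        List<Path> paths = new ArrayList<>();
        List<Long> offsets = new ArrayList<>();
        List<Long> lengths = new ArrayList<>();
        List<String> hosts = new ArrayList<>();
        for (CombineFileSplit s : splits) {
            for (int i = 0; i < s.getNumPaths() && paths.size() < maxChunks; i++) {
                paths.add(s.getPath(i));
                offsets.add(s.getOffset(i));
                lengths.add(s.getLength(i));
            }
            for (String h : s.getLocations()) { // getLocations() is declared to throw IOException
                if (hosts.size() < maxChunks) {
                    hosts.add(h);
                }
            }
        }
        // CombineFileSplit wants primitive long[] arrays, hence the unboxing step.
        return new CombineFileSplit(
                paths.toArray(new Path[0]),
                offsets.stream().mapToLong(Long::longValue).toArray(),
                lengths.stream().mapToLong(Long::longValue).toArray(),
                hosts.toArray(new String[0]));
    }
}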
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedCombineFileInputFormat.java
License:Apache License
/**
 * Create a single split from the list of blocks specified in validBlocks.
 * Add this new split into splitList.
 */
private void addCreatedSplit(List<InputSplit> splitList, Collection<String> locations,
        ArrayList<OneBlockInfo> validBlocks) {
    // create an input split
    Path[] fl = new Path[validBlocks.size()];
    long[] offset = new long[validBlocks.size()];
    long[] length = new long[validBlocks.size()];
    for (int i = 0; i < validBlocks.size(); i++) {
        fl[i] = validBlocks.get(i).onepath;
        offset[i] = validBlocks.get(i).offset;
        length[i] = validBlocks.get(i).length;
    }
    // add this split to the list that is returned
    CombineFileSplit thissplit = new CombineFileSplit(fl, offset, length, locations.toArray(new String[0]));
    splitList.add(thissplit);
}
From source file:com.ikanow.aleph2.analytics.r.assets.BeFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");
    super.setMaxSplitSize(MAX_SPLIT_SIZE);
    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);
            String debug_max_str = context.getConfiguration().get(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);
                // Dump 5x the requested number of records' worth of chunks into one mega
                // split, to strike a balance between limiting the data and making sure
                // that tests generate enough records.
                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths()))
                                .limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                .limit(5L * requested_records)
                                .<Long>toArray(size -> new Long[size]), 0L),
                        ArrayUtils.toPrimitive(tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                .limit(5L * requested_records)
                                .<Long>toArray(size -> new Long[size]), 0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records)
                                .<String>toArray(size -> new String[size]));
                return Arrays.<InputSplit>asList(combined);
            } else {
                return tmp;
            }
        }));
        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;
    } catch (Throwable t) {
        logger.error(ErrorUtils.getLongForm("Error getting splits, error = {0}", t));
        return Collections.emptyList();
    }
}
From source file:com.ikanow.aleph2.analytics.spark.assets.BeFileInputFormat_Pure.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");
    super.setMaxSplitSize(MAX_SPLIT_SIZE);
    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);
            String debug_max_str = context.getConfiguration().get(HadoopBatchEnrichmentUtils.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);
                // Dump 5x the requested number of records' worth of chunks into one mega
                // split, to strike a balance between limiting the data and making sure
                // that tests generate enough records.
                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths()))
                                .limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                .limit(5L * requested_records)
                                .<Long>toArray(size -> new Long[size]), 0L),
                        ArrayUtils.toPrimitive(tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                .limit(5L * requested_records)
                                .<Long>toArray(size -> new Long[size]), 0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records)
                                .<String>toArray(size -> new String[size]));
                return Arrays.<InputSplit>asList(combined);
            } else {
                return tmp;
            }
        }));
        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;
    } catch (Throwable t) {
        logger.error(ErrorUtils.getLongForm("Error getting splits, error = {0}", t));
        return Collections.emptyList();
    }
}
From source file:com.twitter.hraven.mapreduce.CombineFileInputFormat.java
License:Apache License
/**
 * Create a single split from the list of blocks specified in validBlocks.
 * Add this new split into splitList.
 */
private void addCreatedSplit(List<InputSplit> splitList, List<String> locations,
        ArrayList<OneBlockInfo> validBlocks) {
    // create an input split
    Path[] fl = new Path[validBlocks.size()];
    long[] offset = new long[validBlocks.size()];
    long[] length = new long[validBlocks.size()];
    for (int i = 0; i < validBlocks.size(); i++) {
        fl[i] = validBlocks.get(i).onepath;
        offset[i] = validBlocks.get(i).offset;
        length[i] = validBlocks.get(i).length;
    }
    // add this split to the list that is returned
    CombineFileSplit thissplit = new CombineFileSplit(fl, offset, length, locations.toArray(new String[0]));
    splitList.add(thissplit);
}
From source file:gobblin.compaction.mapreduce.avro.AvroKeyRecursiveCombineFileInputFormat.java
License:Apache License
/**
 * Truncate the list of locations in each split to SPLIT_MAX_NUM_LOCATIONS
 * if it is larger than SPLIT_MAX_NUM_LOCATIONS (MAPREDUCE-5186).
 */
private static List<InputSplit> cleanSplits(List<InputSplit> splits) throws IOException {
    if (VersionInfo.getVersion().compareTo("2.3.0") >= 0) {
        // The issue was fixed in Hadoop 2.3.0; on newer versions there is no need to clean up splits.
        return splits;
    }
    List<InputSplit> cleanedSplits = Lists.newArrayList();
    for (int i = 0; i < splits.size(); i++) {
        CombineFileSplit oldSplit = (CombineFileSplit) splits.get(i);
        String[] locations = oldSplit.getLocations();
        Preconditions.checkNotNull(locations, "CombineFileSplit.getLocations() returned null");
        if (locations.length > SPLIT_MAX_NUM_LOCATIONS) {
            locations = Arrays.copyOf(locations, SPLIT_MAX_NUM_LOCATIONS);
        }
        cleanedSplits.add(new CombineFileSplit(oldSplit.getPaths(), oldSplit.getStartOffsets(),
                oldSplit.getLengths(), locations));
    }
    return cleanedSplits;
}
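One caveat worth flagging: the version guard above compares version strings with String.compareTo, which is lexicographic, so for example "2.10.0" sorts before "2.3.0" and would be treated as older. A hedged sketch of a numeric, segment-by-segment comparison; compareVersions and parseIntSafe are illustrative helpers, not part of the source:

// Compare dotted version strings numerically, segment by segment (illustrative helper).
// Usage: if (compareVersions(VersionInfo.getVersion(), "2.3.0") >= 0) { ... }
static int compareVersions(String a, String b) {
    String[] as = a.split("[.-]");
    String[] bs = b.split("[.-]");
    for (int i = 0; i < Math.max(as.length, bs.length); i++) {
        int ai = i < as.length ? parseIntSafe(as[i]) : 0;
        int bi = i < bs.length ? parseIntSafe(bs[i]) : 0;
        if (ai != bi) {
            return Integer.compare(ai, bi);
        }
    }
    return 0;
}

// Non-numeric segments such as "SNAPSHOT" are treated as 0.
static int parseIntSafe(String s) {
    try {
        return Integer.parseInt(s);
    } catch (NumberFormatException e) {
        return 0;
    }
}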