List of usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
public Configuration getConfiguration();
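Before the individual examples, here is a minimal sketch of the pattern they all share: an InputFormat (or any code holding a JobContext) reads job-level settings through getConfiguration(). The class MyInputFormat and the property name example.max.records are hypothetical and exist only for illustration.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Hypothetical InputFormat that reads a tuning knob from the job configuration.
public class MyInputFormat extends TextInputFormat {
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        Configuration conf = context.getConfiguration();
        // "example.max.records" is an illustrative property name, not a real Hadoop setting.
        long maxRecords = conf.getLong("example.max.records", Long.MAX_VALUE);
        System.out.println("max records per task: " + maxRecords);
        return super.getSplits(context);
    }
}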
From source file: com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormat.java
License: Apache License
/**
 * Gets the high water mark value.
 */
public static long getHighWaterMark(@Nonnull final JobContext job) {
    return job.getConfiguration().getLong(HIGH_WATER_MARK, Long.MIN_VALUE);
}
From source file: com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormat.java
License: Apache License
/**
 * Gets the maximum file age.
 */
public static long getMaxFileAge(@Nonnull final JobContext job) {
    return job.getConfiguration().getLong(MAX_FILE_AGE, Long.MAX_VALUE);
}
From source file: com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormat.java
License: Apache License
/**
 * Gets the minimum file age.
 */
public static long getMinFileAge(@Nonnull final JobContext job) {
    return job.getConfiguration().getLong(MIN_FILE_AGE, Long.MIN_VALUE);
}
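The three getters above can be exercised from a driver, since org.apache.hadoop.mapreduce.Job implements JobContext. A minimal sketch, assuming only the static getters shown above; the underlying property keys stay hidden behind the HIGH_WATER_MARK, MAX_FILE_AGE and MIN_FILE_AGE constants.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormat;

public class HighWaterMarkExample {
    public static void main(String[] args) throws Exception {
        // Job implements JobContext, so it can be passed to the static getters directly.
        Job job = Job.getInstance(new Configuration());

        // Each getter falls back to its documented default when the property is unset.
        long highWaterMark = HighWaterMarkInputFormat.getHighWaterMark(job); // Long.MIN_VALUE by default
        long maxFileAge = HighWaterMarkInputFormat.getMaxFileAge(job);       // Long.MAX_VALUE by default
        long minFileAge = HighWaterMarkInputFormat.getMinFileAge(job);       // Long.MIN_VALUE by default

        System.out.printf("highWaterMark=%d maxFileAge=%d minFileAge=%d%n",
                highWaterMark, maxFileAge, minFileAge);
    }
}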
From source file: com.topsoft.botspider.avro.mapreduce.output.ExtFileOutputFormat.java
License: Apache License
/**
 * Sets the base output name for the output file to be created.
 */
public static void setOutputName(JobContext job, String name) {
    job.getConfiguration().set(BASE_OUTPUT_NAME, name);
}
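A short usage sketch for the setter above: a driver stores the base output name in the job configuration so the output format can read it back later through getConfiguration(). The job name and output path below are illustrative, not taken from the original project.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.topsoft.botspider.avro.mapreduce.output.ExtFileOutputFormat;

public class OutputNameExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "output-name-example");

        // Store the base output name in the job configuration; record writers created by
        // the output format can later read it back via context.getConfiguration().
        ExtFileOutputFormat.setOutputName(job, "events");

        FileOutputFormat.setOutputPath(job, new Path("/tmp/output")); // illustrative path
    }
}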
From source file: com.toshiba.mwcloud.gs.hadoop.mapreduce.GSRowInputFormat.java
License: Apache License
/**
 * Generate a list of GridDB InputSplit objects.<br/>
 * The number of InputSplits will be the smaller of the number of partitions for input
 * processing and the value of the property mapreduce.job.maps.
 * @param context JobContext object
 * @throws GSException an exception occurred in GridDB
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    int numSplits = conf.getInt("mapreduce.job.maps", 1);
    GDInputFormat inputFormat = new GDInputFormat();
    List<InputSplit> splits = inputFormat.getSplitList(numSplits, conf);
    return splits;
}
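Because getSplits() reads mapreduce.job.maps from the job configuration, a driver can cap the number of GridDB input splits before submission. A minimal sketch; the remaining GridDB connection settings required by GSRowInputFormat are omitted here.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GridDbSplitCountExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Cap the number of InputSplits; getSplits() above reads this property
        // from context.getConfiguration() and passes it to GDInputFormat.
        conf.setInt("mapreduce.job.maps", 8);

        Job job = Job.getInstance(conf, "griddb-input-example");
        // job.setInputFormatClass(GSRowInputFormat.class); // plus GridDB connection settings
    }
}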
From source file: com.transwarp.hbase.bulkload.combine.remote.CombineRemoteFileInputFormat.java
License: Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // The values specified by setXxxSplitSize() take precedence over the
    // values that might have been specified in the config.
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapred.max.split.size", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack " + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be smaller than minimum split size per rack " + minSizeRack);
    }

    // All the files in the input set.
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it first, otherwise we will incur doing it multiple
    // times, one time each for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        //Path p = new Path(paths[i].toUri().getPath());
        Path p = paths[i];
        newpaths.add(p);
    }
    paths = null;

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        // Pick one input path. If it matches all the filters in a pool,
        // add it to the output set.
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        // Create splits for all files in this pool.
        getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);
    }

    // Create splits for all files that are not in any pool.
    getMoreSplits(job, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // Free up the rackToNodes map.
    rackToNodes.clear();
    return splits;
}
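The split-size limits consulted by getSplits() can be supplied through the job configuration when the corresponding setXxxSplitSize() methods are not used. A minimal driver sketch, assuming only the property names read in the code above; the sizes are illustrative and must satisfy the per-node <= per-rack <= max ordering that getSplits() enforces.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class CombineSplitSizeExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // These are the exact property names read by getSplits() above when the
        // corresponding setXxxSplitSize() values have not been set on the format.
        conf.setLong("mapred.min.split.size.per.node", 64L * 1024 * 1024);  //  64 MB
        conf.setLong("mapred.min.split.size.per.rack", 128L * 1024 * 1024); // 128 MB
        conf.setLong("mapred.max.split.size", 512L * 1024 * 1024);          // 512 MB

        Job job = Job.getInstance(conf, "combine-remote-input-example");
        // job.setInputFormatClass(CombineRemoteFileInputFormat.class); // plus input paths, etc.
    }
}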
From source file: com.transwarp.hbase.bulkload.combine.remote.CombineRemoteFileInputFormat.java
License: Apache License
/**
 * Return all the splits in the specified set of paths.
 */
private void getMoreSplits(JobContext job, Path[] paths, long maxSize, long minSizeNode, long minSizeRack,
        List<InputSplit> splits) throws IOException {
    Configuration conf = job.getConfiguration();

    // All blocks for all the files in the input set.
    OneFileInfo[] files;

    // Mapping from a rack name to the list of blocks it has.
    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();

    // Mapping from a block to the nodes on which it has replicas.
    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();

    // Mapping from a node to the list of blocks that it contains.
    HashMap<String, List<OneBlockInfo>> nodeToBlocks = new HashMap<String, List<OneBlockInfo>>();

    files = new OneFileInfo[paths.length];
    if (paths.length == 0) {
        return;
    }

    // Populate all the blocks for all files.
    long totLength = 0;
    for (int i = 0; i < paths.length; i++) {
        files[i] = new OneFileInfo(paths[i], conf, isSplitable(job, paths[i]), rackToBlocks, blockToNodes,
                nodeToBlocks, rackToNodes, maxSize);
        totLength += files[i].getLength();
    }

    ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
    Set<String> nodes = new HashSet<String>();
    long curSplitSize = 0;

    // Process all nodes and create splits that are local to a node.
    for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator(); iter
            .hasNext();) {
        Map.Entry<String, List<OneBlockInfo>> one = iter.next();
        nodes.add(one.getKey());
        List<OneBlockInfo> blocksInNode = one.getValue();

        // For each block, copy it into validBlocks. Delete it from
        // blockToNodes so that the same block does not appear in
        // two different splits.
        for (OneBlockInfo oneblock : blocksInNode) {
            if (blockToNodes.containsKey(oneblock)) {
                validBlocks.add(oneblock);
                blockToNodes.remove(oneblock);
                curSplitSize += oneblock.length;

                // If the accumulated split size exceeds the maximum, then create this split.
                if (maxSize != 0 && curSplitSize >= maxSize) {
                    // Create an input split and add it to the splits array.
                    addCreatedSplit(splits, nodes, validBlocks);
                    curSplitSize = 0;
                    validBlocks.clear();
                }
            }
        }
        // If there were any blocks left over and their combined size is
        // larger than minSizeNode, then combine them into one split.
        // Otherwise add them back to the unprocessed pool. It is likely
        // that they will be combined with other blocks from the same rack later on.
        if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
            // Create an input split and add it to the splits array.
            addCreatedSplit(splits, nodes, validBlocks);
        } else {
            for (OneBlockInfo oneblock : validBlocks) {
                blockToNodes.put(oneblock, oneblock.hosts);
            }
        }
        validBlocks.clear();
        nodes.clear();
        curSplitSize = 0;
    }

    // If blocks in a rack are below the specified minimum size, then keep them
    // in 'overflow'. After the processing of all racks is complete, these
    // overflow blocks will be combined into splits.
    ArrayList<OneBlockInfo> overflowBlocks = new ArrayList<OneBlockInfo>();
    Set<String> racks = new HashSet<String>();

    // Process all racks over and over again until there is no more work to do.
    while (blockToNodes.size() > 0) {

        // Create one split for this rack before moving over to the next rack.
        // Come back to this rack after creating a single split for each of the
        // remaining racks. Process one rack location at a time; combine all possible
        // blocks that reside on this rack as one split (constrained by minimum and
        // maximum split size).

        // Iterate over all racks.
        for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = rackToBlocks.entrySet().iterator(); iter
                .hasNext();) {
            Map.Entry<String, List<OneBlockInfo>> one = iter.next();
            racks.add(one.getKey());
            List<OneBlockInfo> blocks = one.getValue();

            // For each block, copy it into validBlocks. Delete it from
            // blockToNodes so that the same block does not appear in
            // two different splits.
            boolean createdSplit = false;
            for (OneBlockInfo oneblock : blocks) {
                if (blockToNodes.containsKey(oneblock)) {
                    validBlocks.add(oneblock);
                    blockToNodes.remove(oneblock);
                    curSplitSize += oneblock.length;

                    // If the accumulated split size exceeds the maximum, then create this split.
                    if (maxSize != 0 && curSplitSize >= maxSize) {
                        // Create an input split and add it to the splits array.
                        addCreatedSplit(splits, getHosts(racks), validBlocks);
                        createdSplit = true;
                        break;
                    }
                }
            }

            // If we created a split, then just go to the next rack.
            if (createdSplit) {
                curSplitSize = 0;
                validBlocks.clear();
                racks.clear();
                continue;
            }

            if (!validBlocks.isEmpty()) {
                if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
                    // If there is a minimum size specified, then create a single split;
                    // otherwise, store these blocks into the overflow data structure.
                    addCreatedSplit(splits, getHosts(racks), validBlocks);
                } else {
                    // There were a few blocks in this rack that remained to be processed.
                    // Keep them in the 'overflow' block list. These will be combined later.
                    overflowBlocks.addAll(validBlocks);
                }
            }
            curSplitSize = 0;
            validBlocks.clear();
            racks.clear();
        }
    }

    assert blockToNodes.isEmpty();
    assert curSplitSize == 0;
    assert validBlocks.isEmpty();
    assert racks.isEmpty();

    // Process all overflow blocks.
    for (OneBlockInfo oneblock : overflowBlocks) {
        validBlocks.add(oneblock);
        curSplitSize += oneblock.length;

        // This might cause an existing rack location to be re-added, but it should be ok.
        for (int i = 0; i < oneblock.racks.length; i++) {
            racks.add(oneblock.racks[i]);
        }

        // If the accumulated split size exceeds the maximum, then create this split.
        if (maxSize != 0 && curSplitSize >= maxSize) {
            // Create an input split and add it to the splits array.
            addCreatedSplit(splits, getHosts(racks), validBlocks);
            curSplitSize = 0;
            validBlocks.clear();
            racks.clear();
        }
    }

    // Process any remaining blocks, if any.
    if (!validBlocks.isEmpty()) {
        addCreatedSplit(splits, getHosts(racks), validBlocks);
    }
}
From source file: com.twitter.algebra.matrix.format.MatrixOutputFormat.java
License: Apache License
@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
    if (baseOut == null) {
        getBaseOutputFormat(context.getConfiguration());
    }
    super.checkOutputSpecs(context);
}
From source file: com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License: Apache License
/**
 * Go through each original input split, get its file path, and check the index file:
 * a) keep the split when there is no index prebuilt on its file
 *    (or the index file doesn't match the base file's checksum);
 * b) remove the split when no matching value is found in the existing index file;
 * c) construct new, smaller input splits using the indexed blocks found in the index file.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    String inputformat = job.getConfiguration().get(REALINPUTFORMAT);
    String valueClass = job.getConfiguration().get(VALUECLASS);

    List<InputSplit> filteredList = new ArrayList<InputSplit>();
    FileInputFormat<K, V> realInputFormat = getInputFormatClass(inputformat, valueClass);
    List<InputSplit> splits = realInputFormat.getSplits(job);

    // If this is an indexing job, don't skip any input splits.
    // If it is a searching job but there is no search filter, skip the index as well.
    if (isIndexingJob(job) || getFilterCondition(job) == null)
        return splits;

    Path prevFile = null;             // remember the last input file we saw
    boolean foundIndexedFile = false; // is there an index file for prevFile?
    boolean firstTime = true;         // is this the first time we see this file?

    long totalOriginalBytes = 0;      // the bytes to be scanned without indexes
    totalBytesNewSplits = 0;
    long startTime = System.currentTimeMillis();
    LOG.info("start filtering out original input splits (total " + splits.size() + ") using indexes");
    Configuration conf = job.getConfiguration();
    long splitMaxSize;

    // For each original input split, check if we can filter it out.
    for (InputSplit split : splits) {
        FileSplit fileSplit = (FileSplit) split;
        Path path = fileSplit.getPath();
        splitLength = fileSplit.getLength();
        totalOriginalBytes += fileSplit.getLength();
        splitMaxSize = Math.max(splitLength,
                conf.getInt(INDEXED_SPLIT_SIZE, conf.getInt("dfs.block.size", 256 * 1024 * 1024)));

        /*
         * For each new file we see, we first check whether it has been indexed.
         * If not, we just add the original input split; if yes, we use the index
         * file to add filtered splits for the file.
         */
        if (prevFile != null && path.equals(prevFile)) {
            firstTime = false;
        } else {
            prevFile = path;
            firstTime = true;
            foundIndexedFile = foundIndexFile(job, path);
        }

        // If there is no index file, we'll have to read all original input splits.
        if (!foundIndexedFile)
            filteredList.add(fileSplit);
        else {
            // For each file we only add its filtered input splits once, using the index file.
            if (firstTime) {
                // LOG.info("first time saw " + path + ", adding filtered splits from index file");
                filteredList.addAll(getFilteredSplits(job, path, fileSplit.getLocations(), splitMaxSize));
            }
        }
    }

    long endTime = System.currentTimeMillis();
    LOG.info("finished filtering out input splits, now total splits:" + filteredList.size()
            + ", seconds used: " + (endTime - startTime) / 1000);
    LOG.info(String.format("total bytes to read before filtering: %s, after filtering %s, bytes ratio: %s",
            totalOriginalBytes, totalBytesNewSplits, totalOriginalBytes / Math.max(1, totalBytesNewSplits)));

    return filteredList;
}
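One detail worth isolating from the method above is how the per-split size cap is derived: the larger of the original split length and a configured indexed-split size, which itself falls back to dfs.block.size and then to 256 MB. A standalone sketch; the key "elephanttwin.indexed.split.size" is only a placeholder for the INDEXED_SPLIT_SIZE constant, whose actual value is not shown here.

import org.apache.hadoop.conf.Configuration;

public class SplitSizeCapExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        long splitLength = 180L * 1024 * 1024; // length of one original FileSplit (illustrative)

        // Placeholder key standing in for the INDEXED_SPLIT_SIZE constant used in getSplits() above.
        int indexedSplitSize = conf.getInt("elephanttwin.indexed.split.size",
                conf.getInt("dfs.block.size", 256 * 1024 * 1024));

        // The cap never shrinks below the original split length.
        long splitMaxSize = Math.max(splitLength, indexedSplitSize);
        System.out.println("splitMaxSize = " + splitMaxSize); // 256 MB with default settings
    }
}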
From source file: com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License: Apache License
/**
 * @param context the JobContext
 * @return the index directory provided to the job
 */
public static String getIndexDir(JobContext context) {
    return context.getConfiguration().get(INDEXDIR);
}