Usage examples for org.apache.hadoop.mapreduce.JobContext#getConfiguration()

Method signature:

public Configuration getConfiguration();

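A minimal usage sketch, for orientation: a custom InputFormat typically calls getConfiguration() on the JobContext passed to getSplits() to read its job settings. The class and the property name "example.num.splits" below are hypothetical illustrations, not taken from the examples that follow.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

// Illustrative only: the class and the property name "example.num.splits"
// are hypothetical; they do not come from the examples below.
public abstract class ExampleInputFormat<K, V> extends InputFormat<K, V> {
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // Job settings are exposed to the InputFormat through the context.
        Configuration conf = context.getConfiguration();
        int numSplits = conf.getInt("example.num.splits", 1); // falls back to 1 if unset
        List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
        // ... add one InputSplit per desired partition here ...
        return splits;
    }
}
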
From source file: com.scaleoutsoftware.soss.hserver.GridInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    int appId = jobContext.getConfiguration().getInt(inputAppIdProperty, 0);
    int suggestedNumberOfSplits = getSuggestedNumberOfSplits(jobContext);
    return getSplits(appId, suggestedNumberOfSplits);
}

From source file: com.scaleoutsoftware.soss.hserver.GridInputFormat.java
License: Apache License

/**
 * Gets the number of input splits. First tries the corresponding property,
 * then falls back to the number of available map slots.
 *
 * @param context job context
 * @return number of input splits
 */
private int getSuggestedNumberOfSplits(JobContext context) throws IOException {
    int numberOfSplits;
    Configuration conf = context.getConfiguration();
    numberOfSplits = conf.getInt(inputNumberOfSplitsProperty, -1);
    if (numberOfSplits > 0) {
        return numberOfSplits;
    }
    if (HServerParameters.isHServerJob(conf)) {
        // We are running an hServer job, not a Hadoop job.
        return HSERVER_JOB_DEFAULT_NUMBER_OF_SPLITS;
    }
    try {
        // Wrap the Configuration in a JobConf rather than casting it: the
        // original cast, (JobConf) context.getConfiguration(), throws a
        // ClassCastException whenever the context holds a plain Configuration.
        ClusterStatus status = new JobClient(new JobConf(conf)).getClusterStatus();
        numberOfSplits = status.getMaxMapTasks() - status.getMapTasks();
        if (numberOfSplits > 0) {
            return numberOfSplits;
        }
    } catch (Throwable t) {
        // Do nothing; fall back to the default.
    }
    return DEFAULT_NUMBER_OF_SPLITS;
}

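On the driver side, a property like the one read above is set on the job's Configuration before submission. A sketch, assuming a hypothetical property key (the real key is whatever the inputNumberOfSplitsProperty field of GridInputFormat holds):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GridInputDriverSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // "example.input.number.of.splits" is a hypothetical key; the real
        // key is whatever inputNumberOfSplitsProperty holds in GridInputFormat.
        conf.setInt("example.input.number.of.splits", 16);
        Job job = Job.getInstance(conf, "grid-input-example");
        // ... configure input format, mapper, reducer, then submit ...
    }
}
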
From source file: com.sourcecode.FileInputFormat.java
License: Apache License

/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return list of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // Whether we need to recursively look into the directory structure
    boolean recursive = getInputDirRecursive(job);

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user-provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    List<FileStatus> result = null;
    int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS);
    Stopwatch sw = new Stopwatch().start();
    if (numThreads == 1) {
        result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
    } else {
        Iterable<FileStatus> locatedFiles = null;
        try {
            LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(
                    job.getConfiguration(), dirs, recursive, inputFilter, true);
            locatedFiles = locatedFileStatusFetcher.getFileStatuses();
        } catch (InterruptedException e) {
            throw new IOException("Interrupted while getting file statuses");
        }
        result = Lists.newArrayList(locatedFiles);
    }
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Time taken to get FileStatuses: " + sw.elapsedMillis());
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}

From source file: com.sourcecode.FileInputFormat.java
License: Apache License

/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = new Stopwatch().start();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            // Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size()
                + ", TimeTaken: " + sw.elapsedMillis());
    }
    return splits;
}

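For reference, the computeSplitSize helper used above is the stock Hadoop FileInputFormat logic: the split size is the block size, clamped into the configured [minSize, maxSize] range.

// Stock Hadoop FileInputFormat implementation: the split size is the
// block size, clamped into [minSize, maxSize].
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}
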
From source file: com.splicemachine.mrio.api.core.AbstractSMInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    setConf(context.getConfiguration());
    if (LOG.isDebugEnabled())
        SpliceLogUtils.debug(LOG, "getSplits with context=%s", context);
    Scan s;
    try {
        TableScannerBuilder tsb = TableScannerBuilder
                .getTableScannerBuilderFromBase64String(conf.get(MRConstants.SPLICE_SCAN_INFO));
        s = ((HScan) tsb.getScan()).unwrapDelegate();
    } catch (StandardException e) {
        SpliceLogUtils.error(LOG, e);
        throw new IOException(e);
    }
    SIDriver driver = SIDriver.driver();
    HBaseConnectionFactory instance = HBaseConnectionFactory.getInstance(driver.getConfiguration());
    Clock clock = driver.getClock();
    Connection connection = instance.getConnection();
    Partition clientPartition = new ClientPartition(connection, table.getName(), table, clock,
            driver.getPartitionInfoCache());
    int retryCounter = 0;
    boolean refresh = false;
    while (true) {
        try {
            List<Partition> splits = clientPartition.subPartitions(s.getStartRow(), s.getStopRow(), refresh);

            if (oneSplitPerRegion(conf))
                return toSMSplits(splits);

            if (LOG.isDebugEnabled()) {
                SpliceLogUtils.debug(LOG, "getSplits " + splits);
                for (Partition split : splits) {
                    SpliceLogUtils.debug(LOG, "split -> " + split);
                }
            }
            SubregionSplitter splitter = new HBaseSubregionSplitter();
            List<InputSplit> results = splitter.getSubSplits(table, splits, s.getStartRow(), s.getStopRow());
            return results;
        } catch (HMissedSplitException e) {
            // retry with a refreshed region cache
            refresh = true;
            LOG.warn("Missed split computing subtasks for region " + clientPartition);
            retryCounter++;
            if (retryCounter > MAX_RETRIES) {
                throw e;
            }
        }
    }
}

From source file: com.splicemachine.orc.input.SpliceOrcNewInputFormat.java
License: Open Source License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    Configuration configuration = jobContext.getConfiguration();
    // Filter based on partition logic
    List<InputSplit> inputSplits = SpliceOrcUtils.getSplits(jobContext);
    final List<Integer> partitions = getReadColumnIDs(SPLICE_PARTITIONS, jobContext.getConfiguration());
    final List<Integer> columns = getReadColumnIDs(SPLICE_COLUMNS, jobContext.getConfiguration());
    final StructType structType = getRowStruct(configuration);
    final SpliceORCPredicate orcPredicate = getSplicePredicate(configuration);
    try {
        // Predicate pruning...
        return Lists.newArrayList(Iterables.filter(inputSplits, new Predicate<InputSplit>() {
            @Override
            public boolean apply(@Nullable InputSplit s) {
                try {
                    List<String> values = Warehouse
                            .getPartValuesFromPartName(((OrcNewSplit) s).getPath().toString());
                    Map<Integer, ColumnStatistics> columnStatisticsMap = SpliceORCPredicate.partitionStatsEval(
                            columns, structType, partitions, values.toArray(new String[values.size()]));
                    return orcPredicate.matches(10000, columnStatisticsMap);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
        }));
    } catch (Exception e) {
        throw new IOException(e);
    }
}

From source file: com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroconvert.AvroConversionInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    return ImmutableList.<InputSplit>of(
            new FileSplit(jobContext.getConfiguration().get(AvroConversionCommonConstants.INPUT_FILE),
                    jobContext.getConfiguration().get(AvroConversionCommonConstants.OUTPUT_DIR)));
}

From source file: com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroparquet.AvroParquetInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    return ImmutableList.<InputSplit>of(
            new FileSplit(jobContext.getConfiguration().get(AvroParquetConstants.INPUT_FILE),
                    jobContext.getConfiguration().get(AvroParquetConstants.OUTPUT_DIR)));
}

From source file: com.streamsets.pipeline.stage.destination.SimpleTestInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    Configuration conf = jobContext.getConfiguration();

    if (conf.getBoolean(THROW_EXCEPTION, false)) {
        throw new IOException("Throwing exception as instructed, failure in bootstrapping MR job.");
    }

    String fileLocation = conf.get(FILE_LOCATION);
    if (fileLocation != null) {
        FileUtils.writeStringToFile(new File(fileLocation), conf.get(FILE_VALUE));
    }

    return Collections.emptyList();
}

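A driver-side sketch of how the flags read above could be preset, assuming the THROW_EXCEPTION, FILE_LOCATION and FILE_VALUE constants are publicly visible on SimpleTestInputFormat (their string keys are not shown in this example):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.streamsets.pipeline.stage.destination.SimpleTestInputFormat;

public class SimpleTestDriverSketch {
    public static void main(String[] args) throws Exception {
        // Assumption: THROW_EXCEPTION, FILE_LOCATION and FILE_VALUE are
        // public constants on SimpleTestInputFormat.
        Configuration conf = new Configuration();
        conf.setBoolean(SimpleTestInputFormat.THROW_EXCEPTION, false); // flip to true to force failure
        conf.set(SimpleTestInputFormat.FILE_LOCATION, "/tmp/mr-test-marker");
        conf.set(SimpleTestInputFormat.FILE_VALUE, "job-ran");
        Job job = Job.getInstance(conf, "simple-test");
        job.setInputFormatClass(SimpleTestInputFormat.class);
    }
}
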
From source file: com.telefonica.iot.tidoop.apiext.hadoop.ckan.CKANInputFormat.java
License: Open Source License

@Override
public List<InputSplit> getSplits(JobContext job) {
    // create a CKAN backend
    String ckanHost = job.getConfiguration().get(INPUT_CKAN_HOST);
    String ckanPort = job.getConfiguration().get(INPUT_CKAN_PORT);
    boolean ckanSSL = job.getConfiguration().get(INPUT_CKAN_SSL).equals("true");
    String ckanAPIKey = job.getConfiguration().get(INPUT_CKAN_API_KEY);
    int splitsLength = Integer.parseInt(job.getConfiguration().get(INPUT_CKAN_SPLITS_LENGTH));
    logger.info("Getting splits, the backend is at " + (ckanSSL ? "https://" : "http://") + ckanHost + ":"
            + ckanPort + " (API key=" + ckanAPIKey + ")");

    if (backend == null) {
        backend = new CKANBackend(ckanHost, ckanPort, ckanSSL, ckanAPIKey, splitsLength);
    } // if

    // resulting splits container
    List<InputSplit> splits = new ArrayList<InputSplit>();

    // get the Job configuration
    Configuration conf = job.getConfiguration();

    // get the inputs, i.e. the list of CKAN URLs
    String input = conf.get(INPUT_CKAN_URLS, "");
    String[] ckanURLs = StringUtils.split(input);

    // iterate on the CKAN URLs; they may refer to whole organizations,
    // packages/datasets or specific resources
    for (String ckanURL : ckanURLs) {
        if (isCKANOrg(ckanURL)) {
            logger.info("Getting splits for " + ckanURL + ", it is an organization");
            splits.addAll(getSplitsOrg(ckanURL, job.getConfiguration()));
        } else if (isCKANPkg(ckanURL)) {
            logger.info("Getting splits for " + ckanURL + ", it is a package/dataset");
            splits.addAll(getSplitsPkg(ckanURL, job.getConfiguration()));
        } else {
            logger.info("Getting splits for " + ckanURL + ", it is a resource");
            splits.addAll(getSplitsRes(ckanURL, job.getConfiguration()));
        } // if else if
    } // for

    // return the splits
    logger.info("Number of total splits=" + splits.size());
    return splits;
}

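A driver-side sketch of how these properties could be populated before submission. It assumes the INPUT_CKAN_* constants referenced above are publicly visible on CKANInputFormat; the host, key, and URL values are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.telefonica.iot.tidoop.apiext.hadoop.ckan.CKANInputFormat;

public class CKANDriverSketch {
    public static void main(String[] args) throws Exception {
        // Assumption: the INPUT_CKAN_* constants are public on CKANInputFormat.
        Configuration conf = new Configuration();
        conf.set(CKANInputFormat.INPUT_CKAN_HOST, "data.example.org");
        conf.set(CKANInputFormat.INPUT_CKAN_PORT, "443");
        conf.set(CKANInputFormat.INPUT_CKAN_SSL, "true");
        conf.set(CKANInputFormat.INPUT_CKAN_API_KEY, "<api-key>"); // placeholder
        conf.set(CKANInputFormat.INPUT_CKAN_SPLITS_LENGTH, "1000");
        conf.set(CKANInputFormat.INPUT_CKAN_URLS, "https://data.example.org/dataset/example-dataset");
        Job job = Job.getInstance(conf, "ckan-input-example");
        job.setInputFormatClass(CKANInputFormat.class);
    }
}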