Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

This page shows example usages of org.apache.hadoop.mapreduce JobContext.getConfiguration.

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
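
A minimal sketch of the pattern shared by all the examples below: an InputFormat pulls the job's Configuration out of the JobContext and reads its settings from it. The property key here is a hypothetical placeholder, not part of the JobContext API.

import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

public abstract class ConfigurationReadingInputFormat<K, V> extends InputFormat<K, V> {
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // Every job-scoped setting flows through this single accessor.
        Configuration conf = context.getConfiguration();
        int parallelism = conf.getInt("example.parallelism", 1); // hypothetical key
        return Collections.emptyList(); // real formats build their splits here
    }
}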

Usage

From source file:com.scaleoutsoftware.soss.hserver.GridInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    int appId = jobContext.getConfiguration().getInt(inputAppIdProperty, 0);
    int suggestedNumberOfSplits = getSuggestedNumberOfSplits(jobContext);
    return getSplits(appId, suggestedNumberOfSplits);
}

From source file:com.scaleoutsoftware.soss.hserver.GridInputFormat.java

License:Apache License

/**
 * Gets the number of input splits. First, tries the corresponding property,
 * then falls back to the number of available slots.
 *
 * @param context job context
 * @return number of input splits
 */
private int getSuggestedNumberOfSplits(JobContext context) throws IOException {
    int numberOfSplits;
    Configuration conf = context.getConfiguration();
    numberOfSplits = conf.getInt(inputNumberOfSplitsProperty, -1);
    if (numberOfSplits > 0)
        return numberOfSplits;
    if (HServerParameters.isHServerJob(context.getConfiguration())) { //We are running a hServer job, not a Hadoop job
        return HSERVER_JOB_DEFAULT_NUMBER_OF_SPLITS;
    }
    try {
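        // This cast relies on the context's configuration actually being a
        // JobConf (true for Hadoop's JobContextImpl); if it is not, the
        // resulting ClassCastException is absorbed by the catch below.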
        ClusterStatus status = (new JobClient((JobConf) context.getConfiguration())).getClusterStatus();
        numberOfSplits = status.getMaxMapTasks() - status.getMapTasks();
        if (numberOfSplits > 0)
            return numberOfSplits;
    } catch (Throwable t) {
        // Do nothing; fall back to the default.
    }
    return DEFAULT_NUMBER_OF_SPLITS;
}
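
The fallback chain is: the explicit splits property, then the hServer default, then the cluster's free map slots, then DEFAULT_NUMBER_OF_SPLITS. A driver can therefore pin the split count up front. A hedged sketch, with a placeholder key because the real string behind inputNumberOfSplitsProperty is internal to GridInputFormat:

import org.apache.hadoop.conf.Configuration;

public class SplitCountDriver {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Placeholder key: the real string sits behind GridInputFormat's
        // inputNumberOfSplitsProperty field.
        conf.setInt("example.number.of.splits", 16);
        // ... pass conf to the Job that uses GridInputFormat ...
    }
}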

From source file:com.sourcecode.FileInputFormat.java

License:Apache License

/** List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 * 
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // Whether we need to recursively look into the directory structure
    boolean recursive = getInputDirRecursive(job);

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    List<FileStatus> result = null;

    int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS);
    Stopwatch sw = new Stopwatch().start();
    if (numThreads == 1) {
        result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
    } else {
        Iterable<FileStatus> locatedFiles = null;
        try {
            LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(
                    job.getConfiguration(), dirs, recursive, inputFilter, true);
            locatedFiles = locatedFileStatusFetcher.getFileStatuses();
        } catch (InterruptedException e) {
            throw new IOException("Interrupted while getting file statuses");
        }
        result = Lists.newArrayList(locatedFiles);
    }

    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Time taken to get FileStatuses: " + sw.elapsedMillis());
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}
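
The only tuning knob this method reads is LIST_STATUS_NUM_THREADS, which switches input listing from the single-threaded path to the parallel LocatedFileStatusFetcher. Assuming this copy keeps stock Hadoop's key name (in Hadoop it is mapreduce.input.fileinputformat.list-status.num-threads), a driver enables it like this:

import org.apache.hadoop.mapreduce.Job;

public class ParallelListingDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        // Stock Hadoop key; assumed to match this class's
        // LIST_STATUS_NUM_THREADS constant.
        job.getConfiguration().setInt(
                "mapreduce.input.fileinputformat.list-status.num-threads", 4);
    }
}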

From source file:com.sourcecode.FileInputFormat.java

License:Apache License

/** 
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = new Stopwatch().start();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: "
                + sw.elapsedMillis());
    }
    return splits;
}
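
In stock Hadoop, computeSplitSize(blockSize, minSize, maxSize) returns Math.max(minSize, Math.min(maxSize, blockSize)): the split size defaults to the block size and is clamped by the two job-level knobs. Assuming this copy behaves the same, a driver can bound the splits with the standard helpers:

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitSizeDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        // Stock Hadoop helpers; they set
        // mapreduce.input.fileinputformat.split.minsize / .maxsize.
        FileInputFormat.setMinInputSplitSize(job, 64L * 1024 * 1024);  // 64 MB
        FileInputFormat.setMaxInputSplitSize(job, 256L * 1024 * 1024); // 256 MB
    }
}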

From source file:com.splicemachine.mrio.api.core.AbstractSMInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    setConf(context.getConfiguration());
    if (LOG.isDebugEnabled())
        SpliceLogUtils.debug(LOG, "getSplits with context=%s", context);
    Scan s;
    try {
        TableScannerBuilder tsb = TableScannerBuilder
                .getTableScannerBuilderFromBase64String(conf.get(MRConstants.SPLICE_SCAN_INFO));
        s = ((HScan) tsb.getScan()).unwrapDelegate();
    } catch (StandardException e) {
        SpliceLogUtils.error(LOG, e);
        throw new IOException(e);
    }
    SIDriver driver = SIDriver.driver();
    HBaseConnectionFactory instance = HBaseConnectionFactory.getInstance(driver.getConfiguration());
    Clock clock = driver.getClock();
    Connection connection = instance.getConnection();
    Partition clientPartition = new ClientPartition(connection, table.getName(), table, clock,
            driver.getPartitionInfoCache());
    int retryCounter = 0;
    boolean refresh = false;
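    // Retry loop: HMissedSplitException triggers a refresh of the cached
    // region information and a recompute, up to MAX_RETRIES attempts.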
    while (true) {
        try {
            List<Partition> splits = clientPartition.subPartitions(s.getStartRow(), s.getStopRow(), refresh);

            if (oneSplitPerRegion(conf))
                return toSMSplits(splits);
            if (LOG.isDebugEnabled()) {
                SpliceLogUtils.debug(LOG, "getSplits " + splits);
                for (Partition split : splits) {
                    SpliceLogUtils.debug(LOG, "split -> " + split);
                }
            }
            SubregionSplitter splitter = new HBaseSubregionSplitter();
            List<InputSplit> results = splitter.getSubSplits(table, splits, s.getStartRow(), s.getStopRow());

            return results;
        } catch (HMissedSplitException e) {
            // retry;
            refresh = true;
            LOG.warn("Missed split computing subtasks for region " + clientPartition);
            retryCounter++;
            if (retryCounter > MAX_RETRIES) {
                throw e;
            }
        }
    }
}

From source file:com.splicemachine.orc.input.SpliceOrcNewInputFormat.java

License:Open Source License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    Configuration configuration = jobContext.getConfiguration();

    // Filter Based On Partition Logic
    List<InputSplit> inputSplits = SpliceOrcUtils.getSplits(jobContext);
    final List<Integer> partitions = getReadColumnIDs(SPLICE_PARTITIONS, jobContext.getConfiguration());
    final List<Integer> columns = getReadColumnIDs(SPLICE_COLUMNS, jobContext.getConfiguration());
    final StructType structType = getRowStruct(configuration);
    final SpliceORCPredicate orcPredicate = getSplicePredicate(configuration);

    try {
        // Predicate Pruning...
        return Lists.newArrayList(Iterables.filter(inputSplits, new Predicate<InputSplit>() {
            @Override
            public boolean apply(@Nullable InputSplit s) {
                try {
                    List<String> values = Warehouse
                            .getPartValuesFromPartName(((OrcNewSplit) s).getPath().toString());
                    Map<Integer, ColumnStatistics> columnStatisticsMap = SpliceORCPredicate.partitionStatsEval(
                            columns, structType, partitions, values.toArray(new String[values.size()]));
                    return orcPredicate.matches(10000, columnStatisticsMap);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
        }));
    } catch (Exception e) {
        throw new IOException(e);
    }

}

From source file:com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroconvert.AvroConversionInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    return ImmutableList.<InputSplit>of(
            new FileSplit(jobContext.getConfiguration().get(AvroConversionCommonConstants.INPUT_FILE),
                    jobContext.getConfiguration().get(AvroConversionCommonConstants.OUTPUT_DIR)));
}

From source file:com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroparquet.AvroParquetInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    return ImmutableList
            .<InputSplit>of(new FileSplit(jobContext.getConfiguration().get(AvroParquetConstants.INPUT_FILE),
                    jobContext.getConfiguration().get(AvroParquetConstants.OUTPUT_DIR)));
}

From source file:com.streamsets.pipeline.stage.destination.SimpleTestInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    Configuration conf = jobContext.getConfiguration();

    if (conf.getBoolean(THROW_EXCEPTION, false)) {
        throw new IOException("Throwing exception as instructed, failure in bootstraping MR job.");
    }/*  www  .jav  a2s  .com*/

    String fileLocation = conf.get(FILE_LOCATION);
    if (fileLocation != null) {
        FileUtils.writeStringToFile(new File(fileLocation), conf.get(FILE_VALUE));
    }

    return Collections.emptyList();
}
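
Here the configuration serves as a test control channel: one flag forces the bootstrap to fail, and two more make the job write a marker file. A hedged driver-side sketch with placeholder keys; the real strings are the class's THROW_EXCEPTION, FILE_LOCATION and FILE_VALUE constants:

import org.apache.hadoop.conf.Configuration;

public class SimpleTestDriver {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Placeholder keys, for illustration only.
        conf.setBoolean("test.throw.exception", false);
        conf.set("test.file.location", "/tmp/mr-test-marker");
        conf.set("test.file.value", "job-started");
    }
}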

From source file:com.telefonica.iot.tidoop.apiext.hadoop.ckan.CKANInputFormat.java

License:Open Source License

@Override
public List<InputSplit> getSplits(JobContext job) {
    // create a CKAN backend
    String ckanHost = job.getConfiguration().get(INPUT_CKAN_HOST);
    String ckanPort = job.getConfiguration().get(INPUT_CKAN_PORT);
    boolean ckanSSL = "true".equals(job.getConfiguration().get(INPUT_CKAN_SSL)); // null-safe if unset
    String ckanAPIKey = job.getConfiguration().get(INPUT_CKAN_API_KEY);
    int splitsLength = Integer.parseInt(job.getConfiguration().get(INPUT_CKAN_SPLITS_LENGTH));
    logger.info("Getting splits, the backend is at " + (ckanSSL ? "https://" : "http://") + ckanHost + ":"
            + ckanPort + " (API key=" + ckanAPIKey + ")");

    if (backend == null) {
        backend = new CKANBackend(ckanHost, ckanPort, ckanSSL, ckanAPIKey, splitsLength);
    } // if

    // resulting splits container
    List<InputSplit> splits = new ArrayList<InputSplit>();

    // get the Job configuration
    Configuration conf = job.getConfiguration();

    // get the inputs, i.e. the list of CKAN URLs
    String input = conf.get(INPUT_CKAN_URLS, "");
    String[] ckanURLs = StringUtils.split(input);

    // iterate on the CKAN URLs, they may be related to whole organizations, packages/datasets or specific resources
    for (String ckanURL : ckanURLs) {
        if (isCKANOrg(ckanURL)) {
            logger.info("Getting splits for " + ckanURL + ", it is an organization");
            splits.addAll(getSplitsOrg(ckanURL, job.getConfiguration()));
        } else if (isCKANPkg(ckanURL)) {
            logger.info("Getting splits for " + ckanURL + ", it is a package/dataset");
            splits.addAll(getSplitsPkg(ckanURL, job.getConfiguration()));
        } else {
            logger.info("Getting splits for " + ckanURL + ", it is a resource");
            splits.addAll(getSplitsRes(ckanURL, job.getConfiguration()));
        } // if else if
    } // for

    // return the splits
    logger.info("Number of total splits=" + splits.size());
    return splits;
}
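
Everything this format needs arrives through the job configuration, so a driver has to populate all of the CKAN settings before submission. A sketch with placeholder key strings; the real ones sit behind the INPUT_CKAN_* constants in CKANInputFormat:

import org.apache.hadoop.conf.Configuration;

public class CKANJobDriver {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Placeholder keys, for illustration only.
        conf.set("input.ckan.host", "data.example.org");
        conf.set("input.ckan.port", "443");
        conf.set("input.ckan.ssl", "true");
        conf.set("input.ckan.api.key", "<api-key>");
        conf.set("input.ckan.splits.length", "1000");
        conf.set("input.ckan.urls",
                "https://data.example.org/dataset/example-dataset");
    }
}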