Usage examples for org.apache.hadoop.mapreduce.JobContext#getConfiguration()

Method signature:

public Configuration getConfiguration();

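A minimal usage sketch, for orientation: a custom InputFormat typically calls getConfiguration() on the JobContext passed to getSplits() to read its job settings. The class and the property name "example.num.splits" below are hypothetical illustrations, not taken from the examples that follow.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

// Illustrative only: the class and the property name "example.num.splits"
// are hypothetical; they do not come from the examples below.
public abstract class ExampleInputFormat<K, V> extends InputFormat<K, V> {
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // Job settings are exposed to the InputFormat through the context.
        Configuration conf = context.getConfiguration();
        int numSplits = conf.getInt("example.num.splits", 1); // falls back to 1 if unset
        List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
        // ... add one InputSplit per desired partition here ...
        return splits;
    }
}
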
From source file: com.scaleoutsoftware.soss.hserver.GridInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    int appId = jobContext.getConfiguration().getInt(inputAppIdProperty, 0);
    int suggestedNumberOfSplits = getSuggestedNumberOfSplits(jobContext);
    return getSplits(appId, suggestedNumberOfSplits);
}

From source file: com.scaleoutsoftware.soss.hserver.GridInputFormat.java
License: Apache License

/**
 * Gets the number of input splits. First tries the corresponding property,
 * then falls back to the number of available map slots.
 *
 * @param context job context
 * @return number of input splits
 */
private int getSuggestedNumberOfSplits(JobContext context) throws IOException {
    int numberOfSplits;
    Configuration conf = context.getConfiguration();
    numberOfSplits = conf.getInt(inputNumberOfSplitsProperty, -1);
    if (numberOfSplits > 0) {
        return numberOfSplits;
    }
    if (HServerParameters.isHServerJob(conf)) {
        // We are running an hServer job, not a Hadoop job.
        return HSERVER_JOB_DEFAULT_NUMBER_OF_SPLITS;
    }
    try {
        // Wrap the Configuration in a JobConf rather than casting it: the
        // original cast, (JobConf) context.getConfiguration(), throws a
        // ClassCastException whenever the context holds a plain Configuration.
        ClusterStatus status = new JobClient(new JobConf(conf)).getClusterStatus();
        numberOfSplits = status.getMaxMapTasks() - status.getMapTasks();
        if (numberOfSplits > 0) {
            return numberOfSplits;
        }
    } catch (Throwable t) {
        // Do nothing; fall back to the default.
    }
    return DEFAULT_NUMBER_OF_SPLITS;
}

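On the driver side, a property like the one read above is set on the job's Configuration before submission. A sketch, assuming a hypothetical property key (the real key is whatever the inputNumberOfSplitsProperty field of GridInputFormat holds):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GridInputDriverSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // "example.input.number.of.splits" is a hypothetical key; the real
        // key is whatever inputNumberOfSplitsProperty holds in GridInputFormat.
        conf.setInt("example.input.number.of.splits", 16);
        Job job = Job.getInstance(conf, "grid-input-example");
        // ... configure input format, mapper, reducer, then submit ...
    }
}
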
From source file: com.sourcecode.FileInputFormat.java
License: Apache License

/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return list of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // Whether we need to recursively look into the directory structure
    boolean recursive = getInputDirRecursive(job);

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user-provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    List<FileStatus> result = null;
    int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS);
    Stopwatch sw = new Stopwatch().start();
    if (numThreads == 1) {
        result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
    } else {
        Iterable<FileStatus> locatedFiles = null;
        try {
            LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(
                    job.getConfiguration(), dirs, recursive, inputFilter, true);
            locatedFiles = locatedFileStatusFetcher.getFileStatuses();
        } catch (InterruptedException e) {
            throw new IOException("Interrupted while getting file statuses");
        }
        result = Lists.newArrayList(locatedFiles);
    }
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Time taken to get FileStatuses: " + sw.elapsedMillis());
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}

From source file: com.sourcecode.FileInputFormat.java
License: Apache License

/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = new Stopwatch().start();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            // Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size()
                + ", TimeTaken: " + sw.elapsedMillis());
    }
    return splits;
}

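For reference, the computeSplitSize helper used above is the stock Hadoop FileInputFormat logic: the split size is the block size, clamped into the configured [minSize, maxSize] range.

// Stock Hadoop FileInputFormat implementation: the split size is the
// block size, clamped into [minSize, maxSize].
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}
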
From source file: com.splicemachine.mrio.api.core.AbstractSMInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    setConf(context.getConfiguration());
    if (LOG.isDebugEnabled())
        SpliceLogUtils.debug(LOG, "getSplits with context=%s", context);
    Scan s;
    try {
        TableScannerBuilder tsb = TableScannerBuilder
                .getTableScannerBuilderFromBase64String(conf.get(MRConstants.SPLICE_SCAN_INFO));
        s = ((HScan) tsb.getScan()).unwrapDelegate();
    } catch (StandardException e) {
        SpliceLogUtils.error(LOG, e);
        throw new IOException(e);
    }
    SIDriver driver = SIDriver.driver();
    HBaseConnectionFactory instance = HBaseConnectionFactory.getInstance(driver.getConfiguration());
    Clock clock = driver.getClock();
    Connection connection = instance.getConnection();
    Partition clientPartition = new ClientPartition(connection, table.getName(), table, clock,
            driver.getPartitionInfoCache());
    int retryCounter = 0;
    boolean refresh = false;
    while (true) {
        try {
            List<Partition> splits = clientPartition.subPartitions(s.getStartRow(), s.getStopRow(), refresh);

            if (oneSplitPerRegion(conf))
                return toSMSplits(splits);

            if (LOG.isDebugEnabled()) {
                SpliceLogUtils.debug(LOG, "getSplits " + splits);
                for (Partition split : splits) {
                    SpliceLogUtils.debug(LOG, "split -> " + split);
                }
            }
            SubregionSplitter splitter = new HBaseSubregionSplitter();
            List<InputSplit> results = splitter.getSubSplits(table, splits, s.getStartRow(), s.getStopRow());
            return results;
        } catch (HMissedSplitException e) {
            // retry with a refreshed region cache
            refresh = true;
            LOG.warn("Missed split computing subtasks for region " + clientPartition);
            retryCounter++;
            if (retryCounter > MAX_RETRIES) {
                throw e;
            }
        }
    }
}

From source file: com.splicemachine.orc.input.SpliceOrcNewInputFormat.java
License: Open Source License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    Configuration configuration = jobContext.getConfiguration();
    // Filter based on partition logic
    List<InputSplit> inputSplits = SpliceOrcUtils.getSplits(jobContext);
    final List<Integer> partitions = getReadColumnIDs(SPLICE_PARTITIONS, jobContext.getConfiguration());
    final List<Integer> columns = getReadColumnIDs(SPLICE_COLUMNS, jobContext.getConfiguration());
    final StructType structType = getRowStruct(configuration);
    final SpliceORCPredicate orcPredicate = getSplicePredicate(configuration);
    try {
        // Predicate pruning...
        return Lists.newArrayList(Iterables.filter(inputSplits, new Predicate<InputSplit>() {
            @Override
            public boolean apply(@Nullable InputSplit s) {
                try {
                    List<String> values = Warehouse
                            .getPartValuesFromPartName(((OrcNewSplit) s).getPath().toString());
                    Map<Integer, ColumnStatistics> columnStatisticsMap = SpliceORCPredicate.partitionStatsEval(
                            columns, structType, partitions, values.toArray(new String[values.size()]));
                    return orcPredicate.matches(10000, columnStatisticsMap);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
        }));
    } catch (Exception e) {
        throw new IOException(e);
    }
}

From source file: com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroconvert.AvroConversionInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    return ImmutableList.<InputSplit>of(
            new FileSplit(jobContext.getConfiguration().get(AvroConversionCommonConstants.INPUT_FILE),
                    jobContext.getConfiguration().get(AvroConversionCommonConstants.OUTPUT_DIR)));
}

From source file: com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroparquet.AvroParquetInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    return ImmutableList.<InputSplit>of(
            new FileSplit(jobContext.getConfiguration().get(AvroParquetConstants.INPUT_FILE),
                    jobContext.getConfiguration().get(AvroParquetConstants.OUTPUT_DIR)));
}

From source file: com.streamsets.pipeline.stage.destination.SimpleTestInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    Configuration conf = jobContext.getConfiguration();

    if (conf.getBoolean(THROW_EXCEPTION, false)) {
        throw new IOException("Throwing exception as instructed, failure in bootstrapping MR job.");
    }

    String fileLocation = conf.get(FILE_LOCATION);
    if (fileLocation != null) {
        FileUtils.writeStringToFile(new File(fileLocation), conf.get(FILE_VALUE));
    }

    return Collections.emptyList();
}

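A driver-side sketch of how the flags read above could be preset, assuming the THROW_EXCEPTION, FILE_LOCATION and FILE_VALUE constants are publicly visible on SimpleTestInputFormat (their string keys are not shown in this example):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.streamsets.pipeline.stage.destination.SimpleTestInputFormat;

public class SimpleTestDriverSketch {
    public static void main(String[] args) throws Exception {
        // Assumption: THROW_EXCEPTION, FILE_LOCATION and FILE_VALUE are
        // public constants on SimpleTestInputFormat.
        Configuration conf = new Configuration();
        conf.setBoolean(SimpleTestInputFormat.THROW_EXCEPTION, false); // flip to true to force failure
        conf.set(SimpleTestInputFormat.FILE_LOCATION, "/tmp/mr-test-marker");
        conf.set(SimpleTestInputFormat.FILE_VALUE, "job-ran");
        Job job = Job.getInstance(conf, "simple-test");
        job.setInputFormatClass(SimpleTestInputFormat.class);
    }
}
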
From source file: com.telefonica.iot.tidoop.apiext.hadoop.ckan.CKANInputFormat.java
License: Open Source License

@Override
public List<InputSplit> getSplits(JobContext job) {
    // create a CKAN backend
    String ckanHost = job.getConfiguration().get(INPUT_CKAN_HOST);
    String ckanPort = job.getConfiguration().get(INPUT_CKAN_PORT);
    boolean ckanSSL = job.getConfiguration().get(INPUT_CKAN_SSL).equals("true");
    String ckanAPIKey = job.getConfiguration().get(INPUT_CKAN_API_KEY);
    int splitsLength = Integer.parseInt(job.getConfiguration().get(INPUT_CKAN_SPLITS_LENGTH));
    logger.info("Getting splits, the backend is at " + (ckanSSL ? "https://" : "http://") + ckanHost + ":"
            + ckanPort + " (API key=" + ckanAPIKey + ")");

    if (backend == null) {
        backend = new CKANBackend(ckanHost, ckanPort, ckanSSL, ckanAPIKey, splitsLength);
    } // if

    // resulting splits container
    List<InputSplit> splits = new ArrayList<InputSplit>();

    // get the Job configuration
    Configuration conf = job.getConfiguration();

    // get the inputs, i.e. the list of CKAN URLs
    String input = conf.get(INPUT_CKAN_URLS, "");
    String[] ckanURLs = StringUtils.split(input);

    // iterate on the CKAN URLs; they may refer to whole organizations,
    // packages/datasets or specific resources
    for (String ckanURL : ckanURLs) {
        if (isCKANOrg(ckanURL)) {
            logger.info("Getting splits for " + ckanURL + ", it is an organization");
            splits.addAll(getSplitsOrg(ckanURL, job.getConfiguration()));
        } else if (isCKANPkg(ckanURL)) {
            logger.info("Getting splits for " + ckanURL + ", it is a package/dataset");
            splits.addAll(getSplitsPkg(ckanURL, job.getConfiguration()));
        } else {
            logger.info("Getting splits for " + ckanURL + ", it is a resource");
            splits.addAll(getSplitsRes(ckanURL, job.getConfiguration()));
        } // if else if
    } // for

    // return the splits
    logger.info("Number of total splits=" + splits.size());
    return splits;
}

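A driver-side sketch of how these properties could be populated before submission. It assumes the INPUT_CKAN_* constants referenced above are publicly visible on CKANInputFormat; the host, key, and URL values are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.telefonica.iot.tidoop.apiext.hadoop.ckan.CKANInputFormat;

public class CKANDriverSketch {
    public static void main(String[] args) throws Exception {
        // Assumption: the INPUT_CKAN_* constants are public on CKANInputFormat.
        Configuration conf = new Configuration();
        conf.set(CKANInputFormat.INPUT_CKAN_HOST, "data.example.org");
        conf.set(CKANInputFormat.INPUT_CKAN_PORT, "443");
        conf.set(CKANInputFormat.INPUT_CKAN_SSL, "true");
        conf.set(CKANInputFormat.INPUT_CKAN_API_KEY, "<api-key>"); // placeholder
        conf.set(CKANInputFormat.INPUT_CKAN_SPLITS_LENGTH, "1000");
        conf.set(CKANInputFormat.INPUT_CKAN_URLS, "https://data.example.org/dataset/example-dataset");
        Job job = Job.getInstance(conf, "ckan-input-example");
        job.setInputFormatClass(CKANInputFormat.class);
    }
}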