Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.JobContext.getConfiguration().

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
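
The method is commonly called inside InputFormat and OutputCommitter implementations to read job-level settings, as the examples in the Usage section show. Below is a minimal sketch of that pattern; the class name ExampleInputFormat and the property key example.input.table are hypothetical placeholders, not part of Hadoop.

import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class ExampleInputFormat extends InputFormat<Object, Object> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // JobContext.getConfiguration() returns the Configuration the job was submitted with,
        // so custom properties set by the driver are visible here.
        Configuration conf = context.getConfiguration();
        String inputTable = conf.get("example.input.table", "default_table"); // hypothetical property
        // ... build and return splits based on the configured values ...
        return Collections.emptyList();
    }

    @Override
    public RecordReader<Object, Object> createRecordReader(InputSplit split, TaskAttemptContext context) {
        throw new UnsupportedOperationException("Omitted in this sketch");
    }
}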

Usage

From source file:com.ery.hadoop.mrddx.hbase.HbaseInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {

    String startRow = this.dbConf.getInputHBaseQueryStartRow();
    String stopRow = this.dbConf.getInputHBaseQueryStopRow();
    String tableName = this.dbConf.getInputTableName();

    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<HRegionLocation> lstHRegionLocation = getTableHRegionInfo(job.getConfiguration(), tableName, startRow,
            stopRow);
    for (int i = 0; i < lstHRegionLocation.size(); i++) {
        HRegionLocation hRegionLocation = lstHRegionLocation.get(i);
        String tempStart = null;
        String tempEnd = null;
        HRegionInfo hRegionInfo = hRegionLocation.getRegionInfo();
        // Case 1: no start or stop row configured; use the region's own key range.
        if (null == startRow && null == stopRow) {
            tempStart = new String(hRegionInfo.getStartKey());
            tempEnd = new String(hRegionInfo.getEndKey());
            HbaseInputSplit split = new HbaseInputSplit(tempStart, tempEnd);
            splits.add(split);
            continue;
        }

        // Case 2: clamp the region's key range to the configured start/stop rows.
        byte[] startKeyByte = hRegionInfo.getStartKey();
        byte[] endKeyByte = hRegionInfo.getEndKey();
        if (null != startRow && hRegionInfo.containsRow(startRow.getBytes())) {
            tempStart = startRow;
        }

        if (null != stopRow && hRegionInfo.containsRow(stopRow.getBytes())) {
            tempEnd = stopRow;
        }

        tempStart = tempStart != null ? tempStart : new String(startKeyByte);
        tempEnd = tempEnd != null ? tempEnd : new String(endKeyByte);
        HbaseInputSplit split = new HbaseInputSplit(tempStart, tempEnd);
        splits.add(split);
    }

    MRLog.info(LOG, "Finished hbase split!");
    return splits;
}
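
The table name, start row, and stop row that dbConf reads above travel through the same Configuration that job.getConfiguration() returns; they are set on the submitting side before the job is launched. A hedged sketch of that driver side, using hypothetical property keys (the real keys are defined by the dbConf helper used in HbaseInputFormat and are not shown in this snippet), might look like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ExampleDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical property keys for illustration only.
        conf.set("example.hbase.input.table", "my_table");
        conf.set("example.hbase.query.start.row", "row-000");
        conf.set("example.hbase.query.stop.row", "row-999");

        Job job = Job.getInstance(conf, "hbase-input-example");
        // job.setInputFormatClass(HbaseInputFormat.class); // the InputFormat shown above
        // ... configure mapper, reducer, and output, then submit ...
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}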

From source file:com.facebook.hiveio.input.HiveApiInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    Configuration conf = jobContext.getConfiguration();
    HiveInputDescription inputDesc = readProfileInputDesc(conf);

    ThriftHiveMetastore.Iface client;
    try {
        client = inputDesc.metastoreClient(conf);
    } catch (TException e) {
        throw new IOException(e);
    }

    return getSplits(conf, inputDesc, client);
}

From source file:com.facebook.hiveio.output.HiveApiOutputCommitter.java

License:Apache License

@Override
public void commitJob(JobContext jobContext) throws IOException {
    baseCommitter.commitJob(jobContext);

    Configuration conf = jobContext.getConfiguration();
    OutputConf outputConf = new OutputConf(conf, profileId);
    HiveOutputDescription outputDesc = outputConf.readOutputDescription();
    OutputInfo outputInfo = outputConf.readOutputTableInfo();
    if (outputInfo.hasPartitionInfo()) {
        registerPartitions(conf, outputDesc, outputInfo);
    } else {
        noPartitionsCopyData(conf, outputInfo);
    }

    writeSuccessFile(conf);
}

From source file:com.facebook.hiveio.output.HiveApiOutputCommitter.java

License:Apache License

@Override
public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException {
    baseCommitter.abortJob(jobContext, state);
    HadoopUtils.deleteOutputDir(jobContext.getConfiguration());
}

From source file:com.facebook.hiveio.output.HiveApiOutputFormat.java

License:Apache License

@Override
public void checkOutputSpecs(JobContext jobContext) throws IOException, InterruptedException {
    Configuration conf = jobContext.getConfiguration();
    OutputConf outputConf = new OutputConf(conf, myProfileId);

    HiveOutputDescription description = outputConf.readOutputDescription();
    OutputInfo oti = outputConf.readOutputTableInfo();
    LOG.info("Check output specs of " + description);

    if (description == null) {
        LOG.error("HiveOutputDescription is null in Configuration, nothing to check");
        return;
    }
    checkTableExists(conf, description);

    if (oti == null) {
        LOG.error("OutputInfo is null in Configuration, nothing to check");
        return;
    }
    checkPartitionInfo(conf, description, oti, outputConf);
}

From source file:com.flipkart.fdp.migration.distcp.core.MirrorFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {

    System.out.println("Calculating Job Splits...");

    conf = context.getConfiguration();
    dcmConfig = MirrorUtils.getConfigFromConf(conf);

    Set<String> excludeList = getExclusionsFileList(conf);
    Set<String> includeList = getInclusionFileList(conf);

    HashMap<String, FileTuple> inputFileMap = new HashMap<String, FileTuple>();
    Map<String, TransferStatus> previousState = null;

    List<FileTuple> fileTuples = null;
    List<InputSplit> splits = new ArrayList<InputSplit>();
    // TODO Bug: OptimTuple does not override hashCode/equals, so this HashSet does not deduplicate; if deduplication is not needed, a List behaves the same.
    Set<OptimTuple> locations = new HashSet<OptimTuple>();

    long totalBatchSize = 0;

    try {

        System.out.println("Scanning source location...");

        dcmInCodec = DCMCodecFactory.getCodec(conf, dcmConfig.getSourceConfig().getDefaultConnectionConfig());

        // If includeList is non-empty, the configured source path is ignored; excludeList is applied in either case.
        if (includeList != null && includeList.size() > 0) {
            fileTuples = dcmInCodec.getInputPaths(includeList, excludeList);
        } else {
            fileTuples = dcmInCodec.getInputPaths(dcmConfig.getSourceConfig().getPath(), excludeList);
        }

        stateManager = StateManagerFactory.getStateManager(conf, dcmConfig);

        System.out.println("Fetching previous transfer states from StateManager...");
        previousState = stateManager.getPreviousTransferStatus();

        System.out.println("Filtering Input File Set based on User defined filters.");
        for (FileTuple fileTuple : fileTuples) {

            if (!ignoreFile(fileTuple, excludeList, previousState)) {

                locations.add(new OptimTuple(fileTuple.getFileName(), fileTuple.getSize()));
                inputFileMap.put(fileTuple.getFileName(), fileTuple);
            }
        }

        System.out.println("Optimizing Splits...");

        WorkloadOptimizer optimizer = DCMCodecFactory
                .getCodecWorkloadOptimizer(dcmConfig.getSinkConfig().getDefaultConnectionConfig());
        splits.addAll(optimizer.optimizeWorkload(dcmConfig, locations, inputFileMap));

        sortSplits(splits);
        System.out.println(
                "Total input paths to process: " + locations.size() + ", Total input splits: " + splits.size());
        System.out.println("Total Data to Transfer: " + totalBatchSize);

        stateManager.savePreviousTransferStatus(previousState);
    } catch (Exception e) {
        throw new IOException(e);
    }

    System.out.println("Done Calculating splits...");
    return splits;
}

From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java

License:Apache License

@Override
protected List<FileStatus> listStatus(final JobContext job) throws IOException {

    final List<FileStatus> files = Lists.newArrayList();

    for (FileStatus file : super.listStatus(job)) {
        files.addAll(handleFile(file, job));
    }

    LOG.debug("Initial file list: {} {}", files.size(), files);

    final Configuration configuration = job.getConfiguration();

    for (Iterator<FileStatus> iterator = files.iterator(); iterator.hasNext();) {
        final FileStatus fileStatus = iterator.next();
        final Path file = fileStatus.getPath();
        final FileSystem fs = file.getFileSystem(configuration);

        if (!SSTablePredicates.IS_SSTABLE.apply(file.toString())) {
            // Ignore non-sstable data files, always (for now)
            LOG.debug("Removing non-sstable file: {}", file);
            iterator.remove();
        } else {
            // read the index file
            LOG.debug("Reading index file for sstable file: {}", file);

            final Path indexFile = SSTableFunctions.INDEX_FILE.apply(file);

            LOG.debug("Reading index file: {}", indexFile);

            final SSTableIndexIndex index = SSTableIndexIndex.readIndex(fs, indexFile);
            indexes.put(file, index);
        }
    }

    LOG.debug("Final file list: {} {}", files.size(), files);

    return files;
}

From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java

License:Apache License

/**
 * If we have a directory recursively gather the files we care about for this job.
 *
 * @param file Root file/directory.
 * @param job Job context.
 * @return All files we care about.
 * @throws IOException
 */
private Collection<FileStatus> handleFile(final FileStatus file, final JobContext job) throws IOException {
    final List<FileStatus> results = Lists.newArrayList();

    if (file.isDir()) {
        final Path p = file.getPath();
        LOG.debug("Expanding {}", p);
        final FileSystem fs = p.getFileSystem(job.getConfiguration());
        final FileStatus[] children = fs.listStatus(p);
        for (FileStatus child : children) {
            results.addAll(handleFile(child, job));
        }
    } else {
        results.add(file);
    }

    return results;
}

From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(final JobContext job) throws IOException {
    final Configuration configuration = job.getConfiguration();

    final List<InputSplit> result = Lists.newArrayList();

    final List<FileStatus> files = listStatus(job);

    LOG.debug("Initial file list: {} {}", files.size(), files);

    for (final FileStatus fileStatus : files) {
        final Path dataFile = fileStatus.getPath();
        final FileSystem fileSystem = dataFile.getFileSystem(configuration);
        final BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(fileStatus, 0,
                fileStatus.getLen());

        // Data file, try to split if the .index file was found
        final SSTableIndexIndex index = indexes.get(dataFile);
        if (index == null) {
            throw new IOException("Index not found for " + dataFile);
        }

        for (final SSTableIndexIndex.Chunk chunk : index.getOffsets()) {
            // This isn't likely to work well because we are dealing with the index into uncompressed data...
            final int blockIndex = getBlockIndex(blockLocations,
                    chunk.getStart() / COMPRESSION_RATIO_ASSUMPTION);
            final SSTableSplit split = new SSTableSplit(dataFile, chunk.getStart(), chunk.getEnd(),
                    chunk.getEnd() - chunk.getStart(), blockLocations[blockIndex].getHosts());
            result.add(split);
        }
    }

    LOG.debug("Splits calculated: {} {}", result.size(), result);

    return result;
}

From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapreduce.GFInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    this.conf = job.getConfiguration();

    Collection<FileStatus> hoplogs = getHoplogs();
    return createSplits(hoplogs);
}