Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.JobContext.getConfiguration().

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
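
The method is commonly called inside InputFormat and OutputCommitter implementations to read job-level settings, as the examples in the Usage section show. Below is a minimal sketch of that pattern; the class name ExampleInputFormat and the property key example.input.table are hypothetical placeholders, not part of Hadoop.

import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class ExampleInputFormat extends InputFormat<Object, Object> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // JobContext.getConfiguration() returns the Configuration the job was submitted with,
        // so custom properties set by the driver are visible here.
        Configuration conf = context.getConfiguration();
        String inputTable = conf.get("example.input.table", "default_table"); // hypothetical property
        // ... build and return splits based on the configured values ...
        return Collections.emptyList();
    }

    @Override
    public RecordReader<Object, Object> createRecordReader(InputSplit split, TaskAttemptContext context) {
        throw new UnsupportedOperationException("Omitted in this sketch");
    }
}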

Usage

From source file:com.ery.hadoop.mrddx.hbase.HbaseInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {

    String startRow = this.dbConf.getInputHBaseQueryStartRow();
    String stopRow = this.dbConf.getInputHBaseQueryStopRow();
    String tableName = this.dbConf.getInputTableName();

    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<HRegionLocation> lstHRegionLocation = getTableHRegionInfo(job.getConfiguration(), tableName, startRow,
            stopRow);
    for (int i = 0; i < lstHRegionLocation.size(); i++) {
        HRegionLocation hRegionLocation = lstHRegionLocation.get(i);
        String tempStart = null;
        String tempEnd = null;
        HRegionInfo hRegionInfo = hRegionLocation.getRegionInfo();
        // Case 1: no start or stop row configured; use the region's own key range.
        if (null == startRow && null == stopRow) {
            tempStart = new String(hRegionInfo.getStartKey());
            tempEnd = new String(hRegionInfo.getEndKey());
            HbaseInputSplit split = new HbaseInputSplit(tempStart, tempEnd);
            splits.add(split);
            continue;
        }

        // Case 2: clamp the region's key range to the configured start/stop rows.
        byte[] startKeyByte = hRegionInfo.getStartKey();
        byte[] endKeyByte = hRegionInfo.getEndKey();
        if (null != startRow && hRegionInfo.containsRow(startRow.getBytes())) {
            tempStart = startRow;
        }

        if (null != stopRow && hRegionInfo.containsRow(stopRow.getBytes())) {
            tempEnd = stopRow;
        }

        tempStart = tempStart != null ? tempStart : new String(startKeyByte);
        tempEnd = tempEnd != null ? tempEnd : new String(endKeyByte);
        HbaseInputSplit split = new HbaseInputSplit(tempStart, tempEnd);
        splits.add(split);
    }

    MRLog.info(LOG, "Finished hbase split!");
    return splits;
}
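
The table name, start row, and stop row that dbConf reads above travel through the same Configuration that job.getConfiguration() returns; they are set on the submitting side before the job is launched. A hedged sketch of that driver side, using hypothetical property keys (the real keys are defined by the dbConf helper used in HbaseInputFormat and are not shown in this snippet), might look like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ExampleDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical property keys for illustration only.
        conf.set("example.hbase.input.table", "my_table");
        conf.set("example.hbase.query.start.row", "row-000");
        conf.set("example.hbase.query.stop.row", "row-999");

        Job job = Job.getInstance(conf, "hbase-input-example");
        // job.setInputFormatClass(HbaseInputFormat.class); // the InputFormat shown above
        // ... configure mapper, reducer, and output, then submit ...
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}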

From source file:com.facebook.hiveio.input.HiveApiInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    Configuration conf = jobContext.getConfiguration();
    HiveInputDescription inputDesc = readProfileInputDesc(conf);

    ThriftHiveMetastore.Iface client;
    try {
        client = inputDesc.metastoreClient(conf);
    } catch (TException e) {
        throw new IOException(e);
    }

    return getSplits(conf, inputDesc, client);
}

From source file:com.facebook.hiveio.output.HiveApiOutputCommitter.java

License:Apache License

@Override
public void commitJob(JobContext jobContext) throws IOException {
    baseCommitter.commitJob(jobContext);

    Configuration conf = jobContext.getConfiguration();
    OutputConf outputConf = new OutputConf(conf, profileId);
    HiveOutputDescription outputDesc = outputConf.readOutputDescription();
    OutputInfo outputInfo = outputConf.readOutputTableInfo();
    if (outputInfo.hasPartitionInfo()) {
        registerPartitions(conf, outputDesc, outputInfo);
    } else {
        noPartitionsCopyData(conf, outputInfo);
    }

    writeSuccessFile(conf);
}

From source file:com.facebook.hiveio.output.HiveApiOutputCommitter.java

License:Apache License

@Override
public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException {
    baseCommitter.abortJob(jobContext, state);
    HadoopUtils.deleteOutputDir(jobContext.getConfiguration());
}

From source file:com.facebook.hiveio.output.HiveApiOutputFormat.java

License:Apache License

@Override
public void checkOutputSpecs(JobContext jobContext) throws IOException, InterruptedException {
    Configuration conf = jobContext.getConfiguration();
    OutputConf outputConf = new OutputConf(conf, myProfileId);

    HiveOutputDescription description = outputConf.readOutputDescription();
    OutputInfo oti = outputConf.readOutputTableInfo();
    LOG.info("Check output specs of " + description);

    if (description == null) {
        LOG.error("HiveOutputDescription is null in Configuration, nothing to check");
        return;
    }
    checkTableExists(conf, description);

    if (oti == null) {
        LOG.error("OutputInfo is null in Configuration, nothing to check");
        return;
    }
    checkPartitionInfo(conf, description, oti, outputConf);
}

From source file:com.flipkart.fdp.migration.distcp.core.MirrorFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {

    System.out.println("Calculating Job Splits...");

    conf = context.getConfiguration();
    dcmConfig = MirrorUtils.getConfigFromConf(conf);

    Set<String> excludeList = getExclusionsFileList(conf);
    Set<String> includeList = getInclusionFileList(conf);

    HashMap<String, FileTuple> inputFileMap = new HashMap<String, FileTuple>();
    Map<String, TransferStatus> previousState = null;

    List<FileTuple> fileTuples = null;
    List<InputSplit> splits = new ArrayList<InputSplit>();
    // TODO Bug: OptimTuple does not override hashCode/equals, so this HashSet does not deduplicate; if deduplication is not needed, a List behaves the same.
    Set<OptimTuple> locations = new HashSet<OptimTuple>();

    long totalBatchSize = 0;

    try {

        System.out.println("Scanning source location...");

        dcmInCodec = DCMCodecFactory.getCodec(conf, dcmConfig.getSourceConfig().getDefaultConnectionConfig());

        // If includeList is non-empty, the configured source path is ignored; excludeList is applied in either case.
        if (includeList != null && includeList.size() > 0) {
            fileTuples = dcmInCodec.getInputPaths(includeList, excludeList);
        } else {
            fileTuples = dcmInCodec.getInputPaths(dcmConfig.getSourceConfig().getPath(), excludeList);
        }

        stateManager = StateManagerFactory.getStateManager(conf, dcmConfig);

        System.out.println("Fetching previous transfer states from StateManager...");
        previousState = stateManager.getPreviousTransferStatus();

        System.out.println("Filtering Input File Set based on User defined filters.");
        for (FileTuple fileTuple : fileTuples) {

            if (!ignoreFile(fileTuple, excludeList, previousState)) {

                locations.add(new OptimTuple(fileTuple.getFileName(), fileTuple.getSize()));
                inputFileMap.put(fileTuple.getFileName(), fileTuple);
            }
        }

        System.out.println("Optimizing Splits...");

        WorkloadOptimizer optimizer = DCMCodecFactory
                .getCodecWorkloadOptimizer(dcmConfig.getSinkConfig().getDefaultConnectionConfig());
        splits.addAll(optimizer.optimizeWorkload(dcmConfig, locations, inputFileMap));

        sortSplits(splits);
        System.out.println(
                "Total input paths to process: " + locations.size() + ", Total input splits: " + splits.size());
        System.out.println("Total Data to Transfer: " + totalBatchSize);

        stateManager.savePreviousTransferStatus(previousState);
    } catch (Exception e) {
        throw new IOException(e);
    }

    System.out.println("Done Calculating splits...");
    return splits;
}

From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java

License:Apache License

@Override
protected List<FileStatus> listStatus(final JobContext job) throws IOException {

    final List<FileStatus> files = Lists.newArrayList();

    for (FileStatus file : super.listStatus(job)) {
        files.addAll(handleFile(file, job));
    }

    LOG.debug("Initial file list: {} {}", files.size(), files);

    final Configuration configuration = job.getConfiguration();

    for (Iterator<FileStatus> iterator = files.iterator(); iterator.hasNext();) {
        final FileStatus fileStatus = iterator.next();
        final Path file = fileStatus.getPath();
        final FileSystem fs = file.getFileSystem(configuration);

        if (!SSTablePredicates.IS_SSTABLE.apply(file.toString())) {
            // Ignore non-sstable data files, always (for now)
            LOG.debug("Removing non-sstable file: {}", file);
            iterator.remove();
        } else {
            // read the index file
            LOG.debug("Reading index file for sstable file: {}", file);

            final Path indexFile = SSTableFunctions.INDEX_FILE.apply(file);

            LOG.debug("Reading index file: {}", indexFile);

            final SSTableIndexIndex index = SSTableIndexIndex.readIndex(fs, indexFile);
            indexes.put(file, index);
        }
    }

    LOG.debug("Final file list: {} {}", files.size(), files);

    return files;
}

From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java

License:Apache License

/**
 * If we have a directory recursively gather the files we care about for this job.
 *
 * @param file Root file/directory.
 * @param job Job context.
 * @return All files we care about.
 * @throws IOException
 */
private Collection<FileStatus> handleFile(final FileStatus file, final JobContext job) throws IOException {
    final List<FileStatus> results = Lists.newArrayList();

    if (file.isDir()) {
        final Path p = file.getPath();
        LOG.debug("Expanding {}", p);
        final FileSystem fs = p.getFileSystem(job.getConfiguration());
        final FileStatus[] children = fs.listStatus(p);
        for (FileStatus child : children) {
            results.addAll(handleFile(child, job));
        }
    } else {
        results.add(file);
    }

    return results;
}

From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(final JobContext job) throws IOException {
    final Configuration configuration = job.getConfiguration();

    final List<InputSplit> result = Lists.newArrayList();

    final List<FileStatus> files = listStatus(job);

    LOG.debug("Initial file list: {} {}", files.size(), files);

    for (final FileStatus fileStatus : files) {
        final Path dataFile = fileStatus.getPath();
        final FileSystem fileSystem = dataFile.getFileSystem(configuration);
        final BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(fileStatus, 0,
                fileStatus.getLen());

        // Data file, try to split if the .index file was found
        final SSTableIndexIndex index = indexes.get(dataFile);
        if (index == null) {
            throw new IOException("Index not found for " + dataFile);
        }

        for (final SSTableIndexIndex.Chunk chunk : index.getOffsets()) {
            // This isn't likely to work well because we are dealing with the index into uncompressed data...
            final int blockIndex = getBlockIndex(blockLocations,
                    chunk.getStart() / COMPRESSION_RATIO_ASSUMPTION);
            final SSTableSplit split = new SSTableSplit(dataFile, chunk.getStart(), chunk.getEnd(),
                    chunk.getEnd() - chunk.getStart(), blockLocations[blockIndex].getHosts());
            result.add(split);
        }
    }

    LOG.debug("Splits calculated: {} {}", result.size(), result);

    return result;
}

From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.mapreduce.GFInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    this.conf = job.getConfiguration();

    Collection<FileStatus> hoplogs = getHoplogs();
    return createSplits(hoplogs);
}