Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce JobContext getConfiguration.

Prototype

public Configuration getConfiguration();

Source Link

Document

Return the configuration for the job.
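
A minimal calling sketch for orientation (an illustration, not taken from the examples below); the property key my.example.threshold and the class name are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;

public class ConfigurationProbe {
    // Reads a custom job property; "my.example.threshold" is a placeholder key.
    public static int readThreshold(JobContext context) {
        Configuration conf = context.getConfiguration();
        return conf.getInt("my.example.threshold", 100);
    }
}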

Usage

From source file:ml.shifu.guagua.mapreduce.GuaguaInputFormat.java

License:Apache License

/**
 * Split building logic, including master split setup; also supports combining small input splits, similar to Pig.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> newSplits = null;
    boolean combinable = job.getConfiguration().getBoolean(GuaguaConstants.GUAGUA_SPLIT_COMBINABLE, false);
    if (combinable) {
        @SuppressWarnings("deprecation")
        // use this deprecated method so the code still works on Hadoop 0.20.2
        long blockSize = FileSystem.get(job.getConfiguration()).getDefaultBlockSize();
        long combineSize = job.getConfiguration().getLong(GuaguaConstants.GUAGUA_SPLIT_MAX_COMBINED_SPLIT_SIZE,
                blockSize);
        if (combineSize == 0) {
            combineSize = blockSize;
        }
        job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MIN_SPLIT_SIZE, 1L);
        job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MAX_SPLIT_SIZE, combineSize);
        List<InputSplit> splits = super.getSplits(job);
        LOG.debug("combine size:{}, splits:{}", combineSize, splits);
        newSplits = getFinalCombineGuaguaSplits(splits, combineSize);
    } else {
        newSplits = getGuaguaSplits(job);
    }
    int masters = job.getConfiguration().getInt(GuaguaConstants.GUAGUA_MASTER_NUMBER,
            GuaguaConstants.DEFAULT_MASTER_NUMBER);
    for (int i = 0; i < masters; i++) {
        newSplits.add(new GuaguaInputSplit(true, (FileSplit) null));
    }
    int mapperSize = newSplits.size();
    LOG.info("Input size including master: {}", mapperSize);
    LOG.debug("input splits: {}", newSplits);
    job.getConfiguration().set(GuaguaConstants.GUAGUA_WORKER_NUMBER, (mapperSize - masters) + "");
    return newSplits;
}
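
For context, a hedged driver-side sketch of how the split combining read above might be enabled before job submission; the job name and the 256 MB cap are illustrative assumptions, and only the GuaguaConstants keys and GuaguaInputFormat come from the example.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// Imports of GuaguaConstants and GuaguaInputFormat from the Guagua project are assumed.
public class GuaguaSplitCombineDriver {
    public static Job configure() throws IOException {
        Job job = Job.getInstance(new Configuration(), "guagua-example");
        // Turn on the combining branch of GuaguaInputFormat#getSplits and cap
        // combined splits at roughly 256 MB (an illustrative value).
        job.getConfiguration().setBoolean(GuaguaConstants.GUAGUA_SPLIT_COMBINABLE, true);
        job.getConfiguration().setLong(GuaguaConstants.GUAGUA_SPLIT_MAX_COMBINED_SPLIT_SIZE, 256L * 1024 * 1024);
        job.setInputFormatClass(GuaguaInputFormat.class);
        return job;
    }
}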

From source file:ml.shifu.guagua.mapreduce.GuaguaInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 */
protected List<InputSplit> getGuaguaSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > GuaguaMapReduceConstants.SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new GuaguaInputSplit(false, new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts())));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new GuaguaInputSplit(false, new FileSplit(path, length - bytesRemaining,
                        bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())));
            }
        } else if (length != 0) {
            splits.add(new GuaguaInputSplit(false, new FileSplit(path, 0, length, blkLocations[0].getHosts())));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new GuaguaInputSplit(false, new FileSplit(path, 0, length, new String[0])));
        }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(GuaguaMapReduceConstants.NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}

From source file:ml.shifu.shifu.core.correlation.CorrelationMultithreadedMapper.java

License:Apache License

/**
 * The number of threads in the thread pool that will run the map function.
 *
 * @param job
 *            the job
 * @return the number of threads
 */
public static int getNumberOfThreads(JobContext job) {
    return job.getConfiguration().getInt(NUM_THREADS, 10);
}
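
A matching setter is the natural counterpart (mirroring Hadoop's own MultithreadedMapper); whether CorrelationMultithreadedMapper exposes exactly this method is an assumption, so treat it as a sketch meant to live in the same class.

public static void setNumberOfThreads(Job job, int threads) {
    // Writes the same NUM_THREADS key that getNumberOfThreads(JobContext) reads above.
    job.getConfiguration().setInt(NUM_THREADS, threads);
}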

From source file:ml.shifu.shifu.core.correlation.CorrelationMultithreadedMapper.java

License:Apache License

/**
 * Get the application's mapper class.
 * 
 * @param <K1>
 *            the map's input key type
 * @param <V1>
 *            the map's input value type
 * @param <K2>
 *            the map's output key type
 * @param <V2>
 *            the map's output value type
 * @param job
 *            the job
 * @return the mapper class to run
 */
@SuppressWarnings("unchecked")
public static <K1, V1, K2, V2> Class<Mapper<K1, V1, K2, V2>> getMapperClass(JobContext job) {
    return (Class<Mapper<K1, V1, K2, V2>>) job.getConfiguration().getClass(MAP_CLASS, Mapper.class);
}
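
Likewise, a hedged sketch of the companion setter that would populate MAP_CLASS (modeled on Hadoop's MultithreadedMapper#setMapperClass); its presence in CorrelationMultithreadedMapper is an assumption.

public static <K1, V1, K2, V2> void setMapperClass(Job job, Class<? extends Mapper<K1, V1, K2, V2>> cls) {
    // Stores the concrete mapper under the same MAP_CLASS key read by getMapperClass above.
    job.getConfiguration().setClass(MAP_CLASS, cls, Mapper.class);
}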

From source file:ml.shifu.shifu.core.mr.input.CombineInputFormat.java

License:Apache License

/**
 * Split building logic that combines small input splits, similar to Pig.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> newSplits = null;
    boolean combinable = job.getConfiguration().getBoolean(SHIFU_VS_SPLIT_COMBINABLE, false);
    if (combinable) {
        @SuppressWarnings("deprecation")
        // use this deprecated method so the code still works on Hadoop 0.20.2
        long blockSize = FileSystem.get(job.getConfiguration()).getDefaultBlockSize();
        long combineSize = job.getConfiguration().getLong(SHIFU_VS_SPLIT_MAX_COMBINED_SPLIT_SIZE,
                blockSize * 2);
        if (combineSize == 0) {
            combineSize = blockSize;
        }
        job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MIN_SPLIT_SIZE, 1L);
        job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MAX_SPLIT_SIZE, combineSize);
        List<InputSplit> splits = super.getSplits(job);
        LOG.debug("combine size:{}, splits:{}", combineSize, splits);
        newSplits = getFinalCombineVarSelectSplits(splits, combineSize);
    } else {
        newSplits = getVarSelectSplits(job);
    }
    LOG.info("Input size: {}", newSplits.size());
    return newSplits;
}

From source file:ml.shifu.shifu.core.mr.input.CombineInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 */
protected List<InputSplit> getVarSelectSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            // a double comparison is fine here since no precision is required
            while (((double) bytesRemaining) / splitSize > 1.1d) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new CombineInputSplit(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts())));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new CombineInputSplit(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts())));
            }
        } else if (length != 0) {
            splits.add(new CombineInputSplit(new FileSplit(path, 0, length, blkLocations[0].getHosts())));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new CombineInputSplit(new FileSplit(path, 0, length, new String[0])));
        }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(GuaguaMapReduceConstants.NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: {}", splits.size());
    return splits;
}

From source file:ml.shifu.shifu.core.mr.input.CombineInputFormat.java

License:Apache License

@Override
protected boolean isSplitable(JobContext context, Path file) {
    // All compression types set here non split. For bz or bz2, think about how to do combine.
    CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    return codec == null;
}

From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java

License:Apache License

/**
 * Builds splits via the parent class and, when a cross-validation directory is configured, appends the
 * corresponding validation splits.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> newSplits = super.getSplits(job);
    String testDirs = job.getConfiguration().get("shifu.crossValidation.dir", "");
    LOG.info("Validation dir is {};", testDirs);
    if (org.apache.commons.lang.StringUtils.isNotBlank(testDirs)) {
        this.addCrossValidationDataset(newSplits, job);
    }
    return newSplits;
}
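
A hedged sketch of the driver-side wiring this method expects; the validation path is a placeholder, and only the property name shifu.crossValidation.dir comes from the code above.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ValidationDirSetup {
    public static Job configure() throws IOException {
        Job job = Job.getInstance(new Configuration(), "shifu-validation-example");
        // "/user/shifu/validation" is a placeholder path; when set, getSplits above
        // appends cross-validation splits to the regular training splits.
        job.getConfiguration().set("shifu.crossValidation.dir", "/user/shifu/validation");
        return job;
    }
}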

From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java

License:Apache License

protected List<List<FileSplit>> getCrossValidationSplits(JobContext job, int count) throws IOException {
    LOG.debug("Split validation with count: {}", count);
    List<FileStatus> files = listCrossValidationStatus(job);
    List<FileSplit> current = new ArrayList<FileSplit>();
    List<List<FileSplit>> validationList = new ArrayList<List<FileSplit>>();
    long lengthSum = 0L;
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        lengthSum += file.getLen();
    }
    long size = lengthSum / count + 1;
    long remaining = 0L;
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long offset = 0L;
        long length = file.getLen();
        if (length + remaining >= size) {
            long cut = (size - remaining) >= length ? length : (size - remaining);
            current.add(getFileSplit(fs, file, offset, cut));
            offset = cut;
            remaining = length - cut;
            validationList.add(current);
            current = new ArrayList<FileSplit>();
            while (remaining >= size) {
                current.add(getFileSplit(fs, file, offset, size));
                validationList.add(current);
                current = new ArrayList<FileSplit>();
                remaining -= size;
                offset += size;
            }
            if (remaining > 0) {
                current.add(getFileSplit(fs, file, offset, remaining));
            }

        } else {
            current.add(getFileSplit(fs, file, 0, length));
            remaining += length;
        }
    }
    if (current.size() > 0) {
        validationList.add(current);
    }

    LOG.debug("Total # of validationList: {}", validationList.size());
    return validationList;
}

From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java

License:Apache License

@SuppressWarnings("deprecation")
protected List<FileStatus> listCrossValidationStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // Whether we need to recursively look into the directory structure
    boolean recursive = job.getConfiguration().getBoolean("mapreduce.input.fileinputformat.input.dir.recursive",
            false);

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    FileStatus[] fss = fs.listStatus(globStat.getPath());
                    for (FileStatus fileStatus : fss) {
                        if (inputFilter.accept(fileStatus.getPath())) {
                            if (recursive && fileStatus.isDir()) {
                                addInputPathRecursive(result, fs, fileStatus.getPath(), inputFilter);
                            } else {
                                result.add(fileStatus);
                            }
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total validation paths to process : " + result.size());
    return result;
}
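
For completeness, a hedged sketch of switching on the recursive listing checked above; only the property key mapreduce.input.fileinputformat.input.dir.recursive comes from the code, the rest is illustrative.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class RecursiveInputSetup {
    public static Job configure() throws IOException {
        Job job = Job.getInstance(new Configuration(), "recursive-input-example");
        // Same key as read in listCrossValidationStatus above; nested directories
        // under the input paths will then be scanned as well.
        job.getConfiguration().setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
        return job;
    }
}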