Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usages of the getConfiguration method of org.apache.hadoop.mapreduce JobContext.

Prototype

public Configuration getConfiguration();

Source Link

Document

Return the configuration for the job.

Usage

From source file:ml.shifu.guagua.mapreduce.GuaguaInputFormat.java

License:Apache License

/**
 * Builds the input splits for the job and appends the master splits.
 *
 * <p>
 * When {@code GuaguaConstants.GUAGUA_SPLIT_COMBINABLE} is set, the splits produced by the parent input format are
 * combined (similar to Pig's combined input feature) up to the configured maximum combined split size; otherwise
 * the splits come from {@code getGuaguaSplits(job)}. Afterwards one {@code GuaguaInputSplit} constructed with
 * {@code true} and a {@code null} file split is appended per configured master, and the count of the remaining
 * splits is stored in the job configuration under {@code GuaguaConstants.GUAGUA_WORKER_NUMBER}.
 *
 * @param job
 *            the job context whose configuration is read and updated
 * @return the computed splits followed by the master splits
 * @throws IOException
 *             if the file system or the input files cannot be accessed
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    final List<InputSplit> allSplits;
    if (job.getConfiguration().getBoolean(GuaguaConstants.GUAGUA_SPLIT_COMBINABLE, false)) {
        // The deprecated no-argument getDefaultBlockSize() is used on purpose so this works on Hadoop 0.20.2.
        @SuppressWarnings("deprecation")
        long defaultBlockSize = FileSystem.get(job.getConfiguration()).getDefaultBlockSize();
        long maxCombinedSize = job.getConfiguration()
                .getLong(GuaguaConstants.GUAGUA_SPLIT_MAX_COMBINED_SPLIT_SIZE, defaultBlockSize);
        if (maxCombinedSize == 0) {
            // An explicit 0 falls back to the default block size.
            maxCombinedSize = defaultBlockSize;
        }
        // Let the parent format cut splits between 1 byte and the combine size before they are merged.
        job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MIN_SPLIT_SIZE, 1L);
        job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MAX_SPLIT_SIZE, maxCombinedSize);
        List<InputSplit> rawSplits = super.getSplits(job);
        LOG.debug("combine size:{}, splits:{}", maxCombinedSize, rawSplits);
        allSplits = getFinalCombineGuaguaSplits(rawSplits, maxCombinedSize);
    } else {
        allSplits = getGuaguaSplits(job);
    }

    final int masterCount = job.getConfiguration().getInt(GuaguaConstants.GUAGUA_MASTER_NUMBER,
            GuaguaConstants.DEFAULT_MASTER_NUMBER);
    for (int pending = masterCount; pending > 0; pending--) {
        allSplits.add(new GuaguaInputSplit(true, (FileSplit) null));
    }

    final int totalSplits = allSplits.size();
    LOG.info("Input size including master: {}", totalSplits);
    LOG.debug("input splits: {}", allSplits);
    // Everything that is not a master split counts as a worker.
    job.getConfiguration().set(GuaguaConstants.GUAGUA_WORKER_NUMBER, String.valueOf(totalSplits - masterCount));
    return allSplits;
}

From source file:ml.shifu.guagua.mapreduce.GuaguaInputFormat.java

License:Apache License

/**
 * Lists the job's input files and wraps each resulting {@code FileSplit} in a {@code GuaguaInputSplit}
 * constructed with {@code false} as its first argument.
 *
 * <p>
 * Files for which {@code isPigOrHadoopMetaFile} returns true are skipped. Non-empty splittable files are cut into
 * chunks of the computed split size, non-empty unsplittable files become a single split, and empty files become a
 * single zero-length split without hosts. The number of listed files (including skipped ones) is stored under
 * {@code GuaguaMapReduceConstants.NUM_INPUT_FILES}.
 *
 * @param job
 *            the job context
 * @return the generated splits
 * @throws IOException
 *             if listing files or reading block locations fails
 */
protected List<InputSplit> getGuaguaSplits(JobContext job) throws IOException {
    final long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    final long maxSize = getMaxSplitSize(job);

    final List<InputSplit> result = new ArrayList<InputSplit>();
    final List<FileStatus> inputFiles = listStatus(job);
    for (FileStatus status : inputFiles) {
        final Path filePath = status.getPath();
        if (isPigOrHadoopMetaFile(filePath)) {
            continue;
        }
        final FileSystem fileSystem = filePath.getFileSystem(job.getConfiguration());
        final long fileLength = status.getLen();
        final BlockLocation[] locations = fileSystem.getFileBlockLocations(status, 0, fileLength);

        if (fileLength == 0) {
            // Zero-length file: emit an empty split with an empty hosts array.
            result.add(new GuaguaInputSplit(false, new FileSplit(filePath, 0, fileLength, new String[0])));
            continue;
        }
        if (!isSplitable(job, filePath)) {
            // Unsplittable file: one split covering the whole file, placed on the first block's hosts.
            result.add(new GuaguaInputSplit(false,
                    new FileSplit(filePath, 0, fileLength, locations[0].getHosts())));
            continue;
        }

        final long splitSize = computeSplitSize(status.getBlockSize(), minSize, maxSize);
        long remaining = fileLength;
        // Keep cutting full-size splits while the rest is larger than SPLIT_SLOP times the split size.
        for (; ((double) remaining) / splitSize > GuaguaMapReduceConstants.SPLIT_SLOP; remaining -= splitSize) {
            final long start = fileLength - remaining;
            final int blockIndex = getBlockIndex(locations, start);
            result.add(new GuaguaInputSplit(false,
                    new FileSplit(filePath, start, splitSize, locations[blockIndex].getHosts())));
        }
        if (remaining != 0) {
            // The tail is placed on the hosts of the last block.
            result.add(new GuaguaInputSplit(false, new FileSplit(filePath, fileLength - remaining, remaining,
                    locations[locations.length - 1].getHosts())));
        }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(GuaguaMapReduceConstants.NUM_INPUT_FILES, inputFiles.size());

    LOG.debug("Total # of splits: {}", result.size());
    return result;
}

From source file:ml.shifu.shifu.core.correlation.CorrelationMultithreadedMapper.java

License:Apache License

/**
 * The number of threads in the thread pool that will run the map function.
 * /* w  ww .ja v a  2  s .c om*/
 * @param job
 *            the job
 * @return the number of threads
 */
public static int getNumberOfThreads(JobContext job) {
    return job.getConfiguration().getInt(NUM_THREADS, 10);
}

From source file:ml.shifu.shifu.core.correlation.CorrelationMultithreadedMapper.java

License:Apache License

/**
 * Get the application's mapper class./*  w ww  . j  a  va  2  s .co m*/
 * 
 * @param <K1>
 *            the map's input key type
 * @param <V1>
 *            the map's input value type
 * @param <K2>
 *            the map's output key type
 * @param <V2>
 *            the map's output value type
 * @param job
 *            the job
 * @return the mapper class to run
 */
@SuppressWarnings("unchecked")
public static <K1, V1, K2, V2> Class<Mapper<K1, V1, K2, V2>> getMapperClass(JobContext job) {
    return (Class<Mapper<K1, V1, K2, V2>>) job.getConfiguration().getClass(MAP_CLASS, Mapper.class);
}

From source file:ml.shifu.shifu.core.mr.input.CombineInputFormat.java

License:Apache License

/**
 * Builds the input splits for the job, optionally combining them (similar to Pig's combined input feature).
 *
 * <p>
 * When {@code SHIFU_VS_SPLIT_COMBINABLE} is set, the splits produced by the parent input format are combined up
 * to the configured maximum combined split size; otherwise the splits come from {@code getVarSelectSplits(job)}.
 *
 * @param job
 *            the job context whose configuration is read and, in the combinable case, updated
 * @return the computed splits
 * @throws IOException
 *             if the file system or the input files cannot be accessed
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    final List<InputSplit> result;
    if (job.getConfiguration().getBoolean(SHIFU_VS_SPLIT_COMBINABLE, false)) {
        // The deprecated no-argument getDefaultBlockSize() is used on purpose so this works on Hadoop 0.20.2.
        @SuppressWarnings("deprecation")
        long defaultBlockSize = FileSystem.get(job.getConfiguration()).getDefaultBlockSize();
        long maxCombinedSize = job.getConfiguration().getLong(SHIFU_VS_SPLIT_MAX_COMBINED_SPLIT_SIZE,
                defaultBlockSize * 2);
        if (maxCombinedSize == 0) {
            // NOTE(review): the unset default is twice the block size, while an explicit 0 falls back to a
            // single block size - confirm this asymmetry is intended.
            maxCombinedSize = defaultBlockSize;
        }
        // Let the parent format cut splits between 1 byte and the combine size before they are merged.
        job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MIN_SPLIT_SIZE, 1L);
        job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MAX_SPLIT_SIZE, maxCombinedSize);
        List<InputSplit> rawSplits = super.getSplits(job);
        LOG.debug("combine size:{}, splits:{}", maxCombinedSize, rawSplits);
        result = getFinalCombineVarSelectSplits(rawSplits, maxCombinedSize);
    } else {
        result = getVarSelectSplits(job);
    }
    LOG.info("Input size: {}", result.size());
    return result;
}

From source file:ml.shifu.shifu.core.mr.input.CombineInputFormat.java

License:Apache License

/**
 * Lists the job's input files and wraps each resulting {@code FileSplit} in a {@code CombineInputSplit}.
 *
 * <p>
 * Files for which {@code isPigOrHadoopMetaFile} returns true are skipped. Non-empty splittable files are cut into
 * chunks of the computed split size, non-empty unsplittable files become a single split, and empty files become a
 * single zero-length split without hosts. The number of listed files (including skipped ones) is stored under
 * {@code GuaguaMapReduceConstants.NUM_INPUT_FILES}.
 *
 * @param job
 *            the job context
 * @return the generated splits
 * @throws IOException
 *             if listing files or reading block locations fails
 */
protected List<InputSplit> getVarSelectSplits(JobContext job) throws IOException {
    final long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    final long maxSize = getMaxSplitSize(job);

    final List<InputSplit> result = new ArrayList<InputSplit>();
    final List<FileStatus> inputFiles = listStatus(job);
    for (FileStatus status : inputFiles) {
        final Path filePath = status.getPath();
        if (isPigOrHadoopMetaFile(filePath)) {
            continue;
        }
        final FileSystem fileSystem = filePath.getFileSystem(job.getConfiguration());
        final long fileLength = status.getLen();
        final BlockLocation[] locations = fileSystem.getFileBlockLocations(status, 0, fileLength);

        if (fileLength == 0) {
            // Zero-length file: emit an empty split with an empty hosts array.
            result.add(new CombineInputSplit(new FileSplit(filePath, 0, fileLength, new String[0])));
            continue;
        }
        if (!isSplitable(job, filePath)) {
            // Unsplittable file: one split covering the whole file, placed on the first block's hosts.
            result.add(new CombineInputSplit(new FileSplit(filePath, 0, fileLength, locations[0].getHosts())));
            continue;
        }

        final long splitSize = computeSplitSize(status.getBlockSize(), minSize, maxSize);
        long remaining = fileLength;
        // A plain double comparison is fine here: no precision is required for the 1.1 slop factor.
        for (; ((double) remaining) / splitSize > 1.1d; remaining -= splitSize) {
            final long start = fileLength - remaining;
            final int blockIndex = getBlockIndex(locations, start);
            result.add(new CombineInputSplit(
                    new FileSplit(filePath, start, splitSize, locations[blockIndex].getHosts())));
        }
        if (remaining != 0) {
            // The tail is placed on the hosts of the last block.
            result.add(new CombineInputSplit(new FileSplit(filePath, fileLength - remaining, remaining,
                    locations[locations.length - 1].getHosts())));
        }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(GuaguaMapReduceConstants.NUM_INPUT_FILES, inputFiles.size());

    LOG.debug("Total # of splits: {}", result.size());
    return result;
}

From source file:ml.shifu.shifu.core.mr.input.CombineInputFormat.java

License:Apache License

/**
 * Reports a file as splittable only when the codec factory built from the job configuration finds no
 * compression codec for its path.
 *
 * @param context
 *            the job context supplying the configuration
 * @param file
 *            the file to check
 * @return {@code true} if no codec matches the file, {@code false} otherwise
 */
@Override
protected boolean isSplitable(JobContext context, Path file) {
    // Every compression type is treated as non-splittable here; combining bz/bz2 input still needs thought.
    return new CompressionCodecFactory(context.getConfiguration()).getCodec(file) == null;
}

From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java

License:Apache License

/**
 * Returns the parent input format's splits, extended with the cross-validation dataset when the
 * {@code shifu.crossValidation.dir} property is set to a non-blank value.
 *
 * @param job
 *            the job context
 * @return the splits from the parent format, after {@code addCrossValidationDataset} has been applied to them
 *         if a validation directory is configured
 * @throws IOException
 *             if split computation fails
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    final List<InputSplit> splits = super.getSplits(job);
    final String validationDirs = job.getConfiguration().get("shifu.crossValidation.dir", "");
    LOG.info("Validation dir is {};", validationDirs);
    if (org.apache.commons.lang.StringUtils.isBlank(validationDirs)) {
        // No validation directory configured: the parent's splits are returned untouched.
        return splits;
    }
    addCrossValidationDataset(splits, job);
    return splits;
}

From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java

License:Apache License

/**
 * Partitions the files returned by {@code listCrossValidationStatus(job)} into groups of file splits, each group
 * targeting {@code lengthSum / count + 1} bytes, where {@code lengthSum} is the total length of all files that
 * are not Pig/Hadoop meta files.
 *
 * @param job
 *            the job context
 * @param count
 *            the divisor used to derive the per-group byte size. NOTE(review): a value of 0 would make the
 *            division below throw {@code ArithmeticException} - confirm callers always pass a positive count.
 * @return the list of groups; the last group may hold fewer bytes than the target size
 * @throws IOException
 *             if listing the files or building a file split fails
 */
protected List<List<FileSplit>> getCrossValidationSplits(JobContext job, int count) throws IOException {
    LOG.debug("Split validation with count: {}", count);
    List<FileStatus> files = listCrossValidationStatus(job);
    // Group currently being filled.
    List<FileSplit> current = new ArrayList<FileSplit>();
    List<List<FileSplit>> validationList = new ArrayList<List<FileSplit>>();
    // First pass: total length of all non-meta files.
    long lengthSum = 0L;
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        lengthSum += file.getLen();
    }
    // Target number of bytes per group.
    long size = lengthSum / count + 1;
    // Between files: number of bytes already placed in 'current'. Inside the first branch below it is
    // temporarily reused as the number of bytes of the current file not yet assigned to a group.
    long remaining = 0L;
    // Second pass: distribute the files (cutting them where needed) over the groups.
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long offset = 0L;
        long length = file.getLen();
        if (length + remaining >= size) {
            // This file completes the current group: take just enough bytes to reach the target size.
            long cut = (size - remaining) >= length ? length : (size - remaining);
            // presumably getFileSplit's last two arguments are (start offset, length) - verify against helper
            current.add(getFileSplit(fs, file, offset, cut));
            offset = cut;
            remaining = length - cut;
            validationList.add(current);
            current = new ArrayList<FileSplit>();
            // Each further full-size chunk of this file forms a group of its own.
            while (remaining >= size) {
                current.add(getFileSplit(fs, file, offset, size));
                validationList.add(current);
                current = new ArrayList<FileSplit>();
                remaining -= size;
                offset += size;
            }
            // The tail of the file starts the next group; 'remaining' is again that group's byte count.
            if (remaining > 0) {
                current.add(getFileSplit(fs, file, offset, remaining));
            }

        } else {
            // The whole file fits into the current group without reaching the target size.
            current.add(getFileSplit(fs, file, 0, length));
            remaining += length;
        }
    }
    // Flush the last, partially filled group.
    if (current.size() > 0) {
        validationList.add(current);
    }

    LOG.debug("Total # of validationList: {}", validationList.size());
    return validationList;
}

From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java

License:Apache License

/**
 * Collects the {@code FileStatus} entries for every path returned by {@code getInputPaths(job)}.
 *
 * <p>
 * Each path is expanded as a glob and filtered by the hidden-file filter plus the job's own path filter, if
 * any. Matching directories are listed one level deep; sub-directories are descended into only when
 * {@code mapreduce.input.fileinputformat.input.dir.recursive} is true. Problems with individual paths are
 * gathered and reported together once all paths have been processed.
 *
 * <p>
 * NOTE(review): despite the method name, the directories come from {@code getInputPaths(job)} - confirm that it
 * resolves to the cross-validation directories in this class.
 *
 * @param job
 *            the job context
 * @return the statuses of all accepted files
 * @throws IOException
 *             if no input path is configured, or (as {@code InvalidInputException}) if any path does not exist
 *             or matches no files
 */
@SuppressWarnings("deprecation")
protected List<FileStatus> listCrossValidationStatus(JobContext job) throws IOException {
    final List<FileStatus> found = new ArrayList<FileStatus>();
    final Path[] inputDirs = getInputPaths(job);
    if (inputDirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), inputDirs, job.getConfiguration());

    // Whether sub-directories of a matched directory are walked recursively.
    final boolean descend = job.getConfiguration()
            .getBoolean("mapreduce.input.fileinputformat.input.dir.recursive", false);

    final List<IOException> problems = new ArrayList<IOException>();

    // Combine the hidden-file filter with the user supplied filter (if any) into one MultiPathFilter.
    final List<PathFilter> filterChain = new ArrayList<PathFilter>();
    filterChain.add(hiddenFileFilter);
    final PathFilter userFilter = getInputPathFilter(job);
    if (userFilter != null) {
        filterChain.add(userFilter);
    }
    final PathFilter acceptFilter = new MultiPathFilter(filterChain);

    for (Path dir : inputDirs) {
        final FileSystem fs = dir.getFileSystem(job.getConfiguration());
        final FileStatus[] matched = fs.globStatus(dir, acceptFilter);
        if (matched == null) {
            problems.add(new IOException("Input path does not exist: " + dir));
            continue;
        }
        if (matched.length == 0) {
            problems.add(new IOException("Input Pattern " + dir + " matches 0 files"));
            continue;
        }
        for (FileStatus match : matched) {
            if (!match.isDir()) {
                // A plain file matched by the glob is taken as is.
                found.add(match);
                continue;
            }
            for (FileStatus child : fs.listStatus(match.getPath())) {
                if (!acceptFilter.accept(child.getPath())) {
                    continue;
                }
                if (descend && child.isDir()) {
                    addInputPathRecursive(found, fs, child.getPath(), acceptFilter);
                } else {
                    found.add(child);
                }
            }
        }
    }

    if (!problems.isEmpty()) {
        throw new InvalidInputException(problems);
    }
    LOG.info("Total validation paths to process : " + found.size());
    return found;
}