List of usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
// Signature of the JobContext method whose usages are listed below: it returns a Configuration.
public Configuration getConfiguration();
From source file:ml.shifu.guagua.mapreduce.GuaguaInputFormat.java
License:Apache License
/** * Splitter building logic including master setting, also includes combining input feature like Pig. */// www . jav a2s .c o m @Override public List<InputSplit> getSplits(JobContext job) throws IOException { List<InputSplit> newSplits = null; boolean combinable = job.getConfiguration().getBoolean(GuaguaConstants.GUAGUA_SPLIT_COMBINABLE, false); if (combinable) { @SuppressWarnings("deprecation") // use this deprecation method to make it works on 0.20.2 long blockSize = FileSystem.get(job.getConfiguration()).getDefaultBlockSize(); long combineSize = job.getConfiguration().getLong(GuaguaConstants.GUAGUA_SPLIT_MAX_COMBINED_SPLIT_SIZE, blockSize); if (combineSize == 0) { combineSize = blockSize; } job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MIN_SPLIT_SIZE, 1l); job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MAX_SPLIT_SIZE, combineSize); List<InputSplit> splits = super.getSplits(job); LOG.debug("combine size:{}, splits:{}", combineSize, splits); newSplits = getFinalCombineGuaguaSplits(splits, combineSize); } else { newSplits = getGuaguaSplits(job); } int masters = job.getConfiguration().getInt(GuaguaConstants.GUAGUA_MASTER_NUMBER, GuaguaConstants.DEFAULT_MASTER_NUMBER); for (int i = 0; i < masters; i++) { newSplits.add(new GuaguaInputSplit(true, (FileSplit) null)); } int mapperSize = newSplits.size(); LOG.info("Input size including master: {}", mapperSize); LOG.debug("input splits: {}", newSplits); job.getConfiguration().set(GuaguaConstants.GUAGUA_WORKER_NUMBER, (mapperSize - masters) + ""); return newSplits; }
From source file:ml.shifu.guagua.mapreduce.GuaguaInputFormat.java
License:Apache License
/** * Generate the list of files and make them into FileSplits. *//*from www . j av a 2s . c o m*/ protected List<InputSplit> getGuaguaSplits(JobContext job) throws IOException { long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); List<FileStatus> files = listStatus(job); for (FileStatus file : files) { Path path = file.getPath(); if (isPigOrHadoopMetaFile(path)) { continue; } FileSystem fs = path.getFileSystem(job.getConfiguration()); long length = file.getLen(); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(job, path)) { long blockSize = file.getBlockSize(); long splitSize = computeSplitSize(blockSize, minSize, maxSize); long bytesRemaining = length; while (((double) bytesRemaining) / splitSize > GuaguaMapReduceConstants.SPLIT_SLOP) { int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add(new GuaguaInputSplit(false, new FileSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()))); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { splits.add(new GuaguaInputSplit(false, new FileSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts()))); } } else if (length != 0) { splits.add(new GuaguaInputSplit(false, new FileSplit(path, 0, length, blkLocations[0].getHosts()))); } else { // Create empty hosts array for zero length files splits.add(new GuaguaInputSplit(false, new FileSplit(path, 0, length, new String[0]))); } } // Save the number of input files in the job-conf job.getConfiguration().setLong(GuaguaMapReduceConstants.NUM_INPUT_FILES, files.size()); LOG.debug("Total # of splits: {}", splits.size()); return splits; }
From source file:ml.shifu.shifu.core.correlation.CorrelationMultithreadedMapper.java
License:Apache License
/** * The number of threads in the thread pool that will run the map function. * /* w ww .ja v a 2 s .c om*/ * @param job * the job * @return the number of threads */ public static int getNumberOfThreads(JobContext job) { return job.getConfiguration().getInt(NUM_THREADS, 10); }
From source file:ml.shifu.shifu.core.correlation.CorrelationMultithreadedMapper.java
License:Apache License
/** * Get the application's mapper class./* w ww . j a va 2 s .co m*/ * * @param <K1> * the map's input key type * @param <V1> * the map's input value type * @param <K2> * the map's output key type * @param <V2> * the map's output value type * @param job * the job * @return the mapper class to run */ @SuppressWarnings("unchecked") public static <K1, V1, K2, V2> Class<Mapper<K1, V1, K2, V2>> getMapperClass(JobContext job) { return (Class<Mapper<K1, V1, K2, V2>>) job.getConfiguration().getClass(MAP_CLASS, Mapper.class); }
From source file:ml.shifu.shifu.core.mr.input.CombineInputFormat.java
License:Apache License
/** * Splitter building logic including master setting, also includes combining input feature like Pig. */// www . j a v a 2 s . c om @Override public List<InputSplit> getSplits(JobContext job) throws IOException { List<InputSplit> newSplits = null; boolean combinable = job.getConfiguration().getBoolean(SHIFU_VS_SPLIT_COMBINABLE, false); if (combinable) { @SuppressWarnings("deprecation") // use this deprecation method to make it works on 0.20.2 long blockSize = FileSystem.get(job.getConfiguration()).getDefaultBlockSize(); long combineSize = job.getConfiguration().getLong(SHIFU_VS_SPLIT_MAX_COMBINED_SPLIT_SIZE, blockSize * 2); if (combineSize == 0) { combineSize = blockSize; } job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MIN_SPLIT_SIZE, 1l); job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MAX_SPLIT_SIZE, combineSize); List<InputSplit> splits = super.getSplits(job); LOG.debug("combine size:{}, splits:{}", combineSize, splits); newSplits = getFinalCombineVarSelectSplits(splits, combineSize); } else { newSplits = getVarSelectSplits(job); } LOG.info("Input size: {}", newSplits.size()); return newSplits; }
From source file:ml.shifu.shifu.core.mr.input.CombineInputFormat.java
License:Apache License
/** * Generate the list of files and make them into FileSplits. *///w ww . ja v a 2 s .c o m protected List<InputSplit> getVarSelectSplits(JobContext job) throws IOException { long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); List<FileStatus> files = listStatus(job); for (FileStatus file : files) { Path path = file.getPath(); if (isPigOrHadoopMetaFile(path)) { continue; } FileSystem fs = path.getFileSystem(job.getConfiguration()); long length = file.getLen(); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(job, path)) { long blockSize = file.getBlockSize(); long splitSize = computeSplitSize(blockSize, minSize, maxSize); long bytesRemaining = length; // here double comparison can be directly used because of no precision requirement while (((double) bytesRemaining) / splitSize > 1.1d) { int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add(new CombineInputSplit(new FileSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()))); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { splits.add(new CombineInputSplit(new FileSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts()))); } } else if (length != 0) { splits.add(new CombineInputSplit(new FileSplit(path, 0, length, blkLocations[0].getHosts()))); } else { // Create empty hosts array for zero length files splits.add(new CombineInputSplit(new FileSplit(path, 0, length, new String[0]))); } } // Save the number of input files in the job-conf job.getConfiguration().setLong(GuaguaMapReduceConstants.NUM_INPUT_FILES, files.size()); LOG.debug("Total # of splits: {}", splits.size()); return splits; }
From source file:ml.shifu.shifu.core.mr.input.CombineInputFormat.java
License:Apache License
@Override protected boolean isSplitable(JobContext context, Path file) { // All compression types set here non split. For bz or bz2, think about how to do combine. CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file); return codec == null; }
From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java
License:Apache License
/**
 * Computes the input splits via the parent input format and, when a cross validation directory
 * is configured, augments them with the cross validation dataset.
 *
 * <p>The directory setting is read from the configuration key
 * {@code shifu.crossValidation.dir}; a missing or blank value leaves the parent's splits
 * untouched. Otherwise {@code addCrossValidationDataset} is invoked on the split list.
 *
 * @param job
 *            the job context
 * @return the splits from the parent, after any cross validation processing
 * @throws IOException
 *             if split computation fails
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    final List<InputSplit> allSplits = super.getSplits(job);
    final String validationDirs = job.getConfiguration().get("shifu.crossValidation.dir", "");
    LOG.info("Validation dir is {};", validationDirs);
    if (!org.apache.commons.lang.StringUtils.isNotBlank(validationDirs)) {
        return allSplits;
    }
    this.addCrossValidationDataset(allSplits, job);
    return allSplits;
}
From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java
License:Apache License
/**
 * Partitions the cross validation files into groups of file splits of roughly equal byte size.
 *
 * <p>The target group size is {@code totalLength / count + 1}, where {@code totalLength} is the
 * summed length of all files returned by {@code listCrossValidationStatus} that are not
 * Pig/Hadoop meta files. Files are consumed in listing order: a file that fits into the current
 * group is added whole; a file that crosses the group boundary is cut at the boundary, and any
 * further full-size pieces of it become groups of their own. A trailing, partially filled group
 * is included in the result.
 *
 * @param job
 *            the job context
 * @param count
 *            the number of groups the total input length is divided by; must be positive
 * @return the groups of file splits, in order
 * @throws IOException
 *             if listing files or building a file split fails
 * @throws IllegalArgumentException
 *             if {@code count} is not positive
 */
protected List<List<FileSplit>> getCrossValidationSplits(JobContext job, int count) throws IOException {
    // A zero count would divide by zero below; a negative count yields a non-positive group
    // size, for which the inner while loop never terminates.
    if (count <= 0) {
        throw new IllegalArgumentException("Cross validation split count must be positive, but was " + count);
    }
    LOG.debug("Split validation with count: {}", count);
    List<FileStatus> files = listCrossValidationStatus(job);

    long lengthSum = 0L;
    for (FileStatus file : files) {
        if (isPigOrHadoopMetaFile(file.getPath())) {
            continue;
        }
        lengthSum += file.getLen();
    }
    // +1 keeps the group size strictly positive even when all files are empty.
    long size = lengthSum / count + 1;

    List<List<FileSplit>> validationList = new ArrayList<List<FileSplit>>();
    List<FileSplit> current = new ArrayList<FileSplit>();
    // Bytes already assigned to the group currently being filled.
    long remaining = 0L;
    for (FileStatus file : files) {
        Path path = file.getPath();
        if (isPigOrHadoopMetaFile(path)) {
            continue;
        }
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long offset = 0L;
        long length = file.getLen();
        if (length + remaining >= size) {
            // This file completes the current group: take just enough bytes to fill it.
            long cut = Math.min(length, size - remaining);
            current.add(getFileSplit(fs, file, offset, cut));
            offset = cut;
            remaining = length - cut;
            validationList.add(current);
            current = new ArrayList<FileSplit>();
            // Each further full-size piece of the same file forms a group of its own.
            while (remaining >= size) {
                current.add(getFileSplit(fs, file, offset, size));
                validationList.add(current);
                current = new ArrayList<FileSplit>();
                remaining -= size;
                offset += size;
            }
            // The tail of the file starts the next group.
            if (remaining > 0) {
                current.add(getFileSplit(fs, file, offset, remaining));
            }
        } else {
            current.add(getFileSplit(fs, file, 0, length));
            remaining += length;
        }
    }
    if (!current.isEmpty()) {
        validationList.add(current);
    }

    LOG.debug("Total # of validationList: {}", validationList.size());
    return validationList;
}
From source file:ml.shifu.shifu.guagua.ShifuInputFormat.java
License:Apache License
@SuppressWarnings("deprecation") protected List<FileStatus> listCrossValidationStatus(JobContext job) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); }/*from w w w.j ava2 s . c om*/ // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); // Whether we need to recursive look into the directory structure boolean recursive = job.getConfiguration().getBoolean("mapreduce.input.fileinputformat.input.dir.recursive", false); List<IOException> errors = new ArrayList<IOException>(); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List<PathFilter> filters = new ArrayList<PathFilter>(); filters.add(hiddenFileFilter); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); for (int i = 0; i < dirs.length; ++i) { Path p = dirs[i]; FileSystem fs = p.getFileSystem(job.getConfiguration()); FileStatus[] matches = fs.globStatus(p, inputFilter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat : matches) { if (globStat.isDir()) { FileStatus[] fss = fs.listStatus(globStat.getPath()); for (FileStatus fileStatus : fss) { if (inputFilter.accept(fileStatus.getPath())) { if (recursive && fileStatus.isDir()) { addInputPathRecursive(result, fs, fileStatus.getPath(), inputFilter); } else { result.add(fileStatus); } } } } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new InvalidInputException(errors); } LOG.info("Total validation paths to process : " + result.size()); return result; }