Usage examples for org.apache.hadoop.mapreduce.JobContext#getConfiguration()
public Configuration getConfiguration();
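JobContext is the read-only view of a job shared by drivers and tasks, and getConfiguration() returns the job's Configuration. Before the per-project examples below, a minimal sketch of the most common use: reading a job setting inside a Mapper, whose Context is itself a JobContext. The property key "myapp.case.sensitive" is hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TokenizingMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private boolean caseSensitive;

    @Override
    protected void setup(Context context) {
        // Mapper.Context extends JobContext, so the job Configuration is at hand
        Configuration conf = context.getConfiguration();
        caseSensitive = conf.getBoolean("myapp.case.sensitive", false);
    }
}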
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.PangoolMultipleOutputs.java
License:Apache License
/**
 * Returns whether the counters for the named outputs are enabled. By default these
 * counters are disabled.
 *
 * @param job the job
 * @return TRUE if the counters are enabled, FALSE if they are disabled.
 */
public static boolean getCountersEnabled(JobContext job) {
    return job.getConfiguration().getBoolean(COUNTERS_ENABLED, false);
}
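Driver-side, this flag is usually flipped before submission. A sketch assuming PangoolMultipleOutputs mirrors Hadoop's MultipleOutputs and provides a matching setCountersEnabled(Job, boolean); verify against the Pangool API:

Job job = Job.getInstance(new Configuration(), "pangool-example");
PangoolMultipleOutputs.setCountersEnabled(job, true);  // assumed setter, symmetric to the getter above
boolean countersOn = PangoolMultipleOutputs.getCountersEnabled(job);  // now true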
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.ProxyOutputFormat.java
License:Apache License
private void createOutputFormatIfNeeded(JobContext context) throws IOException {
    if (outputFormat == null) {
        outputFormat = InstancesDistributor.loadInstance(context.getConfiguration(), OutputFormat.class,
                context.getConfiguration().get(PROXIED_OUTPUT_FORMAT_CONF, null), true);
    }
}
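InstancesDistributor here deserializes a previously distributed OutputFormat instance. For comparison, a plain-Hadoop variant of the same lazy-delegate pattern would store only a class name in the configuration (the key below is hypothetical):

private OutputFormat<?, ?> delegate;

private void createDelegateIfNeeded(JobContext context) {
    if (delegate == null) {
        Configuration conf = context.getConfiguration();
        // Fall back to TextOutputFormat when no delegate class was configured
        Class<? extends OutputFormat> clazz = conf.getClass("proxied.output.format.class",
                TextOutputFormat.class, OutputFormat.class);
        delegate = ReflectionUtils.newInstance(clazz, conf);
    }
}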
From source file:com.david.mos.out.FileOutputFormat.java
License:Apache License
public void checkOutputSpecs(JobContext job) throws FileAlreadyExistsException, IOException {
    // Ensure that the output directory is set and not already there
    Path outDir = getOutputPath(job);
    if (outDir == null) {
        throw new InvalidJobConfException("Output directory not set.");
    }

    // Get delegation tokens for outDir's file system
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, job.getConfiguration());

    if (outDir.getFileSystem(job.getConfiguration()).exists(outDir)) {
        throw new FileAlreadyExistsException("Output directory " + outDir + " already exists");
    }
}
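The check passes only when an output path is set and does not yet exist, so the driver-side counterpart looks like this (paths illustrative; setOutputPath is the stock Hadoop setter, which this fork presumably keeps):

Job job = Job.getInstance(new Configuration(), "wordcount");
FileOutputFormat.setOutputPath(job, new Path("/user/alice/wordcount-out"));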
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * Get the minimum split size.
 *
 * @param job the job
 * @return the minimum number of bytes that can be in a split
 */
public static long getMinSplitSize(JobContext job) {
    return job.getConfiguration().getLong(SPLIT_MINSIZE, 1L);
}
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * Get the maximum split size.
 *
 * @param context the job to look at.
 * @return the maximum number of bytes a split can include
 */
public static long getMaxSplitSize(JobContext context) {
    return context.getConfiguration().getLong(SPLIT_MAXSIZE, Long.MAX_VALUE);
}
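Both getters read plain longs from the configuration. In stock Hadoop the backing keys are mapreduce.input.fileinputformat.split.minsize and ...split.maxsize; this fork's SPLIT_MINSIZE/SPLIT_MAXSIZE are assumed to bind the same names. A driver sketch using the standard setters:

Job job = Job.getInstance(new Configuration(), "split-tuning");
FileInputFormat.setMinInputSplitSize(job, 64L * 1024 * 1024);   // 64 MB floor
FileInputFormat.setMaxInputSplitSize(job, 256L * 1024 * 1024);  // 256 MB ceiling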
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * Get a PathFilter instance of the filter set for the input paths.
 *
 * @return the PathFilter instance set for the job, NULL if none has been set.
 */
public static PathFilter getInputPathFilter(JobContext context) {
    Configuration conf = context.getConfiguration();
    Class<?> filterClass = conf.getClass(PATHFILTER_CLASS, null, PathFilter.class);
    return (filterClass != null) ? (PathFilter) ReflectionUtils.newInstance(filterClass, conf) : null;
}
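The producing side registers a filter class under PATHFILTER_CLASS, which getInputPathFilter() then instantiates reflectively. A sketch using the stock Hadoop setter (class name and extension are illustrative):

public class NoTmpFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        return !path.getName().endsWith(".tmp"); // skip in-flight files
    }
}

// In the driver:
FileInputFormat.setInputPathFilter(job, NoTmpFilter.class);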
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular expression.
 *
 * @param job the job to list input paths for
 * @return a list of FileStatus objects
 * @throws IOException if no input paths are specified or an input path is invalid
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Get tokens for all the required FileSystems.
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    List<IOException> errors = new ArrayList<IOException>();

    // Creates a MultiPathFilter with the hiddenFileFilter and the
    // user-provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}
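As the javadoc suggests, subclasses can narrow the listing. A minimal sketch, built on stock TextInputFormat rather than this fork (class name and extension are illustrative):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class LogFileInputFormat extends TextInputFormat {
    @Override
    protected List<FileStatus> listStatus(JobContext job) throws IOException {
        List<FileStatus> logs = new ArrayList<FileStatus>();
        for (FileStatus stat : super.listStatus(job)) {
            if (stat.getPath().getName().endsWith(".log")) { // keep only .log files
                logs.add(stat);
            }
        }
        return logs;
    }
}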
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 *
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
            }
        } else {
            // Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}
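computeSplitSize is not shown here; in stock Hadoop it clamps the block size between the configured bounds, and this fork is assumed to do the same:

protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}

// Worked example: blockSize = 128 MB, minSize = 1, maxSize = 64 MB => 64 MB splits.
// A 200 MB file then yields splits of 64, 64, 64 and 8 MB: SPLIT_SLOP (1.1 in
// stock Hadoop) only lets the final split run up to 10% over splitSize instead
// of leaving a tiny tail, and 72 MB remaining exceeds that allowance.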
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * Get the list of input {@link Path}s for the map-reduce job.
 *
 * @param context The job
 * @return the list of input {@link Path}s for the map-reduce job.
 */
public static Path[] getInputPaths(JobContext context) {
    String dirs = context.getConfiguration().get(INPUT_DIR, "");
    String[] list = StringUtils.split(dirs);
    Path[] result = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
        result[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    return result;
}
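Driver-side counterpart: the stock setters escape each path and comma-join them under INPUT_DIR, which is exactly what the split/unescape loop above reverses (paths illustrative):

Job job = Job.getInstance(new Configuration(), "inputs");
FileInputFormat.addInputPath(job, new Path("/data/2013/01"));
FileInputFormat.addInputPath(job, new Path("/data/2013/02"));
// getInputPaths(job) now returns both paths, in order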
From source file:com.edwardsit.spark4n6.EWFImageInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    log.setLevel(Level.DEBUG);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    BlockLocation[] blkLocations = null;
    Path path = null;
    FileSystem fs = null;
    EWFFileReader ewf = null;
    ArrayList<EWFSection.SectionPrefix> sections = null;
    Iterator<EWFSection.SectionPrefix> it = null;
    EWFSection.SectionPrefix sp = null;
    Path priorFile = null;
    long priorOffset = 0L;
    FileStatus priorFileStatus = null;
    chunkSize = new EWFSegmentFileReader(fs).DEFAULT_CHUNK_SIZE;
    long priorStart = 0L;
    int blkIndex = 0;
    for (FileStatus file : files) {
        path = file.getPath();
        fs = path.getFileSystem(job.getConfiguration());
        if (path.getName().endsWith(".E01")) {
            ewf = new EWFFileReader(fs, path);
            sections = ewf.getSectionPrefixArray();
            it = sections.iterator();
            while (it.hasNext()) {
                sp = it.next();
                if (sp.sectionType.equals(EWFSection.SectionType.TABLE_TYPE)) {
                    priorFileStatus = fs.getFileStatus(priorFile);
                    for (long i = sp.chunkCount; i > 0L; i = i - getChunksPerSplit(priorFileStatus)) {
                        if (priorFileStatus instanceof LocatedFileStatus) {
                            blkLocations = ((LocatedFileStatus) priorFileStatus).getBlockLocations();
                        } else {
                            blkLocations = fs.getFileBlockLocations(priorFileStatus, priorOffset,
                                    (getChunksPerSplit(priorFileStatus) * chunkSize));
                        }
                        blkIndex = getBlockIndex(blkLocations, priorOffset);
                        if (i > getChunksPerSplit(priorFileStatus)) {
                            log.debug("splits.add(makeSplit(" + priorFile + ", " + (priorStart * chunkSize)
                                    + ", " + (getChunksPerSplit(priorFileStatus) * chunkSize) + ", "
                                    + listHosts(blkLocations, blkIndex) + ");");
                            splits.add(makeSplit(priorFile, (priorStart * chunkSize),
                                    (getChunksPerSplit(priorFileStatus) * chunkSize),
                                    blkLocations[blkIndex].getHosts()));
                            priorStart += getChunksPerSplit(priorFileStatus);
                        } else {
                            log.debug("splits.add(makeSplit(" + priorFile + ", " + (priorStart * chunkSize)
                                    + ", " + (i * chunkSize) + ", " + listHosts(blkLocations, blkIndex) + ");");
                            splits.add(makeSplit(priorFile, (priorStart * chunkSize), (i * chunkSize),
                                    blkLocations[blkIndex].getHosts()));
                            priorStart += i;
                        }
                    }
                }
                priorFile = sp.file;
                priorOffset = sp.fileOffset;
            }
        }
    }
    return splits;
}
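Wiring this format into a job is the usual one-liner; only the class name comes from the source above, the rest is a generic driver sketch (paths illustrative):

Job job = Job.getInstance(new Configuration(), "ewf-image-scan");
job.setInputFormatClass(EWFImageInputFormat.class);
FileInputFormat.addInputPath(job, new Path("/evidence/disk01.E01"));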