List of usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
public Configuration getConfiguration();
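getConfiguration() returns the job's Configuration, and custom InputFormats typically read their tuning knobs (filters, split sizes, recursion flags) from it when listing or splitting input, as every example below does. A minimal sketch of that pattern, assuming hypothetical property names "example.splittable" and "example.log.splits" that are not taken from any of the sources below:

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Illustrative InputFormat; the property names are hypothetical and used only for this sketch.
public class ExampleInputFormat extends TextInputFormat {

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // The JobContext exposes the job's Configuration to InputFormat callbacks.
        Configuration conf = context.getConfiguration();
        if (!conf.getBoolean("example.splittable", true)) {
            return false;
        }
        return super.isSplitable(context, file);
    }

    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        List<InputSplit> splits = super.getSplits(job);
        if (job.getConfiguration().getBoolean("example.log.splits", false)) {
            System.out.println("Computed " + splits.size() + " splits");
        }
        return splits;
    }
}

The examples that follow use the same Configuration object to decide splittability, filter input files, and combine or resize splits.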
From source file: edu.rutgers.ess.crs.utility.CSVInputFormat.java
License: Apache License

protected boolean isSplitable(final JobContext context, final Path file) {
    final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    return codec == null;
}
From source file: edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License: Open Source License

@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    try {
        Configuration jobConf = job.getConfiguration();
        // The block filter associated with this job
        BlockFilter blockFilter = null;
        if (jobConf.get(InputQueryRange) != null) {
            // This job requires a range query
            blockFilter = new RangeFilter(OperationsParams.getShape(jobConf, InputQueryRange));
        }
        // Retrieve the BlockFilter set by the developers in the JobConf
        Class<? extends BlockFilter> blockFilterClass =
            jobConf.getClass(SpatialSite.FilterClass, null, BlockFilter.class);
        if (blockFilterClass != null) {
            BlockFilter userBlockFilter = blockFilterClass.newInstance();
            blockFilter = blockFilter == null ? userBlockFilter
                : new CombineBlockFilter(blockFilter, userBlockFilter);
        }
        if (blockFilter == null) {
            // No block filter specified by user
            LOG.info("No block filter specified");
            return super.listStatus(job);
        }
        // Get all blocks the user wants to process
        blockFilter.configure(jobConf);
        // Filter files based on user specified filter function
        List<FileStatus> result = new ArrayList<FileStatus>();
        Path[] inputDirs = getInputPaths(job);
        for (Path dir : inputDirs) {
            FileSystem fs = dir.getFileSystem(jobConf);
            listStatus(fs, dir, result, blockFilter);
        }
        LOG.info("Spatial filter function matched with " + result.size() + " cells");
        return result;
    } catch (InstantiationException e) {
        LOG.warn(e);
        return super.listStatus(job);
    } catch (IllegalAccessException e) {
        LOG.warn(e);
        return super.listStatus(job);
    }
}
From source file: edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License: Open Source License

@Override
protected boolean isSplitable(JobContext context, Path file) {
    try {
        // Create compressionCodecs to be used by isSplitable method
        if (compressionCodecs == null)
            compressionCodecs = new CompressionCodecFactory(context.getConfiguration());
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        // HDF files are not splittable
        if (file.getName().toLowerCase().endsWith(".hdf"))
            return false;
        final CompressionCodec codec = compressionCodecs.getCodec(file);
        if (codec != null && !(codec instanceof SplittableCompressionCodec))
            return false;
        // To avoid opening the file and checking the first 8-bytes to look for
        // an R-tree signature, we never split a file read over HTTP
        if (fs instanceof HTTPFileSystem)
            return false;
        // ... and never split a file less than 150MB to perform better with many small files
        if (fs.getFileStatus(file).getLen() < 150 * 1024 * 1024)
            return false;
        return !SpatialSite.isRTree(fs, file);
    } catch (IOException e) {
        LOG.warn("Error while determining whether a file is splittable", e);
        return false; // Safer to not split it
    }
}
From source file: edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License: Open Source License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    Configuration jobConf = job.getConfiguration();
    if (jobConf.getInt(CombineSplits, 1) > 1) {
        long t1 = System.currentTimeMillis();
        int combine = jobConf.getInt(CombineSplits, 1);
        /*
         * Combine splits to reduce the number of map tasks. Currently, this is done
         * using a greedy algorithm that combines splits based on how many hosts
         * they share.
         * TODO: Use a graph clustering algorithm where each vertex represents a
         * split, and each edge is weighted with the number of shared hosts between
         * the two splits.
         */
        Vector<Vector<FileSplit>> openSplits = new Vector<Vector<FileSplit>>();
        int maxNumberOfSplits = (int) Math.ceil((float) splits.size() / combine);
        List<InputSplit> combinedSplits = new Vector<InputSplit>();
        for (InputSplit split : splits) {
            FileSplit fsplit = (FileSplit) split;
            int maxSimilarity = -1; // Best similarity found so far
            int bestFit = -1;       // Index of a random open split with max similarity
            int numMatches = 0;     // Number of splits with max similarity
            for (int i = 0; i < openSplits.size(); i++) {
                Vector<FileSplit> splitList = openSplits.elementAt(i);
                int similarity = 0;
                for (FileSplit otherSplit : splitList) {
                    for (String host1 : fsplit.getLocations())
                        for (String host2 : otherSplit.getLocations())
                            if (host1.equals(host2))
                                similarity++;
                }
                if (similarity > maxSimilarity) {
                    maxSimilarity = similarity;
                    bestFit = i;
                    numMatches = 1;
                } else if (similarity == maxSimilarity) {
                    numMatches++;
                    // Replace the current best fit with probability 1/numMatches (reservoir sampling)
                    double random = Math.random();
                    if (random < (double) 1 / numMatches) {
                        // Replace the element in the reservoir
                        bestFit = i;
                    }
                }
            }
            if (maxSimilarity > 0 || (openSplits.size() + combinedSplits.size()) >= maxNumberOfSplits) {
                // Good fit || cannot create more open splits,
                // add it to an existing open split.
                Vector<FileSplit> bestList = openSplits.elementAt(bestFit);
                bestList.add(fsplit);
                if (bestList.size() > combine) {
                    // Reached threshold for this list. Add it to combined splits
                    combinedSplits.add(FileSplitUtil.combineFileSplits(bestList, 0, bestList.size()));
                    // Remove it from open splits
                    openSplits.remove(bestFit);
                }
            } else {
                // Bad fit && can add a new split
                // Create a new open split just for this one
                Vector<FileSplit> newOpenSplit = new Vector<FileSplit>();
                newOpenSplit.add(fsplit);
                openSplits.addElement(newOpenSplit);
            }
        }
        // Add all remaining open splits to the list of combined splits
        for (Vector<FileSplit> openSplit : openSplits) {
            combinedSplits.add(FileSplitUtil.combineFileSplits(openSplit, 0, openSplit.size()));
        }
        String msg = String.format("Combined %d splits into %d combined splits", splits.size(),
            combinedSplits.size());
        splits.clear();
        splits.addAll(combinedSplits);
        long t2 = System.currentTimeMillis();
        LOG.info(msg + " in " + ((t2 - t1) / 1000.0) + " seconds");
    }
    return splits;
}
From source file: eu.scape_project.pt.mapred.input.ControlFileInputFormat.java
License: Apache License

/**
 * Logically splits the set of input files for the job, splits N lines
 * of the input as one split.
 *
 * @see NLineInputFormat#getSplits(JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    int numLinesPerSplit = getNumLinesPerSplit(job);
    for (FileStatus status : listStatus(job)) {
        splits.addAll(getSplitsForFile(status, job.getConfiguration(), numLinesPerSplit));
    }
    return splits;
}
From source file: format.OverlapInputFormat.java
License: BSD License

@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> files = super.listStatus(job);
    List<FileStatus> results = new ArrayList<FileStatus>();
    // Configuration conf = HadoopUtils.getConfiguration(job);
    Configuration conf = job.getConfiguration();
    boolean recursive = conf.getBoolean("mapred.input.dir.recursive", false);
    Iterator<FileStatus> it = files.iterator();
    while (it.hasNext()) {
        FileStatus fileStatus = it.next();
        FileSystem fs = fileStatus.getPath().getFileSystem(conf);
        addInputPath(results, fs, fileStatus, recursive);
    }
    LOG.debug("Total pcap input paths to process: " + results.size());
    return results;
}
From source file: format.OverlapInputFormat.java
License: BSD License

/******
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = HadoopUtils.getConfiguration(job);
    List<InputSplit> defaultSplits = super.getSplits(job);
    List<InputSplit> result = new ArrayList<InputSplit>();
    Path prevFile = null;
    FourMcBlockIndex prevIndex = null;
    for (InputSplit genericSplit : defaultSplits) {
        // Load the index.
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);
        FourMcBlockIndex index;
        if (file.equals(prevFile)) {
            index = prevIndex;
        } else {
            index = FourMcBlockIndex.readIndex(fs, file);
            prevFile = file;
            prevIndex = index;
        }
        if (index == null) {
            throw new IOException("BlockIndex unreadable for " + file);
        }
        if (index.isEmpty()) {
            // leave the default split for empty block index
            result.add(fileSplit);
            continue;
        }
        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();
        long fourMcStart = index.alignSliceStartToIndex(start, end);
        long fourMcEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());
        if (fourMcStart != FourMcBlockIndex.NOT_FOUND && fourMcEnd != FourMcBlockIndex.NOT_FOUND) {
            result.add(new FileSplit(file, fourMcStart, fourMcEnd - fourMcStart, fileSplit.getLocations()));
            LOG.debug("Added 4mc split for " + file + "[start=" + fourMcStart + ", length="
                + (fourMcEnd - fourMcStart) + "]");
        }
    }
    return result;
}
******/

@Override
public List<InputSplit> getSplits(JobContext context) {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileSystem fs = null;
    Path file = OverlapInputFormat.getInputPaths(context)[0];
    Configuration conf = context.getConfiguration();
    long blocksize = Long.parseLong(conf.get("dfs.blocksize"));
    // long overlap = Long.parseLong(conf.get("pcap.defaultsize"));
    long overlap = 16;
    FSDataInputStream in = null;
    try {
        fs = FileSystem.get(context.getConfiguration());
        in = fs.open(file);
        long pos = 0;
        while (in.available() > 0) {
            FileSplit split = new FileSplit(file, pos, blocksize + overlap, new String[] {});
            splits.add(split);
            pos += blocksize;
            in.skip(blocksize + overlap);
        }
    } catch (IOException e) {
        LOG.error(e.getLocalizedMessage());
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (Exception e) {
            }
        }
        if (fs != null) {
            try {
                fs.close();
            } catch (Exception e) {
            }
        }
    }
    return splits;
}
From source file: fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.FastqInputFormat.java
License: LGPL

@Override
protected boolean isSplitable(JobContext context, Path file) {
    final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    if (null == codec) {
        return true;
    }
    return codec instanceof SplittableCompressionCodec;
}
From source file: gobblin.compaction.mapreduce.avro.AvroKeyRecursiveCombineFileInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext cx) throws IOException {
    Job modifiedJob = Job.getInstance(cx.getConfiguration());
    setSplitSize(modifiedJob);
    FileInputFormat.setInputDirRecursive(modifiedJob, true);
    return cleanSplits(super.getSplits(modifiedJob));
}
From source file: gobblin.compaction.mapreduce.avro.AvroKeyRecursiveCombineFileInputFormat.java
License: Apache License

private void setSplitSize(JobContext cx) {
    super.setMaxSplitSize(cx.getConfiguration().getLong(COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE,
        DEFAULT_COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE));
    super.setMinSplitSizeNode(cx.getConfiguration().getLong(COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE,
        DEFAULT_COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE));
}