List of usage examples for org.apache.hadoop.mapreduce.JobContext#getConfiguration()
public Configuration getConfiguration();
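The examples that follow call getConfiguration() inside InputFormat and OutputCommitter callbacks to read job-level settings. As a minimal sketch of the common pattern (the class name ExampleInputFormat and the "example.max.splits" key are illustrative, not taken from any of the source files below):

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Illustrative subclass; the class name and the "example.max.splits" key are
// hypothetical and do not appear in the source files below.
public class ExampleInputFormat extends TextInputFormat {
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        // JobContext.getConfiguration() exposes the job's Configuration to
        // InputFormat, OutputFormat and OutputCommitter callbacks.
        Configuration conf = context.getConfiguration();
        int maxSplits = conf.getInt("example.max.splits", Integer.MAX_VALUE);

        // Compute the default splits, then cap them at the configured maximum.
        List<InputSplit> splits = super.getSplits(context);
        return splits.size() <= maxSplits ? splits : splits.subList(0, maxSplits);
    }
}

The Configuration returned here is the one the driver populated before submission, so any property set on the job is visible inside these callbacks.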
From source file: gobblin.runtime.mapreduce.GobblinOutputCommitter.java
License: Apache License

@Override
public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException {
    LOG.info("Aborting Job: " + jobContext.getJobID() + " with state: " + state);

    Configuration conf = jobContext.getConfiguration();

    URI fsUri = URI.create(conf.get(ConfigurationKeys.FS_URI_KEY, ConfigurationKeys.LOCAL_FS_URI));
    FileSystem fs = FileSystem.get(fsUri, conf);

    Path mrJobDir = new Path(conf.get(ConfigurationKeys.MR_JOB_ROOT_DIR_KEY),
            conf.get(ConfigurationKeys.JOB_NAME_KEY));
    Path jobInputDir = new Path(mrJobDir, MRJobLauncher.INPUT_DIR_NAME);

    if (!fs.exists(jobInputDir) || !fs.isDirectory(jobInputDir)) {
        LOG.warn(String.format("%s either does not exist or is not a directory. No data to cleanup.",
                jobInputDir));
        return;
    }

    // Iterate through all files in the jobInputDir, each file should correspond to a serialized wu or mwu
    try {
        for (FileStatus status : fs.listStatus(jobInputDir, new WorkUnitFilter())) {
            Closer workUnitFileCloser = Closer.create();

            // If the file ends with ".wu" de-serialize it into a WorkUnit
            if (status.getPath().getName().endsWith(AbstractJobLauncher.WORK_UNIT_FILE_EXTENSION)) {
                WorkUnit wu = WorkUnit.createEmpty();
                try {
                    wu.readFields(workUnitFileCloser.register(new DataInputStream(fs.open(status.getPath()))));
                } finally {
                    workUnitFileCloser.close();
                }
                JobLauncherUtils.cleanTaskStagingData(new WorkUnitState(wu), LOG);
            }

            // If the file ends with ".mwu" de-serialize it into a MultiWorkUnit
            if (status.getPath().getName().endsWith(AbstractJobLauncher.MULTI_WORK_UNIT_FILE_EXTENSION)) {
                MultiWorkUnit mwu = MultiWorkUnit.createEmpty();
                try {
                    mwu.readFields(workUnitFileCloser.register(new DataInputStream(fs.open(status.getPath()))));
                } finally {
                    workUnitFileCloser.close();
                }
                for (WorkUnit wu : mwu.getWorkUnits()) {
                    JobLauncherUtils.cleanTaskStagingData(new WorkUnitState(wu), LOG);
                }
            }
        }
    } finally {
        try {
            cleanUpWorkingDirectory(mrJobDir, fs);
        } finally {
            super.abortJob(jobContext, state);
        }
    }
}
From source file: gobblin.runtime.mapreduce.GobblinWorkUnitsInputFormat.java
License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Path[] inputPaths = FileInputFormat.getInputPaths(context);
    if (inputPaths == null || inputPaths.length == 0) {
        throw new IOException("No input found!");
    }

    List<String> allPaths = Lists.newArrayList();

    for (Path path : inputPaths) {
        // path is a single work unit / multi work unit
        FileSystem fs = path.getFileSystem(context.getConfiguration());
        FileStatus[] inputs = fs.listStatus(path);

        if (inputs == null) {
            throw new IOException(String.format("Path %s does not exist.", path));
        }
        log.info(String.format("Found %d input files at %s: %s", inputs.length, path, Arrays.toString(inputs)));
        for (FileStatus input : inputs) {
            allPaths.add(input.getPath().toString());
        }
    }

    int maxMappers = getMaxMapper(context.getConfiguration());
    int numTasksPerMapper = allPaths.size() % maxMappers == 0 ? allPaths.size() / maxMappers
            : allPaths.size() / maxMappers + 1;

    List<InputSplit> splits = Lists.newArrayList();
    Iterator<String> pathsIt = allPaths.iterator();
    while (pathsIt.hasNext()) {
        Iterator<String> limitedIterator = Iterators.limit(pathsIt, numTasksPerMapper);
        splits.add(new GobblinSplit(Lists.newArrayList(limitedIterator)));
    }

    return splits;
}
From source file: gov.llnl.ontology.text.hbase.GzipTarInputFormat.java
License: Open Source License

/**
 * Returns a {@link List} of {@link FileSplit}s. Each {@link FileSplit}
 * will be a gzipped tarball of xml documents. Each tarred file should
 * contain a single document.
 */
public List<InputSplit> getSplits(JobContext context) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();

    // Get the list of zipped files to be processed and add each zipped file
    // as an InputSplit.
    FileSystem fs = FileSystem.get(context.getConfiguration());
    for (Path file : getInputPaths(context)) {
        // Check that the list of files exists. Throw an exception if it
        // does not.
        if (fs.isDirectory(file) || !fs.exists(file))
            throw new IOException("File does not exist: " + file);

        // Read the contents of the file list and add each line as a
        // FileSplit.
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(file)));
        for (String line = null; (line = br.readLine()) != null;)
            splits.add(new FileSplit(new Path(line), 0, Integer.MAX_VALUE, null));
    }
    return splits;
}
From source file: gr.ntua.h2rdf.inputFormat.FileTableInputFormat.java
License: Open Source License

public List<InputSplit> getSplits(JobContext context) throws IOException {
    List<InputSplit> splits = super.getSplits(context);
    List<InputSplit> spl = textFormat.getSplits(context);
    splits.addAll(spl);

    String p = context.getConfiguration().get("mapred.fairscheduler.pool");
    int max = Integer.parseInt(p.substring(p.indexOf("l") + 1));
    if (splits.size() <= max)
        context.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        context.getConfiguration().setInt("mapred.reduce.tasks", max);
    return splits;
}
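This format, and several of the h2rdf formats below, derive a reducer cap by parsing the digits that follow the first letter "l" in the fair-scheduler pool name read from the job's Configuration. A minimal driver-side sketch of how that property might be populated (the pool name "pool8" is a hypothetical example, not taken from the project):

// Hypothetical driver snippet: with a pool named "pool8",
// p.substring(p.indexOf("l") + 1) in getSplits() parses to 8,
// so at most 8 reduce tasks are requested.
Configuration conf = new Configuration();
conf.set("mapred.fairscheduler.pool", "pool8");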
From source file: gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java
License: Open Source License

/**
 * Get the minimum split size.
 * @param job the job
 * @return the minimum number of bytes that can be in a split
 */
public static long getMinSplitSize(JobContext job) {
    return job.getConfiguration().getLong("mapred.min.split.size", 1L);
}
From source file: gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java
License: Open Source License

/**
 * Get the maximum split size.
 * @param context the job to look at.
 * @return the maximum number of bytes a split can include
 */
public static long getMaxSplitSize(JobContext context) {
    return context.getConfiguration().getLong("mapred.max.split.size", Long.MAX_VALUE);
}
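Both getters read their values straight off the job's Configuration, so a driver can steer the split computation before submission. A minimal sketch, assuming the same property names (the 64 MB and 256 MB sizes are arbitrary examples, and Job.getInstance is the Hadoop 2+ API):

Configuration conf = new Configuration();
// Values read back by getMinSplitSize()/getMaxSplitSize() above; the sizes are arbitrary.
conf.setLong("mapred.min.split.size", 64L * 1024 * 1024);   // at least 64 MB per split
conf.setLong("mapred.max.split.size", 256L * 1024 * 1024);  // at most 256 MB per split
Job job = Job.getInstance(conf, "example-job");             // job name is illustrative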
From source file: gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java
License: Open Source License

/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    List<IOException> errors = new ArrayList<IOException>();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}
From source file: gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java
License: Open Source License

/**
 * Generate the list of files and make them into FileSplits.
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new MyFileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new MyFileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new MyFileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new MyFileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());

    String p = job.getConfiguration().get("mapred.fairscheduler.pool");
    int max = Integer.parseInt(p.substring(p.indexOf("l") + 1));
    if (splits.size() <= max)
        job.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        job.getConfiguration().setInt("mapred.reduce.tasks", max);
    return splits;
}
From source file: gr.ntua.h2rdf.inputFormat.TableInputFormatBase.java
License: Open Source License

/**
 * Calculates the splits that will serve as input for the map tasks. The
 * number of splits matches the number of regions in a table.
 *
 * @param context The current job context.
 * @return The list of input splits.
 * @throws IOException When creating the list of splits fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    String p = context.getConfiguration().get("mapred.fairscheduler.pool");
    max_tasks = Integer.parseInt(p.substring(p.indexOf("l") + 1));

    Iterator<Scan> scanIterator = scanList.iterator();
    Iterator<String> tableIterator = tableList.iterator();
    Iterator<String> varsIterator = varList.iterator();
    Iterator<String> fnameIterator = fnameList.iterator();
    try {
        SUBCLASS = (new H2RDFNode(Node.createURI("http://www.w3.org/2000/01/rdf-schema#subClassOf")))
                .getHashValue();
    } catch (NotSupportedDatatypeException e) {
        throw new IOException("Not supported datatype");
    }
    System.out.println("calculating splitnumber");
    conf = context.getConfiguration();
    Configuration HBconf = HBaseConfiguration.create();
    Scan scan = null;
    splits = new ArrayList<InputSplit>();
    while (scanIterator.hasNext()) {
        System.out.println("New Input BGP");
        scan = scanIterator.next();
        String tname = tableIterator.next();
        table = new HTable(HBconf, tname);
        keys = table.getStartEndKeys();
        String vars = varsIterator.next();
        String fname = fnameIterator.next();
        splitSubclass(scan, tname, vars, fname);
    }

    if (splits.size() <= max_tasks)
        context.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        context.getConfiguration().setInt("mapred.reduce.tasks", max_tasks);
    return splits;
}
From source file: gr.ntua.h2rdf.inputFormat2.MultiTableInputFormatBase.java
License: Open Source License

/**
 * Calculates the splits that will serve as input for the map tasks. The
 * number of splits matches the number of regions in a table.
 *
 * @param context The current job context.
 * @return The list of input splits.
 * @throws IOException When creating the list of splits fails.
 * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    if (scans.isEmpty()) {
        throw new IOException("No scans were provided.");
    }
    List<InputSplit> splits = new ArrayList<InputSplit>();

    for (Scan scan : scans) {
        byte[] tableName = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME);
        if (tableName == null)
            throw new IOException("A scan object did not have a table name");

        HTable table = new HTable(context.getConfiguration(), tableName);
        Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
        if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
            throw new IOException("Expecting at least one region for table : " + Bytes.toString(tableName));
        }
        int count = 0;

        byte[] startRow = scan.getStartRow();
        byte[] stopRow = scan.getStopRow();

        for (int i = 0; i < keys.getFirst().length; i++) {
            if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
                continue;
            }
            String regionLocation = table.getRegionLocation(keys.getFirst()[i], false).getHostname();

            // determine if the given start and stop keys fall into the range
            if ((startRow.length == 0 || keys.getSecond()[i].length == 0
                    || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0)
                    && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
                byte[] splitStart = startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0
                        ? keys.getFirst()[i] : startRow;
                byte[] splitStop = (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0)
                        && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;
                InputSplit split = new TableSplit(tableName, scan, splitStart, splitStop, regionLocation);
                splits.add(split);
                if (LOG.isDebugEnabled())
                    LOG.debug("getSplits: split -> " + (count++) + " -> " + split);
            }
        }
        table.close();
    }
    return splits;
}
/** * Calculates the splits that will serve as input for the map tasks. The * number of splits matches the number of regions in a table. * * @param context The current job context. * @return The list of input splits./*from ww w .jav a 2s. co m*/ * @throws IOException When creating the list of splits fails. * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext) */ @Override public List<InputSplit> getSplits(JobContext context) throws IOException { if (scans.isEmpty()) { throw new IOException("No scans were provided."); } List<InputSplit> splits = new ArrayList<InputSplit>(); for (Scan scan : scans) { byte[] tableName = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME); if (tableName == null) throw new IOException("A scan object did not have a table name"); HTable table = new HTable(context.getConfiguration(), tableName); Pair<byte[][], byte[][]> keys = table.getStartEndKeys(); if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) { throw new IOException("Expecting at least one region for table : " + Bytes.toString(tableName)); } int count = 0; byte[] startRow = scan.getStartRow(); byte[] stopRow = scan.getStopRow(); for (int i = 0; i < keys.getFirst().length; i++) { if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) { continue; } String regionLocation = table.getRegionLocation(keys.getFirst()[i], false).getHostname(); // determine if the given start and stop keys fall into the range if ((startRow.length == 0 || keys.getSecond()[i].length == 0 || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) { byte[] splitStart = startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ? keys.getFirst()[i] : startRow; byte[] splitStop = (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow; InputSplit split = new TableSplit(tableName, scan, splitStart, splitStop, regionLocation); splits.add(split); if (LOG.isDebugEnabled()) LOG.debug("getSplits: split -> " + (count++) + " -> " + split); } } table.close(); } return splits; }