List of usage examples for org.apache.hadoop.mapreduce.Job#getJobID
public JobID getJobID()
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputs.java
License:Apache License
/**
 * Builds a {@link JobContext} scoped to a single named output by resolving the
 * per-output {@link Job} and wrapping its configuration together with its job id.
 *
 * @param context     the enclosing job context
 * @param namedOutput logical name of the output to resolve
 * @return a job context for the named output's job
 * @throws IOException if the named job cannot be resolved
 */
static JobContext getNamedJobContext(JobContext context, String namedOutput) throws IOException {
    Job namedJob = getNamedJob(context, namedOutput);
    return new JobContextImpl(namedJob.getConfiguration(), namedJob.getJobID());
}
From source file:com.architecting.ch07.MapReduceIndexerTool.java
License:Apache License
/** Returns a short human-readable summary of the job (name and id) for log output. */
private String getJobInfo(Job job) {
    String name = job.getJobName();
    JobID id = job.getJobID();
    return "jobName: " + name + ", jobId: " + id;
}
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
@Override public boolean run(Job job) throws InterruptedException { JobID jobId = newJobId(new Random().nextInt(Integer.MAX_VALUE)); setJobId(job, jobId);//from www .j ava 2 s . co m LOG.info(MessageFormat.format("starting job using {0}: {1} ({2})", this, job.getJobID(), job.getJobName())); try { runJob(job); return true; } catch (InterruptedException e) { throw e; } catch (Exception e) { LOG.error(MessageFormat.format("exception was occurred while executing job: {0} ({1})", job.getJobID(), job.getJobName()), e); return false; } }
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
private void runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException { assert job.getJobID() != null; TaskID taskId = newMapTaskId(job.getJobID(), 0); Configuration conf = job.getConfiguration(); OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf); OutputCommitter committer = output//from w w w . j a v a 2 s . c o m .getOutputCommitter(newTaskAttemptContext(conf, newTaskAttemptId(taskId, 0))); boolean succeed = false; committer.setupJob(job); try { if (job.getNumReduceTasks() == 0) { runMap(job, null); } else { try (KeyValueSorter<?, ?> sorter = createSorter(job, job.getMapOutputKeyClass(), job.getMapOutputValueClass())) { runMap(job, sorter); runReduce(job, sorter); } } committer.commitJob(job); succeed = true; } finally { if (succeed == false) { try { committer.abortJob(job, State.FAILED); } catch (IOException e) { LOG.error(MessageFormat.format("error occurred while aborting job: {0} ({1})", job.getJobID(), job.getJobName()), e); } } } }
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
// Executes the map phase in-process: one fresh Mapper per input split, each run
// under its own TaskAttemptContext and committed (or aborted) individually.
// When 'sorter' is non-null the map output is fed into it for a later reduce
// phase; when null (map-only job) output goes straight to the job's OutputFormat.
@SuppressWarnings({ "rawtypes", "unchecked" })
private void runMap(Job job, KeyValueSorter<?, ?> sorter)
throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = job.getConfiguration();
InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
List<InputSplit> splits = input.getSplits(job);
// map task serial numbers start at 1; 0 is used for the job-level committer context
int serial = 1;
for (InputSplit split : splits) {
TaskAttemptID id = newTaskAttemptId(newMapTaskId(job.getJobID(), serial++), 0);
Mapper<?, ?, ?, ?> mapper = ReflectionUtils.newInstance(job.getMapperClass(), conf);
if (LOG.isDebugEnabled()) {
LOG.debug(MessageFormat.format("starting mapper: {0}@{1} ({2}bytes)", //$NON-NLS-1$
mapper.getClass().getName(), id, split.getLength()));
}
TaskAttemptContext context = newTaskAttemptContext(conf, id);
// we always obtain a new OutputFormat object / OutputFormat.getOutputCommiter() may be cached
OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
OutputCommitter committer = output.getOutputCommitter(context);
committer.setupTask(context);
boolean succeed = false;
try (RecordReader<?, ?> reader = input.createRecordReader(split, newTaskAttemptContext(conf, id))) {
RecordWriter<?, ?> writer;
if (sorter != null) {
// shuffle path: buffer map output for the subsequent reduce phase
writer = new ShuffleWriter(sorter);
} else {
// map-only path: write directly through the job's OutputFormat
writer = output.getRecordWriter(newTaskAttemptContext(conf, id));
}
try {
Mapper.Context c = newMapperContext(conf, id, reader, writer, committer, split);
reader.initialize(split, c);
mapper.run(c);
} finally {
// always close the writer, even if the mapper threw
writer.close(newTaskAttemptContext(conf, id));
}
doCommitTask(context, committer);
succeed = true;
} finally {
if (succeed == false) {
// any failure above leaves succeed == false; abort this task attempt
doAbortTask(context, committer);
}
}
}
}
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
// Executes the single in-process reduce task: reads the sorted map output back
// from 'sorter' via a ShuffleReader, runs the configured Reducer under its own
// TaskAttemptContext, and commits (or aborts) the task attempt.
@SuppressWarnings({ "unchecked", "rawtypes" })
private void runReduce(Job job, KeyValueSorter<?, ?> sorter)
throws ClassNotFoundException, IOException, InterruptedException {
Configuration conf = job.getConfiguration();
OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
// there is exactly one reduce task (serial 1) in this simple runner
TaskAttemptID id = newTaskAttemptId(newReduceTaskId(job.getJobID(), 1), 0);
Reducer<?, ?, ?, ?> reducer = ReflectionUtils.newInstance(job.getReducerClass(), conf);
if (LOG.isDebugEnabled()) {
LOG.debug(MessageFormat.format("starting reducer: {0}@{1} ({2}records, {3}bytes)", //$NON-NLS-1$
reducer.getClass().getName(), id, sorter.getRecordCount(), sorter.getSizeInBytes()));
}
TaskAttemptContext context = newTaskAttemptContext(conf, id);
OutputCommitter committer = output.getOutputCommitter(context);
committer.setupTask(context);
boolean succeed = false;
try {
ShuffleReader reader = new ShuffleReader(sorter, new Progress());
try {
RecordWriter<?, ?> writer = output.getRecordWriter(newTaskAttemptContext(conf, id));
try {
Reducer.Context c = newReducerContext(conf, id, reader, sorter.getKeyClass(),
sorter.getValueClass(), writer, committer, (RawComparator) job.getGroupingComparator());
reducer.run(c);
} finally {
writer.close(newTaskAttemptContext(conf, id));
}
} finally {
try {
reader.close();
} catch (IOException e) {
// close failure on the shuffle input is logged but not rethrown
// NOTE(review): log message reads "reducer mapper input" — likely meant "closing reducer input"
LOG.warn(MessageFormat.format("error occurred while reducer mapper input: {0} ({1})", id,
job.getJobName()), e);
}
}
doCommitTask(context, committer);
succeed = true;
} finally {
if (succeed == false) {
// any failure above leaves succeed == false; abort the task attempt
doAbortTask(context, committer);
}
}
}
From source file:com.asakusafw.runtime.stage.AbstractStageClient.java
License:Apache License
/**
 * Submits the job through the configured {@link JobRunner} and reports the result.
 * <p>
 * The runner class is taken from {@code StageConstants.PROP_JOB_RUNNER} when set,
 * otherwise {@link DefaultJobRunner#INSTANCE} is used. In simulation mode the job
 * is skipped and treated as successful. Elapsed time and outcome are logged.
 *
 * @param job the job to submit
 * @return {@code ToolLauncher.JOB_SUCCEEDED} or {@code ToolLauncher.JOB_FAILED}
 * @throws IOException            if job execution fails with an I/O error
 * @throws InterruptedException   if the executing thread is interrupted
 * @throws ClassNotFoundException if the configured runner class cannot be loaded
 */
private int submit(Job job) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    String runnerClassName = conf.get(StageConstants.PROP_JOB_RUNNER);
    JobRunner runner = DefaultJobRunner.INSTANCE;
    if (runnerClassName != null) {
        Class<?> runnerClass = conf.getClassByName(runnerClassName);
        runner = (JobRunner) ReflectionUtils.newInstance(runnerClass, conf);
    }
    LOG.info(MessageFormat.format("Submitting Job: {0} (runner: {1})", job.getJobName(), runner));
    long startedAt = System.currentTimeMillis();
    boolean succeed;
    if (RuntimeContext.get().isSimulation()) {
        // simulation mode: do not actually run anything, report success
        LOG.info(MessageFormat.format(
                "Job is skipped because current execution status is in simulation mode: name={0}",
                job.getJobName()));
        succeed = true;
    } else {
        succeed = runner.run(job);
    }
    long finishedAt = System.currentTimeMillis();
    LOG.info(MessageFormat.format("Job Finished: elapsed=[{3}]ms, succeed={2}, id={0}, name={1}",
            job.getJobID(), job.getJobName(), succeed, String.valueOf(finishedAt - startedAt)));
    return succeed ? ToolLauncher.JOB_SUCCEEDED : ToolLauncher.JOB_FAILED;
}
From source file:com.ask.hive.hbase.HiveHBaseTextTableInputFormat.java
License:Apache License
/**
 * Computes the input splits for a Hive-over-HBase table.
 * <p>
 * Resolves the HBase table and column mapping from the job configuration, builds a
 * {@link Scan} restricted by filter pushdown and the mapped column families/qualifiers,
 * then delegates to the superclass and wraps each resulting split as an
 * {@code HBaseSplit} carrying the table path.
 * <p>
 * NOTE(review): the original scraped listing was flattened so that a scraper-inserted
 * {@code //} comment swallowed the start of the column-add loop; this version restores
 * the intended formatting without changing behavior.
 *
 * @param jobConf   job configuration carrying the HBase table name and column mapping
 * @param numSplits requested split count (ignored; region layout drives the result)
 * @return one wrapped split per surviving region
 * @throws IOException if the column mapping is absent or cannot be parsed
 */
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    if (hbaseColumnsMapping == null) {
        throw new IOException("hbase.columns.mapping required for HBase Table.");
    }
    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();
    int iKey;
    try {
        iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies,
                hbaseColumnFamiliesBytes, hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (SerDeException se) {
        throw new IOException(se);
    }
    Scan scan = new Scan();
    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately. Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    convertFilter(jobConf, scan, null, iKey);
    // REVIEW: are we supposed to be applying the getReadColumnIDs
    // same as in getRecordReader?
    for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
        if (i == iKey) {
            // skip the row-key pseudo-column; it is not a real family/qualifier
            continue;
        }
        if (hbaseColumnQualifiers.get(i) == null) {
            scan.addFamily(hbaseColumnFamiliesBytes.get(i));
        } else {
            scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
        }
    }
    setScan(scan);
    Job job = new Job(jobConf);
    JobContext jobContext = new JobContext(job.getConfiguration(), job.getJobID());
    // NOTE(review): assumes at least one input path is configured — tablePaths[0]
    // would throw if none is set; confirm against callers.
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
        results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }
    return results;
}
From source file:com.ikanow.infinit.e.core.mapreduce.HadoopJobRunner.java
License:Open Source License
// Builds and launches a custom map/reduce job from a CustomMapReduceJobPojo:
// renders the job settings to an XML property document, loads them into a
// Hadoop Configuration, wires formats/mapper/reducer/combiner from the
// user-supplied JAR via a child classloader, and either waits (local mode)
// or submits asynchronously and returns the job id string.
// Returns "local_done" in local mode, the job id on remote submission, or an
// "Error: ..." message string on failure (errors are reported via the return
// value, not by throwing).
@SuppressWarnings({ "unchecked", "rawtypes" })
private String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation)
throws IOException, SAXException, ParserConfigurationException {
StringWriter xml = new StringWriter();
createConfigXML(xml, job.jobtitle, job.inputCollection,
getQueryOrProcessing(job.query, QuerySpec.INPUTFIELDS), job.isCustomTable, job.getOutputDatabase(),
job._id.toString(), job.outputCollectionTemp, job.mapper, job.reducer, job.combiner,
getQueryOrProcessing(job.query, QuerySpec.QUERY), job.communityIds, job.outputKey, job.outputValue,
job.arguments);
// Load user classes from the temp JAR through a child classloader; the
// previous context classloader is restored in the finally block below.
ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader();
URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
savedClassLoader);
Thread.currentThread().setContextClassLoader(child);
// Now load the XML into a configuration object:
Configuration config = new Configuration();
try {
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
// NOTE(review): getBytes() uses the platform default charset — confirm the
// generated XML is ASCII-safe or specify a charset explicitly.
Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes()));
NodeList nList = doc.getElementsByTagName("property");
for (int temp = 0; temp < nList.getLength(); temp++) {
Node nNode = nList.item(temp);
if (nNode.getNodeType() == Node.ELEMENT_NODE) {
Element eElement = (Element) nNode;
String name = getTagValue("name", eElement);
String value = getTagValue("value", eElement);
if ((null != name) && (null != value)) {
config.set(name, value);
}
}
}
} catch (Exception e) {
// NOTE(review): wrapping with only e.getMessage() drops the original stack trace
throw new IOException(e.getMessage());
}
// Now run the JAR file
try {
config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)
if (bLocalMode) {
// local mode: in-process job tracker and local filesystem
config.set("mapred.job.tracker", "local");
config.set("fs.default.name", "local");
} else {
// remote mode: read tracker/FS endpoints from the cluster's client config
String trackerUrl = HadoopUtils.getXMLProperty(
prop_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker");
String fsUrl = HadoopUtils.getXMLProperty(
prop_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name");
config.set("mapred.job.tracker", trackerUrl);
config.set("fs.default.name", fsUrl);
}
Job hj = new Job(config);
// The job JAR is located via the mapper class loaded from the child classloader
Class<?> classToLoad = Class.forName(job.mapper, true, child);
hj.setJarByClass(classToLoad);
hj.setInputFormatClass((Class<? extends InputFormat>) Class
.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child));
if ((null != job.exportToHdfs) && job.exportToHdfs) {
// export path: write sequence files to HDFS instead of MongoDB
hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
.forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child));
Path outPath = this.ensureOutputDirectory(job);
SequenceFileOutputFormat.setOutputPath(hj, outPath);
} else { // normal case, stays in MongoDB
hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
.forName("com.mongodb.hadoop.MongoOutputFormat", true, child));
}
hj.setMapperClass((Class<? extends Mapper>) Class.forName(job.mapper, true, child));
// reducer may be disabled with "null"/"none" -> map-only job
if ((null != job.reducer) && !job.reducer.equalsIgnoreCase("null")
&& !job.reducer.equalsIgnoreCase("none")) {
hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child));
} else {
hj.setNumReduceTasks(0);
}
if ((null != job.combiner) && !job.combiner.equalsIgnoreCase("null")
&& !job.combiner.equalsIgnoreCase("none")) {
hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child));
}
hj.setOutputKeyClass(Class.forName(job.outputKey, true, child));
hj.setOutputValueClass(Class.forName(job.outputValue, true, child));
hj.setJobName(job.jobtitle);
if (bLocalMode) {
// local mode runs synchronously; caller gets a sentinel, not a job id
hj.waitForCompletion(false);
return "local_done";
} else {
hj.submit();
String jobId = hj.getJobID().toString();
return jobId;
}
} catch (Exception e) {
e.printStackTrace();
// (classloader is also restored in the finally block; this earlier reset is redundant but harmless)
Thread.currentThread().setContextClassLoader(savedClassLoader);
return "Error: " + HarvestExceptionUtils.createExceptionMessage(e);
} finally {
Thread.currentThread().setContextClassLoader(savedClassLoader);
}
}
From source file:com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java
License:Open Source License
// Builds and launches a custom map/reduce job from a CustomMapReduceJobPojo.
// This is the richer variant of the launcher: it renders job settings to XML,
// merges in cluster client config, validates the user JAR's data-model version,
// wires input/output formats based on the input collection type (filesystem,
// binary shares, elasticsearch records, or MongoDB), handles HDFS export,
// debug scripts, distributed-cache entries, and variable reducer counts, then
// either runs to completion (local/test mode) or submits and returns the job id.
// Errors are reported via the returned string ("Error: ..."), not by throwing.
@SuppressWarnings({ "unchecked", "rawtypes" })
public String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation)
throws IOException, SAXException, ParserConfigurationException {
StringWriter xml = new StringWriter();
String outputCollection = job.outputCollectionTemp;// (non-append mode)
if ((null != job.appendResults) && job.appendResults)
outputCollection = job.outputCollection; // (append mode, write directly in....)
else if (null != job.incrementalMode)
job.incrementalMode = false; // (not allowed to be in incremental mode and not update mode)
createConfigXML(xml, job.jobtitle, job.inputCollection,
InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS),
job.isCustomTable, job.getOutputDatabase(), job._id.toString(), outputCollection, job.mapper,
job.reducer, job.combiner,
InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY),
job.communityIds, job.outputKey, job.outputValue, job.arguments, job.incrementalMode,
job.submitterID, job.selfMerge, job.outputCollection, job.appendResults);
// Load user classes from the temp JAR through a child classloader; the
// previous context classloader is restored in the finally block below.
ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader();
URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
savedClassLoader);
Thread.currentThread().setContextClassLoader(child);
// Check version: for now, any infinit.e.data_model with an VersionTest class is acceptable
boolean dataModelLoaded = true;
try {
// (separate classloader with no parent, so we test only what the JAR itself bundles)
URLClassLoader versionTest = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
null);
try {
Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, versionTest);
} catch (ClassNotFoundException e2) {
//(this is fine, will use the cached version)
dataModelLoaded = false;
}
if (dataModelLoaded)
Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoVersionTest", true, versionTest);
} catch (ClassNotFoundException e1) {
throw new RuntimeException(
"This JAR is compiled with too old a version of the data-model, please recompile with Jan 2014 (rc2) onwards");
}
// Now load the XML into a configuration object:
Configuration config = new Configuration();
// Add the client configuration overrides:
if (!bLocalMode) {
String hadoopConfigPath = props_custom.getHadoopConfigPath() + "/hadoop/";
config.addResource(new Path(hadoopConfigPath + "core-site.xml"));
config.addResource(new Path(hadoopConfigPath + "mapred-site.xml"));
config.addResource(new Path(hadoopConfigPath + "hadoop-site.xml"));
} //TESTED
try {
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
// NOTE(review): getBytes() uses the platform default charset — confirm the
// generated XML is ASCII-safe or specify a charset explicitly.
Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes()));
NodeList nList = doc.getElementsByTagName("property");
for (int temp = 0; temp < nList.getLength(); temp++) {
Node nNode = nList.item(temp);
if (nNode.getNodeType() == Node.ELEMENT_NODE) {
Element eElement = (Element) nNode;
String name = getTagValue("name", eElement);
String value = getTagValue("value", eElement);
if ((null != name) && (null != value)) {
config.set(name, value);
}
}
}
} catch (Exception e) {
// NOTE(review): wrapping with only e.getMessage() drops the original stack trace
throw new IOException(e.getMessage());
}
// Some other config defaults:
// (not sure if these are actually applied, or derived from the defaults - for some reason they don't appear in CDH's client config)
config.set("mapred.map.tasks.speculative.execution", "false");
config.set("mapred.reduce.tasks.speculative.execution", "false");
// (default security is ignored here, have it set via HADOOP_TASKTRACKER_CONF in cloudera)
// Now run the JAR file
try {
// The job's "query" field doubles as an advanced-configuration JSON object;
// fall back to an empty object if absent or unparseable.
BasicDBObject advancedConfigurationDbo = null;
try {
advancedConfigurationDbo = (null != job.query)
? ((BasicDBObject) com.mongodb.util.JSON.parse(job.query))
: (new BasicDBObject());
} catch (Exception e) {
advancedConfigurationDbo = new BasicDBObject();
}
boolean esMode = advancedConfigurationDbo.containsField("qt") && !job.isCustomTable;
if (esMode && !job.inputCollection.equals("doc_metadata.metadata")) {
throw new RuntimeException(
"Infinit.e Queries are only supported on doc_metadata - use MongoDB queries instead.");
}
config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)
if (bLocalMode) { // local job tracker and FS mode
config.set("mapred.job.tracker", "local");
config.set("fs.default.name", "local");
} else {
if (bTestMode) { // run job tracker locally but FS mode remotely
config.set("mapred.job.tracker", "local");
} else { // normal job tracker
String trackerUrl = HadoopUtils.getXMLProperty(
props_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker");
config.set("mapred.job.tracker", trackerUrl);
}
String fsUrl = HadoopUtils.getXMLProperty(
props_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name");
config.set("fs.default.name", fsUrl);
}
if (!dataModelLoaded && !(bTestMode || bLocalMode)) { // If running distributed and no data model loaded then add ourselves
Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
"infinit.e.data_model.jar", config);
DistributedCache.addFileToClassPath(jarToCache, config);
jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
"infinit.e.processing.custom.library.jar", config);
DistributedCache.addFileToClassPath(jarToCache, config);
} //TESTED
// Debug scripts (only if they exist), and only in non local/test mode
if (!bLocalMode && !bTestMode) {
try {
Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
"custom_map_error_handler.sh", config);
config.set("mapred.map.task.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
config.set("mapreduce.map.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
DistributedCache.createSymlink(config);
DistributedCache.addCacheFile(scriptToCache.toUri(), config);
} catch (Exception e) {
} // just carry on
try {
Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
"custom_reduce_error_handler.sh", config);
config.set("mapred.reduce.task.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
config.set("mapreduce.reduce.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
DistributedCache.createSymlink(config);
DistributedCache.addCacheFile(scriptToCache.toUri(), config);
} catch (Exception e) {
} // just carry on
} //TODO (???): TOTEST
// (need to do these 2 things here before the job is created, at which point the config class has been copied across)
//1)
Class<?> mapperClazz = Class.forName(job.mapper, true, child);
if (ICustomInfiniteInternalEngine.class.isAssignableFrom(mapperClazz)) { // Special case: internal custom engine, so gets an additional integration hook
ICustomInfiniteInternalEngine preActivities = (ICustomInfiniteInternalEngine) mapperClazz
.newInstance();
preActivities.preTaskActivities(job._id, job.communityIds, config, !(bTestMode || bLocalMode));
} //TESTED
//2)
if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {
// Need to download the GridFSZip file
try {
Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/unbundled/",
"GridFSZipFile.jar", config);
DistributedCache.addFileToClassPath(jarToCache, config);
} catch (Throwable t) {
} // (this is fine, will already be on the classpath .. otherwise lots of other stuff will be failing all over the place!)
}
if (job.inputCollection.equals("records")) {
InfiniteElasticsearchHadoopUtils.handleElasticsearchInput(job, config, advancedConfigurationDbo);
//(won't run under 0.19 so running with "records" should cause all sorts of exceptions)
} //TESTED (by hand)
if (bTestMode || bLocalMode) { // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place
config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
}
// Manually specified caches
List<URL> localJarCaches = InfiniteHadoopUtils.handleCacheList(advancedConfigurationDbo.get("$caches"),
job, config, props_custom);
Job hj = new Job(config); // (NOTE: from here, changes to config are ignored)
try {
if (null != localJarCaches) {
if (bLocalMode || bTestMode) {
// local/test mode: inject cache JARs into the child classloader via reflection
Method method = URLClassLoader.class.getDeclaredMethod("addURL", new Class[] { URL.class });
method.setAccessible(true);
method.invoke(child, localJarCaches.toArray());
} //TOTEST (tested logically)
}
Class<?> classToLoad = Class.forName(job.mapper, true, child);
hj.setJarByClass(classToLoad);
// Select the InputFormat based on the input collection type:
if (job.inputCollection.equalsIgnoreCase("filesystem")) {
String inputPath = null;
try {
inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
if (!inputPath.endsWith("/")) {
inputPath = inputPath + "/";
}
} catch (Exception e) {
}
if (null == inputPath) {
throw new RuntimeException("Must specify 'file.url' if reading from filesystem.");
}
inputPath = InfiniteHadoopUtils.authenticateInputDirectory(job, inputPath);
InfiniteFileInputFormat.addInputPath(hj, new Path(inputPath + "*/*")); // (that extra bit makes it recursive)
InfiniteFileInputFormat.setMaxInputSplitSize(hj, 33554432); // (32MB)
InfiniteFileInputFormat.setInfiniteInputPathFilter(hj, config);
hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
"com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat", true, child));
} else if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {
// binary shares: extract share object ids from an inf://share/ URL and authenticate them
String[] oidStrs = null;
try {
String inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
Pattern oidExtractor = Pattern.compile("inf://share/([^/]+)");
Matcher m = oidExtractor.matcher(inputPath);
if (m.find()) {
oidStrs = m.group(1).split("\\s*,\\s*");
} else {
throw new RuntimeException(
"file.url must be in format inf://share/<oid-list>/<string>: " + inputPath);
}
InfiniteHadoopUtils.authenticateShareList(job, oidStrs);
} catch (Exception e) {
throw new RuntimeException(
"Authentication error: " + e.getMessage() + ": " + advancedConfigurationDbo, e);
}
hj.getConfiguration().setStrings("mapred.input.dir", oidStrs);
hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
"com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat", true, child));
} else if (job.inputCollection.equals("records")) {
hj.setInputFormatClass((Class<? extends InputFormat>) Class
.forName("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat", true, child));
} else {
if (esMode) {
hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
"com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat",
true, child));
} else {
hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
"com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child));
}
}
if ((null != job.exportToHdfs) && job.exportToHdfs) {
//TODO (INF-2469): Also, if the output key is BSON then also run as text (but output as JSON?)
Path outPath = InfiniteHadoopUtils.ensureOutputDirectory(job, props_custom);
if ((null != job.outputKey) && (null != job.outputValue)
&& job.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text")
&& job.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) {
// (slight hack before I sort out the horrendous job class - if key/val both text and exporting to HDFS then output as Text)
hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
.forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", true, child));
TextOutputFormat.setOutputPath(hj, outPath);
} //TESTED
else {
hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
"org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child));
SequenceFileOutputFormat.setOutputPath(hj, outPath);
} //TESTED
} else { // normal case, stays in MongoDB
hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
"com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat", true, child));
}
hj.setMapperClass((Class<? extends Mapper>) mapperClazz);
// Optional map-output key/value class overrides from the advanced configuration:
String mapperOutputKeyOverride = advancedConfigurationDbo.getString("$mapper_key_class", null);
if (null != mapperOutputKeyOverride) {
hj.setMapOutputKeyClass(Class.forName(mapperOutputKeyOverride));
} //TESTED
String mapperOutputValueOverride = advancedConfigurationDbo.getString("$mapper_value_class", null);
if (null != mapperOutputValueOverride) {
hj.setMapOutputValueClass(Class.forName(mapperOutputValueOverride));
} //TESTED
// reducer may be disabled with "#...", "null" or "none" -> map-only job
if ((null != job.reducer) && !job.reducer.startsWith("#") && !job.reducer.equalsIgnoreCase("null")
&& !job.reducer.equalsIgnoreCase("none")) {
hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child));
// Variable reducers:
if (null != job.query) {
try {
hj.setNumReduceTasks(advancedConfigurationDbo.getInt("$reducers", 1));
} catch (Exception e) {
try {
// (just check it's not a string that is a valid int)
hj.setNumReduceTasks(
Integer.parseInt(advancedConfigurationDbo.getString("$reducers", "1")));
} catch (Exception e2) {
}
}
} //TESTED
} else {
hj.setNumReduceTasks(0);
}
if ((null != job.combiner) && !job.combiner.startsWith("#")
&& !job.combiner.equalsIgnoreCase("null") && !job.combiner.equalsIgnoreCase("none")) {
hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child));
}
hj.setOutputKeyClass(Class.forName(job.outputKey, true, child));
hj.setOutputValueClass(Class.forName(job.outputValue, true, child));
hj.setJobName(job.jobtitle);
currJobName = job.jobtitle;
} catch (Error e) { // (messing about with class loaders = lots of chances for errors!)
throw new RuntimeException(e.getMessage(), e);
}
if (bTestMode || bLocalMode) {
// local/test mode: submit, attach ourselves as a log appender, and poll to completion
hj.submit();
currThreadId = null;
Logger.getRootLogger().addAppender(this);
currLocalJobId = hj.getJobID().toString();
currLocalJobErrs.setLength(0);
while (!hj.isComplete()) {
Thread.sleep(1000);
}
Logger.getRootLogger().removeAppender(this);
if (hj.isSuccessful()) {
if (this.currLocalJobErrs.length() > 0) {
return "local_done: " + this.currLocalJobErrs.toString();
} else {
return "local_done";
}
} else {
return "Error: " + this.currLocalJobErrs.toString();
}
} else {
// remote mode: asynchronous submission, return the cluster job id
hj.submit();
String jobId = hj.getJobID().toString();
return jobId;
}
} catch (Exception e) {
e.printStackTrace();
// (classloader is also restored in the finally block; this earlier reset is redundant but harmless)
Thread.currentThread().setContextClassLoader(savedClassLoader);
return "Error: " + InfiniteHadoopUtils.createExceptionMessage(e);
} finally {
Thread.currentThread().setContextClassLoader(savedClassLoader);
}
}