List of usage examples for org.apache.hadoop.mapred JobConf setWorkingDirectory
public void setWorkingDirectory(Path dir)
From source file:io.druid.indexer.updater.HadoopConverterJob.java
License:Apache License
public List<DataSegment> run() throws IOException { final JobConf jobConf = new JobConf(); jobConf.setKeepFailedTaskFiles(false); for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) { jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()"); }/*from w w w . j av a 2 s. c om*/ final List<DataSegment> segments = converterConfig.getSegments(); if (segments.isEmpty()) { throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource()); } converterConfigIntoConfiguration(converterConfig, segments, jobConf); jobConf.setNumReduceTasks(0);// Map only. Number of map tasks determined by input format jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache())); setJobName(jobConf, segments); if (converterConfig.getJobPriority() != null) { jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority())); } final Job job = Job.getInstance(jobConf); job.setInputFormatClass(ConfigInputFormat.class); job.setMapperClass(ConvertingMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setMapSpeculativeExecution(false); job.setOutputFormatClass(ConvertingOutputFormat.class); JobHelper.setupClasspath(JobHelper.distributedClassPath(jobConf.getWorkingDirectory()), JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())), job); Throwable throwable = null; try { job.submit(); log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL()); final boolean success = job.waitForCompletion(true); if (!success) { final TaskReport[] reports = job.getTaskReports(TaskType.MAP); if (reports != null) { for (final TaskReport report : reports) { log.error("Error in task [%s] : %s", report.getTaskId(), Arrays.toString(report.getDiagnostics())); } } return null; } try { loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue(); writtenBytes 
= job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue(); } catch (IOException ex) { log.error(ex, "Could not fetch counters"); } final JobID jobID = job.getJobID(); final Path jobDir = getJobPath(jobID, job.getWorkingDirectory()); final FileSystem fs = jobDir.getFileSystem(job.getConfiguration()); final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true); final List<Path> goodPaths = new ArrayList<>(); while (it.hasNext()) { final LocatedFileStatus locatedFileStatus = it.next(); if (locatedFileStatus.isFile()) { final Path myPath = locatedFileStatus.getPath(); if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) { goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY)); } } } if (goodPaths.isEmpty()) { log.warn("No good data found at [%s]", jobDir); return null; } final List<DataSegment> returnList = ImmutableList .copyOf(Lists.transform(goodPaths, new Function<Path, DataSegment>() { @Nullable @Override public DataSegment apply(final Path input) { try { if (!fs.exists(input)) { throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]", ConvertingOutputFormat.DATA_SUCCESS_KEY, ConvertingOutputFormat.DATA_FILE_KEY, jobDir); } } catch (final IOException e) { throw Throwables.propagate(e); } try (final InputStream stream = fs.open(input)) { return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class); } catch (final IOException e) { throw Throwables.propagate(e); } } })); if (returnList.size() == segments.size()) { return returnList; } else { throw new ISE( "Tasks reported success but result length did not match! 
Expected %d found %d at path [%s]", segments.size(), returnList.size(), jobDir); } } catch (InterruptedException | ClassNotFoundException e) { RuntimeException exception = Throwables.propagate(e); throwable = exception; throw exception; } catch (Throwable t) { throwable = t; throw t; } finally { try { cleanup(job); } catch (IOException e) { if (throwable != null) { throwable.addSuppressed(e); } else { log.error(e, "Could not clean up job [%s]", job.getJobID()); } } } }
From source file:org.apache.ambari.servicemonitor.unit.BaseLocalClusterTestCase.java
License:Apache License
/** * Bond the job configuration to the directory set up for this class for data * @param tc test case/* ww w. jav a 2 s. c o m*/ * @param testname test name * @param jobConf job conf to configure */ protected void bondDataOutputDir(BaseLocalClusterTestCase tc, String testname, JobConf jobConf) { Path datadir = new Path(getDataDir(tc, testname)); jobConf.setWorkingDirectory(new Path(datadir, "working")); jobConf.set(HadoopKeys.MAPRED_OUTPUT_DIR, new Path(datadir, "output").toString()); }
From source file:org.apache.druid.indexer.updater.HadoopConverterJob.java
License:Apache License
public List<DataSegment> run() throws IOException { final JobConf jobConf = new JobConf(); jobConf.setKeepFailedTaskFiles(false); for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) { jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()"); }//from w ww.j a v a2 s. c o m final List<DataSegment> segments = converterConfig.getSegments(); if (segments.isEmpty()) { throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource()); } converterConfigIntoConfiguration(converterConfig, segments, jobConf); jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache())); setJobName(jobConf, segments); if (converterConfig.getJobPriority() != null) { jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority())); } final Job job = Job.getInstance(jobConf); job.setInputFormatClass(ConfigInputFormat.class); job.setMapperClass(ConvertingMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setMapSpeculativeExecution(false); job.setOutputFormatClass(ConvertingOutputFormat.class); JobHelper.setupClasspath(JobHelper.distributedClassPath(jobConf.getWorkingDirectory()), JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())), job); Throwable throwable = null; try { job.submit(); log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL()); final boolean success = job.waitForCompletion(true); if (!success) { final TaskReport[] reports = job.getTaskReports(TaskType.MAP); if (reports != null) { for (final TaskReport report : reports) { log.error("Error in task [%s] : %s", report.getTaskId(), Arrays.toString(report.getDiagnostics())); } } return null; } try { loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue(); writtenBytes = 
job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue(); } catch (IOException ex) { log.error(ex, "Could not fetch counters"); } final JobID jobID = job.getJobID(); final Path jobDir = getJobPath(jobID, job.getWorkingDirectory()); final FileSystem fs = jobDir.getFileSystem(job.getConfiguration()); final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true); final List<Path> goodPaths = new ArrayList<>(); while (it.hasNext()) { final LocatedFileStatus locatedFileStatus = it.next(); if (locatedFileStatus.isFile()) { final Path myPath = locatedFileStatus.getPath(); if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) { goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY)); } } } if (goodPaths.isEmpty()) { log.warn("No good data found at [%s]", jobDir); return null; } final List<DataSegment> returnList = ImmutableList .copyOf(Lists.transform(goodPaths, new Function<Path, DataSegment>() { @Nullable @Override public DataSegment apply(final Path input) { try { if (!fs.exists(input)) { throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]", ConvertingOutputFormat.DATA_SUCCESS_KEY, ConvertingOutputFormat.DATA_FILE_KEY, jobDir); } } catch (final IOException e) { throw Throwables.propagate(e); } try (final InputStream stream = fs.open(input)) { return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class); } catch (final IOException e) { throw Throwables.propagate(e); } } })); if (returnList.size() == segments.size()) { return returnList; } else { throw new ISE( "Tasks reported success but result length did not match! 
Expected %d found %d at path [%s]", segments.size(), returnList.size(), jobDir); } } catch (InterruptedException | ClassNotFoundException e) { RuntimeException exception = Throwables.propagate(e); throwable = exception; throw exception; } catch (Throwable t) { throwable = t; throw t; } finally { try { cleanup(job); } catch (IOException e) { if (throwable != null) { throwable.addSuppressed(e); } else { log.error(e, "Could not clean up job [%s]", job.getJobID()); } } } }
From source file:org.apache.hcatalog.hbase.TestHBaseBulkOutputFormat.java
License:Apache License
@Test public void hbaseBulkOutputFormatTest() throws IOException, ClassNotFoundException, InterruptedException { String testName = "hbaseBulkOutputFormatTest"; Path methodTestDir = new Path(getTestDir(), testName); LOG.info("starting: " + testName); String tableName = newTableName(testName).toLowerCase(); String familyName = "my_family"; byte[] familyNameBytes = Bytes.toBytes(familyName); //include hbase config in conf file Configuration conf = new Configuration(allConf); //create table conf.set(HBaseConstants.PROPERTY_OUTPUT_TABLE_NAME_KEY, tableName); conf.set("yarn.scheduler.capacity.root.queues", "default"); conf.set("yarn.scheduler.capacity.root.default.capacity", "100"); createTable(tableName, new String[] { familyName }); String data[] = { "1,english:one,spanish:uno", "2,english:two,spanish:dos", "3,english:three,spanish:tres" }; // input/output settings Path inputPath = new Path(methodTestDir, "mr_input"); FSDataOutputStream os = getFileSystem().create(new Path(inputPath, "inputFile.txt")); for (String line : data) os.write(Bytes.toBytes(line + "\n")); os.close();/*from ww w.j av a2s . 
c om*/ Path interPath = new Path(methodTestDir, "inter"); //create job JobConf job = new JobConf(conf); job.setWorkingDirectory(new Path(methodTestDir, "mr_work")); job.setJarByClass(this.getClass()); job.setMapperClass(MapWriteOldMapper.class); job.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class); org.apache.hadoop.mapred.TextInputFormat.setInputPaths(job, inputPath); job.setOutputFormat(HBaseBulkOutputFormat.class); org.apache.hadoop.mapred.SequenceFileOutputFormat.setOutputPath(job, interPath); job.setOutputCommitter(HBaseBulkOutputCommitter.class); //manually create transaction RevisionManager rm = HBaseRevisionManagerUtil.getOpenedRevisionManager(conf); try { OutputJobInfo outputJobInfo = OutputJobInfo.create("default", tableName, null); Transaction txn = rm.beginWriteTransaction(tableName, Arrays.asList(familyName)); outputJobInfo.getProperties().setProperty(HBaseConstants.PROPERTY_WRITE_TXN_KEY, HCatUtil.serialize(txn)); job.set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(outputJobInfo)); } finally { rm.close(); } job.setMapOutputKeyClass(ImmutableBytesWritable.class); job.setMapOutputValueClass(HCatRecord.class); job.setOutputKeyClass(ImmutableBytesWritable.class); job.setOutputValueClass(HCatRecord.class); job.setNumReduceTasks(0); RunningJob runJob = JobClient.runJob(job); runJob.waitForCompletion(); assertTrue(runJob.isSuccessful()); //verify HTable table = new HTable(conf, tableName); Scan scan = new Scan(); scan.addFamily(familyNameBytes); ResultScanner scanner = table.getScanner(scan); int index = 0; for (Result result : scanner) { String vals[] = data[index].toString().split(","); for (int i = 1; i < vals.length; i++) { String pair[] = vals[i].split(":"); assertTrue(result.containsColumn(familyNameBytes, Bytes.toBytes(pair[0]))); assertEquals(pair[1], Bytes.toString(result.getValue(familyNameBytes, Bytes.toBytes(pair[0])))); } index++; } //test if load count is the same assertEquals(data.length, index); //test if scratch 
directory was erased assertFalse(FileSystem.get(job).exists(interPath)); }
From source file:org.apache.hcatalog.hbase.TestHBaseDirectOutputFormat.java
License:Apache License
@Test public void directOutputFormatTest() throws IOException, ClassNotFoundException, InterruptedException { String testName = "directOutputFormatTest"; Path methodTestDir = new Path(getTestDir(), testName); String tableName = newTableName(testName).toLowerCase(); String familyName = "my_family"; byte[] familyNameBytes = Bytes.toBytes(familyName); //include hbase config in conf file Configuration conf = new Configuration(allConf); conf.set(HCatConstants.HCAT_KEY_HIVE_CONF, HCatUtil.serialize(allConf.getAllProperties())); //create table createTable(tableName, new String[] { familyName }); String data[] = { "1,english:ONE,spanish:UNO", "2,english:ONE,spanish:DOS", "3,english:ONE,spanish:TRES" }; // input/output settings Path inputPath = new Path(methodTestDir, "mr_input"); getFileSystem().mkdirs(inputPath);/*from ww w . j a v a2 s.c o m*/ FSDataOutputStream os = getFileSystem().create(new Path(inputPath, "inputFile.txt")); for (String line : data) os.write(Bytes.toBytes(line + "\n")); os.close(); //create job JobConf job = new JobConf(conf); job.setJobName(testName); job.setWorkingDirectory(new Path(methodTestDir, "mr_work")); job.setJarByClass(this.getClass()); job.setMapperClass(MapWrite.class); job.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class); org.apache.hadoop.mapred.TextInputFormat.setInputPaths(job, inputPath); job.setOutputFormat(HBaseDirectOutputFormat.class); job.set(TableOutputFormat.OUTPUT_TABLE, tableName); job.set(HBaseConstants.PROPERTY_OUTPUT_TABLE_NAME_KEY, tableName); //manually create transaction RevisionManager rm = HBaseRevisionManagerUtil.getOpenedRevisionManager(conf); try { OutputJobInfo outputJobInfo = OutputJobInfo.create("default", tableName, null); Transaction txn = rm.beginWriteTransaction(tableName, Arrays.asList(familyName)); outputJobInfo.getProperties().setProperty(HBaseConstants.PROPERTY_WRITE_TXN_KEY, HCatUtil.serialize(txn)); job.set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(outputJobInfo)); } finally 
{ rm.close(); } job.setMapOutputKeyClass(BytesWritable.class); job.setMapOutputValueClass(HCatRecord.class); job.setOutputKeyClass(BytesWritable.class); job.setOutputValueClass(HCatRecord.class); job.setNumReduceTasks(0); RunningJob runJob = JobClient.runJob(job); runJob.waitForCompletion(); assertTrue(runJob.isSuccessful()); //verify HTable table = new HTable(conf, tableName); Scan scan = new Scan(); scan.addFamily(familyNameBytes); ResultScanner scanner = table.getScanner(scan); int index = 0; for (Result result : scanner) { String vals[] = data[index].toString().split(","); for (int i = 1; i < vals.length; i++) { String pair[] = vals[i].split(":"); assertTrue(result.containsColumn(familyNameBytes, Bytes.toBytes(pair[0]))); assertEquals(pair[1], Bytes.toString(result.getValue(familyNameBytes, Bytes.toBytes(pair[0])))); } index++; } assertEquals(data.length, index); }
From source file:org.apache.hive.hcatalog.hbase.TestHiveHBaseTableOutputFormat.java
License:Apache License
@Test public void directOutputFormatTest() throws IOException, ClassNotFoundException, InterruptedException { String testName = "directOutputFormatTest"; Path methodTestDir = new Path(getTestDir(), testName); String tableName = newTableName(testName).toLowerCase(); String familyName = "my_family"; byte[] familyNameBytes = Bytes.toBytes(familyName); //include hbase config in conf file Configuration conf = new Configuration(allConf); conf.set(HCatConstants.HCAT_KEY_HIVE_CONF, HCatUtil.serialize(allConf.getAllProperties())); //create table createTable(tableName, new String[] { familyName }); String data[] = { "1,english:ONE,spanish:UNO", "2,english:TWO,spanish:DOS", "3,english:THREE,spanish:TRES" }; // input/output settings Path inputPath = new Path(methodTestDir, "mr_input"); getFileSystem().mkdirs(inputPath);/* w w w . ja v a 2 s.c o m*/ FSDataOutputStream os = getFileSystem().create(new Path(inputPath, "inputFile.txt")); for (String line : data) os.write(Bytes.toBytes(line + "\n")); os.close(); //create job JobConf job = new JobConf(conf); job.setJobName(testName); job.setWorkingDirectory(new Path(methodTestDir, "mr_work")); job.setJarByClass(this.getClass()); job.setMapperClass(MapWrite.class); job.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class); org.apache.hadoop.mapred.TextInputFormat.setInputPaths(job, inputPath); // why we need to set all the 3 properties?? 
job.setOutputFormat(HiveHBaseTableOutputFormat.class); job.set(HBaseSerDe.HBASE_TABLE_NAME, tableName); job.set(TableOutputFormat.OUTPUT_TABLE, tableName); job.set(HCatConstants.HCAT_DEFAULT_TOPIC_PREFIX + ".hbase.mapreduce.outputTableName", tableName); try { OutputJobInfo outputJobInfo = OutputJobInfo.create("default", tableName, null); job.set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(outputJobInfo)); } catch (Exception ex) { throw new IOException("Serialization error " + ex.getMessage(), ex); } job.setMapOutputKeyClass(BytesWritable.class); job.setMapOutputValueClass(HCatRecord.class); job.setOutputKeyClass(BytesWritable.class); job.setOutputValueClass(HCatRecord.class); job.setNumReduceTasks(0); System.getProperty("java.classpath"); RunningJob runJob = JobClient.runJob(job); runJob.waitForCompletion(); assertTrue(runJob.isSuccessful()); //verify HTable table = new HTable(conf, tableName); Scan scan = new Scan(); scan.addFamily(familyNameBytes); ResultScanner scanner = table.getScanner(scan); int index = 0; for (Result result : scanner) { String vals[] = data[index].toString().split(","); for (int i = 1; i < vals.length; i++) { String pair[] = vals[i].split(":"); assertTrue(result.containsColumn(familyNameBytes, Bytes.toBytes(pair[0]))); assertEquals(pair[1], Bytes.toString(result.getValue(familyNameBytes, Bytes.toBytes(pair[0])))); } index++; } assertEquals(data.length, index); }
From source file:org.pentaho.di.job.entries.hadoopjobexecutor.JobEntryHadoopJobExecutor.java
License:Apache License
public Result execute(Result result, int arg1) throws KettleException { result.setNrErrors(0);// w w w. j a va 2 s .co m Log4jFileAppender appender = null; String logFileName = "pdi-" + this.getName(); //$NON-NLS-1$ String hadoopDistro = System.getProperty("hadoop.distribution.name", hadoopDistribution); hadoopDistro = environmentSubstitute(hadoopDistro); if (Const.isEmpty(hadoopDistro)) { hadoopDistro = "generic"; } try { appender = LogWriter.createFileAppender(logFileName, true, false); LogWriter.getInstance().addAppender(appender); log.setLogLevel(parentJob.getLogLevel()); } catch (Exception e) { logError(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.FailedToOpenLogFile", logFileName, //$NON-NLS-1$ e.toString())); logError(Const.getStackTracker(e)); } try { URL resolvedJarUrl = null; String jarUrlS = environmentSubstitute(jarUrl); if (jarUrlS.indexOf("://") == -1) { // default to file:// File jarFile = new File(jarUrlS); resolvedJarUrl = jarFile.toURI().toURL(); } else { resolvedJarUrl = new URL(jarUrlS); } final String cmdLineArgsS = environmentSubstitute(cmdLineArgs); if (log.isDetailed()) logDetailed(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.ResolvedJar", resolvedJarUrl.toExternalForm())); if (isSimple) { /* final AtomicInteger taskCount = new AtomicInteger(0); final AtomicInteger successCount = new AtomicInteger(0); final AtomicInteger failedCount = new AtomicInteger(0); */ if (log.isDetailed()) logDetailed(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.SimpleMode")); List<Class<?>> classesWithMains = JarUtility .getClassesInJarWithMain(resolvedJarUrl.toExternalForm(), getClass().getClassLoader()); for (final Class<?> clazz : classesWithMains) { Runnable r = new Runnable() { public void run() { try { final ClassLoader cl = Thread.currentThread().getContextClassLoader(); try { // taskCount.incrementAndGet(); Thread.currentThread().setContextClassLoader(clazz.getClassLoader()); Method mainMethod = clazz.getMethod("main", new 
Class[] { String[].class }); Object[] args = (cmdLineArgsS != null) ? new Object[] { cmdLineArgsS.split(" ") } : new Object[0]; mainMethod.invoke(null, args); } finally { Thread.currentThread().setContextClassLoader(cl); // successCount.incrementAndGet(); // taskCount.decrementAndGet(); } } catch (Throwable ignored) { // skip, try the next one // logError(ignored.getMessage()); // failedCount.incrementAndGet(); ignored.printStackTrace(); } } }; Thread t = new Thread(r); t.start(); } // uncomment to implement blocking /* if (blocking) { while (taskCount.get() > 0 && !parentJob.isStopped()) { Thread.sleep(1000); } if (!parentJob.isStopped()) { result.setResult(successCount.get() > 0); result.setNrErrors((successCount.get() > 0) ? 0 : 1); } else { // we can't really know at this stage if // the hadoop job will finish successfully // because we have to stop now result.setResult(true); // look on the bright side of life :-)... result.setNrErrors(0); } } else { */ // non-blocking - just set success equal to no failures arising // from invocation // result.setResult(failedCount.get() == 0); // result.setNrErrors(failedCount.get()); result.setResult(true); result.setNrErrors(0); /* } */ } else { if (log.isDetailed()) logDetailed(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.AdvancedMode")); URL[] urls = new URL[] { resolvedJarUrl }; URLClassLoader loader = new URLClassLoader(urls, getClass().getClassLoader()); JobConf conf = new JobConf(); String hadoopJobNameS = environmentSubstitute(hadoopJobName); conf.setJobName(hadoopJobNameS); String outputKeyClassS = environmentSubstitute(outputKeyClass); conf.setOutputKeyClass(loader.loadClass(outputKeyClassS)); String outputValueClassS = environmentSubstitute(outputValueClass); conf.setOutputValueClass(loader.loadClass(outputValueClassS)); if (mapperClass != null) { String mapperClassS = environmentSubstitute(mapperClass); Class<? extends Mapper> mapper = (Class<? 
extends Mapper>) loader.loadClass(mapperClassS); conf.setMapperClass(mapper); } if (combinerClass != null) { String combinerClassS = environmentSubstitute(combinerClass); Class<? extends Reducer> combiner = (Class<? extends Reducer>) loader.loadClass(combinerClassS); conf.setCombinerClass(combiner); } if (reducerClass != null) { String reducerClassS = environmentSubstitute(reducerClass); Class<? extends Reducer> reducer = (Class<? extends Reducer>) loader.loadClass(reducerClassS); conf.setReducerClass(reducer); } if (inputFormatClass != null) { String inputFormatClassS = environmentSubstitute(inputFormatClass); Class<? extends InputFormat> inputFormat = (Class<? extends InputFormat>) loader .loadClass(inputFormatClassS); conf.setInputFormat(inputFormat); } if (outputFormatClass != null) { String outputFormatClassS = environmentSubstitute(outputFormatClass); Class<? extends OutputFormat> outputFormat = (Class<? extends OutputFormat>) loader .loadClass(outputFormatClassS); conf.setOutputFormat(outputFormat); } String hdfsHostnameS = environmentSubstitute(hdfsHostname); String hdfsPortS = environmentSubstitute(hdfsPort); String jobTrackerHostnameS = environmentSubstitute(jobTrackerHostname); String jobTrackerPortS = environmentSubstitute(jobTrackerPort); // See if we can auto detect the distribution first HadoopConfigurer configurer = HadoopConfigurerFactory.locateConfigurer(); if (configurer == null) { // go with what has been selected by the user configurer = HadoopConfigurerFactory.getConfigurer(hadoopDistro); // if the user-specified distribution is detectable, make sure it is still // the current distribution! 
if (configurer != null && configurer.isDetectable()) { if (!configurer.isAvailable()) { throw new KettleException(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.Error.DistroNoLongerPresent", configurer.distributionName())); } } } if (configurer == null) { throw new KettleException(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.Error.UnknownHadoopDistribution", hadoopDistro)); } logBasic(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.Message.DistroConfigMessage", configurer.distributionName())); List<String> configMessages = new ArrayList<String>(); configurer.configure(hdfsHostnameS, hdfsPortS, jobTrackerHostnameS, jobTrackerPortS, conf, configMessages); for (String m : configMessages) { logBasic(m); } String inputPathS = environmentSubstitute(inputPath); String[] inputPathParts = inputPathS.split(","); List<Path> paths = new ArrayList<Path>(); for (String path : inputPathParts) { paths.add(new Path(configurer.getFilesystemURL() + path)); } Path[] finalPaths = paths.toArray(new Path[paths.size()]); //FileInputFormat.setInputPaths(conf, new Path(configurer.getFilesystemURL() + inputPathS)); FileInputFormat.setInputPaths(conf, finalPaths); String outputPathS = environmentSubstitute(outputPath); FileOutputFormat.setOutputPath(conf, new Path(configurer.getFilesystemURL() + outputPathS)); // process user defined values for (UserDefinedItem item : userDefined) { if (item.getName() != null && !"".equals(item.getName()) && item.getValue() != null && !"".equals(item.getValue())) { String nameS = environmentSubstitute(item.getName()); String valueS = environmentSubstitute(item.getValue()); conf.set(nameS, valueS); } } String workingDirectoryS = environmentSubstitute(workingDirectory); conf.setWorkingDirectory(new Path(configurer.getFilesystemURL() + workingDirectoryS)); conf.setJar(jarUrl); String numMapTasksS = environmentSubstitute(numMapTasks); String numReduceTasksS = environmentSubstitute(numReduceTasks); int numM = 1; try { numM = 
Integer.parseInt(numMapTasksS); } catch (NumberFormatException e) { logError("Can't parse number of map tasks '" + numMapTasksS + "'. Setting num" + "map tasks to 1"); } int numR = 1; try { numR = Integer.parseInt(numReduceTasksS); } catch (NumberFormatException e) { logError("Can't parse number of reduce tasks '" + numReduceTasksS + "'. Setting num" + "reduce tasks to 1"); } conf.setNumMapTasks(numM); conf.setNumReduceTasks(numR); JobClient jobClient = new JobClient(conf); RunningJob runningJob = jobClient.submitJob(conf); String loggingIntervalS = environmentSubstitute(loggingInterval); int logIntv = 60; try { logIntv = Integer.parseInt(loggingIntervalS); } catch (NumberFormatException e) { logError("Can't parse logging interval '" + loggingIntervalS + "'. Setting " + "logging interval to 60"); } if (blocking) { try { int taskCompletionEventIndex = 0; while (!parentJob.isStopped() && !runningJob.isComplete()) { if (logIntv >= 1) { printJobStatus(runningJob); taskCompletionEventIndex = logTaskMessages(runningJob, taskCompletionEventIndex); Thread.sleep(logIntv * 1000); } else { Thread.sleep(60000); } } if (parentJob.isStopped() && !runningJob.isComplete()) { // We must stop the job running on Hadoop runningJob.killJob(); // Indicate this job entry did not complete result.setResult(false); } printJobStatus(runningJob); // Log any messages we may have missed while polling logTaskMessages(runningJob, taskCompletionEventIndex); } catch (InterruptedException ie) { logError(ie.getMessage(), ie); } // Entry is successful if the MR job is successful overall result.setResult(runningJob.isSuccessful()); } } } catch (Throwable t) { t.printStackTrace(); result.setStopped(true); result.setNrErrors(1); result.setResult(false); logError(t.getMessage(), t); } if (appender != null) { LogWriter.getInstance().removeAppender(appender); appender.close(); ResultFile resultFile = new ResultFile(ResultFile.FILE_TYPE_LOG, appender.getFile(), parentJob.getJobname(), getName()); 
result.getResultFiles().put(resultFile.getFile().toString(), resultFile); } return result; }
From source file:org.pentaho.hadoop.mapreduce.test.MapperAndReducerTest.java
License:Open Source License
public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile, String reducerTransformationFile, String hostname, String hdfsPort, String trackerPort) throws IOException, KettleException { JobConf conf = new JobConf(); conf.setJobName("wordcount"); KettleEnvironment.init();//from ww w. j a v a2 s.c om // Register Map/Reduce Input and Map/Reduce Output plugin steps PluginMainClassType mainClassTypesAnnotation = StepPluginType.class .getAnnotation(PluginMainClassType.class); Map<Class<?>, String> inputClassMap = new HashMap<Class<?>, String>(); inputClassMap.put(mainClassTypesAnnotation.value(), HadoopEnterMeta.class.getName()); PluginInterface inputStepPlugin = new Plugin(new String[] { "HadoopEnterPlugin" }, StepPluginType.class, mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Input", "Enter a Hadoop Mapper or Reducer transformation", "MRI.png", false, false, inputClassMap, new ArrayList<String>(), null, null); PluginRegistry.getInstance().registerPlugin(StepPluginType.class, inputStepPlugin); Map<Class<?>, String> outputClassMap = new HashMap<Class<?>, String>(); outputClassMap.put(mainClassTypesAnnotation.value(), HadoopExitMeta.class.getName()); PluginInterface outputStepPlugin = new Plugin(new String[] { "HadoopExitPlugin" }, StepPluginType.class, mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Output", "Exit a Hadoop Mapper or Reducer transformation", "MRO.png", false, false, outputClassMap, new ArrayList<String>(), null, null); PluginRegistry.getInstance().registerPlugin(StepPluginType.class, outputStepPlugin); TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration(); TransMeta transMeta = null; TransConfiguration transConfig = null; if (mapperTransformationFile != null) { conf.setMapRunnerClass(PentahoMapRunnable.class); transMeta = new TransMeta(mapperTransformationFile); transConfig = new TransConfiguration(transMeta, transExecConfig); conf.set("transformation-map-xml", 
transConfig.getXML()); conf.set("transformation-map-input-stepname", "Injector"); conf.set("transformation-map-output-stepname", "Output"); } if (combinerTransformationFile != null) { conf.setCombinerClass(GenericTransCombiner.class); transMeta = new TransMeta(combinerTransformationFile); transConfig = new TransConfiguration(transMeta, transExecConfig); conf.set("transformation-combiner-xml", transConfig.getXML()); conf.set("transformation-combiner-input-stepname", "Injector"); conf.set("transformation-combiner-output-stepname", "Output"); } if (reducerTransformationFile != null) { conf.setReducerClass((Class<? extends Reducer>) GenericTransReduce.class); transMeta = new TransMeta(reducerTransformationFile); transConfig = new TransConfiguration(transMeta, transExecConfig); conf.set("transformation-reduce-xml", transConfig.getXML()); conf.set("transformation-reduce-input-stepname", "Injector"); conf.set("transformation-reduce-output-stepname", "Output"); } conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar"); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path("/")); FileOutputFormat.setOutputPath(conf, new Path("/")); conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort); conf.set("mapred.job.tracker", hostname + ":" + trackerPort); conf.setJar(jar.toURI().toURL().toExternalForm()); conf.setWorkingDirectory(new Path("/tmp/wordcount")); return conf; }
From source file:org.pentaho.hadoop.mapreduce.test.PentahoMapReduceIntegrationTest.java
License:Apache License
public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile, String reducerTransformationFile, String hostname, String hdfsPort, String trackerPort) throws IOException, KettleException { JobConf conf = new JobConf(); conf.setJobName("wordcount"); KettleEnvironment.init();/*from w ww . j a v a 2 s.com*/ // Register Map/Reduce Input and Map/Reduce Output plugin steps PluginMainClassType mainClassTypesAnnotation = StepPluginType.class .getAnnotation(PluginMainClassType.class); Map<Class<?>, String> inputClassMap = new HashMap<Class<?>, String>(); inputClassMap.put(mainClassTypesAnnotation.value(), HadoopEnterMeta.class.getName()); PluginInterface inputStepPlugin = new Plugin(new String[] { "HadoopEnterPlugin" }, StepPluginType.class, mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Input", "Enter a Hadoop Mapper or Reducer transformation", "MRI.png", false, false, inputClassMap, new ArrayList<String>(), null, null); PluginRegistry.getInstance().registerPlugin(StepPluginType.class, inputStepPlugin); Map<Class<?>, String> outputClassMap = new HashMap<Class<?>, String>(); outputClassMap.put(mainClassTypesAnnotation.value(), HadoopExitMeta.class.getName()); PluginInterface outputStepPlugin = new Plugin(new String[] { "HadoopExitPlugin" }, StepPluginType.class, mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Output", "Exit a Hadoop Mapper or Reducer transformation", "MRO.png", false, false, outputClassMap, new ArrayList<String>(), null, null); PluginRegistry.getInstance().registerPlugin(StepPluginType.class, outputStepPlugin); TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration(); TransMeta transMeta = null; TransConfiguration transConfig = null; if (mapperTransformationFile != null) { conf.setMapRunnerClass(PentahoMapRunnable.class); transMeta = new TransMeta(mapperTransformationFile); transConfig = new TransConfiguration(transMeta, transExecConfig); conf.set("transformation-map-xml", 
transConfig.getXML()); conf.set("transformation-map-input-stepname", "Injector"); conf.set("transformation-map-output-stepname", "Output"); } if (combinerTransformationFile != null) { conf.setCombinerClass(GenericTransCombiner.class); transMeta = new TransMeta(combinerTransformationFile); transConfig = new TransConfiguration(transMeta, transExecConfig); conf.set("transformation-combiner-xml", transConfig.getXML()); conf.set("transformation-combiner-input-stepname", "Injector"); conf.set("transformation-combiner-output-stepname", "Output"); } if (reducerTransformationFile != null) { conf.setReducerClass(GenericTransReduce.class); transMeta = new TransMeta(reducerTransformationFile); transConfig = new TransConfiguration(transMeta, transExecConfig); conf.set("transformation-reduce-xml", transConfig.getXML()); conf.set("transformation-reduce-input-stepname", "Injector"); conf.set("transformation-reduce-output-stepname", "Output"); } conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar"); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path("/")); FileOutputFormat.setOutputPath(conf, new Path("/")); conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort); conf.set("mapred.job.tracker", hostname + ":" + trackerPort); conf.setJar(jar.toURI().toURL().toExternalForm()); conf.setWorkingDirectory(new Path("/tmp/wordcount")); return conf; }
From source file:org.pentaho.hadoop.mapreduce.test.TestSubmitMapReduceJob.java
License:Open Source License
/**
 * Submits the sample word-count MapReduce job, waits for it to finish and verifies the contents
 * of the first output part file.
 *
 * <p>The mapper, combiner and reducer classes are loaded from
 * {@code ./test-res/pentaho-mapreduce-sample.jar} through a dedicated {@link URLClassLoader}.
 * The test requires the job tracker to report {@code State.RUNNING} before submitting.
 *
 * @throws Exception if the jar cannot be loaded, the cluster cannot be reached, the wait is
 *                   interrupted, or the output cannot be read
 */
@Test
public void submitJob() throws Exception {
    String hdfsRoot = "hdfs://" + hostname + ":" + hdfsPort;
    String[] args = { hdfsRoot + "/junit/wordcount/input", hdfsRoot + "/junit/wordcount/output" };

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./test-res/pentaho-mapreduce-sample.jar");
    URLClassLoader loader = new URLClassLoader(new URL[] { jar.toURI().toURL() });

    // asSubclass performs a checked conversion, so a wrong class in the jar fails right here
    // with a ClassCastException instead of slipping through an unchecked cast.
    Class<? extends Mapper> mapClass = loader
            .loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Map").asSubclass(Mapper.class);
    // The same class serves as combiner and reducer; load it once.
    Class<? extends Reducer> reduceClass = loader
            .loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Reduce").asSubclass(Reducer.class);

    conf.setMapperClass(mapClass);
    conf.setCombinerClass(reduceClass);
    conf.setReducerClass(reduceClass);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.set("fs.default.name", hdfsRoot);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJarByClass(loader.loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount"));
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    JobClient jobClient = new JobClient(conf);
    ClusterStatus status = jobClient.getClusterStatus();
    assertEquals(State.RUNNING, status.getJobTrackerState());

    RunningJob runningJob = jobClient.submitJob(conf);
    System.out.print("Running " + runningJob.getJobName());
    // Poll twice a second, printing a progress dot per poll.
    while (!runningJob.isComplete()) {
        System.out.print(".");
        Thread.sleep(500);
    }
    System.out.println();
    System.out.println("Finished " + runningJob.getJobName() + ".");

    // Fail with a clear message if the job itself failed, rather than with a confusing
    // missing-file or content mismatch further down.
    assertEquals("MapReduce job did not complete successfully", true, runningJob.isSuccessful());

    FileObject file = fsManager.resolveFile(buildHDFSURL("/junit/wordcount/output/part-00000"));
    String output;
    // Fully qualified so this method does not depend on an additional import.
    java.io.InputStream in = file.getContent().getInputStream();
    try {
        // TextOutputFormat writes UTF-8; decode explicitly instead of using the platform default.
        output = IOUtils.toString(in, "UTF-8");
    } finally {
        // Always release the HDFS stream, even if reading fails.
        in.close();
    }
    assertEquals("Bye\t1\nGoodbye\t1\nHadoop\t2\nHello\t2\nWorld\t2\n", output);
}