List of usage examples for org.apache.hadoop.mapred.JobConf.setJarByClass
public void setJarByClass(Class cls)
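Before the project-specific examples below, a minimal sketch of the usual pattern may help: setJarByClass tells the framework to locate the jar that contains the given class and ship that jar with the job, so cluster nodes can load the user's mapper and reducer classes. The driver class name (JarByClassExample) and the argument-based input/output paths are illustrative placeholders, not taken from any of the projects listed; with no mapper or reducer configured, the job falls back to the identity classes.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class JarByClassExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        job.setJobName("jar-by-class-example");

        // Ship the jar that contains this driver class; equivalent to
        // passing the class to the JobConf(Configuration, Class) constructor.
        job.setJarByClass(JarByClassExample.class);

        // No mapper/reducer set here, so the identity classes are used;
        // a real job would call job.setMapperClass(...) / job.setReducerClass(...).
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        RunningJob running = JobClient.runJob(job);
        running.waitForCompletion();
        System.exit(running.isSuccessful() ? 0 : 1);
    }
}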
From source file:org.apache.ambari.servicemonitor.jobs.FileUsingJobRunner.java
License:Apache License
public int run(String[] args) throws Exception {
    // Configuration processed by ToolRunner
    Configuration conf = getConf();
    CommandLine commandLine = getCommandLine();
    // Create a JobConf using the processed conf
    JobConf jobConf = new JobConf(conf, FileUsingJobRunner.class);
    // tune the config
    if (jobConf.get(JobKeys.RANGEINPUTFORMAT_ROWS) == null) {
        jobConf.setInt(JobKeys.RANGEINPUTFORMAT_ROWS, 1);
    }

    // Process custom command-line options
    String name = OptionHelper.getStringOption(commandLine, "n", "File Using Job");
    if (commandLine.hasOption('x')) {
        // delete the output directory
        String destDir = jobConf.get(JobKeys.MAPRED_OUTPUT_DIR);
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(new Path(destDir), true);
    }

    // Specify various job-specific parameters
    jobConf.setMapperClass(FileUsingMapper.class);
    jobConf.setReducerClass(FileUsingReducer.class);
    jobConf.setMapOutputKeyClass(IntWritable.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputFormat(TextOutputFormat.class);
    jobConf.setInputFormat(RangeInputFormat.class);
    //jobConf.setPartitionerClass(SleepJob.class);
    jobConf.setSpeculativeExecution(false);
    jobConf.setJobName(name);
    jobConf.setJarByClass(this.getClass());

    FileInputFormat.addInputPath(jobConf, new Path("ignored"));

    // Submit the job, then poll for progress until the job is complete
    RunningJob runningJob = JobClient.runJob(jobConf);
    runningJob.waitForCompletion();
    return runningJob.isSuccessful() ? 0 : 1;
}
From source file:org.apache.hcatalog.hbase.TestHBaseBulkOutputFormat.java
License:Apache License
@Test
public void hbaseBulkOutputFormatTest() throws IOException, ClassNotFoundException, InterruptedException {
    String testName = "hbaseBulkOutputFormatTest";
    Path methodTestDir = new Path(getTestDir(), testName);
    LOG.info("starting: " + testName);

    String tableName = newTableName(testName).toLowerCase();
    String familyName = "my_family";
    byte[] familyNameBytes = Bytes.toBytes(familyName);

    // include hbase config in conf file
    Configuration conf = new Configuration(allConf);

    // create table
    conf.set(HBaseConstants.PROPERTY_OUTPUT_TABLE_NAME_KEY, tableName);
    conf.set("yarn.scheduler.capacity.root.queues", "default");
    conf.set("yarn.scheduler.capacity.root.default.capacity", "100");
    createTable(tableName, new String[] { familyName });

    String data[] = { "1,english:one,spanish:uno", "2,english:two,spanish:dos",
            "3,english:three,spanish:tres" };

    // input/output settings
    Path inputPath = new Path(methodTestDir, "mr_input");
    FSDataOutputStream os = getFileSystem().create(new Path(inputPath, "inputFile.txt"));
    for (String line : data)
        os.write(Bytes.toBytes(line + "\n"));
    os.close();
    Path interPath = new Path(methodTestDir, "inter");

    // create job
    JobConf job = new JobConf(conf);
    job.setWorkingDirectory(new Path(methodTestDir, "mr_work"));
    job.setJarByClass(this.getClass());
    job.setMapperClass(MapWriteOldMapper.class);

    job.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    org.apache.hadoop.mapred.TextInputFormat.setInputPaths(job, inputPath);

    job.setOutputFormat(HBaseBulkOutputFormat.class);
    org.apache.hadoop.mapred.SequenceFileOutputFormat.setOutputPath(job, interPath);
    job.setOutputCommitter(HBaseBulkOutputCommitter.class);

    // manually create transaction
    RevisionManager rm = HBaseRevisionManagerUtil.getOpenedRevisionManager(conf);
    try {
        OutputJobInfo outputJobInfo = OutputJobInfo.create("default", tableName, null);
        Transaction txn = rm.beginWriteTransaction(tableName, Arrays.asList(familyName));
        outputJobInfo.getProperties().setProperty(HBaseConstants.PROPERTY_WRITE_TXN_KEY,
                HCatUtil.serialize(txn));
        job.set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(outputJobInfo));
    } finally {
        rm.close();
    }

    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(HCatRecord.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(HCatRecord.class);
    job.setNumReduceTasks(0);

    RunningJob runJob = JobClient.runJob(job);
    runJob.waitForCompletion();
    assertTrue(runJob.isSuccessful());

    // verify
    HTable table = new HTable(conf, tableName);
    Scan scan = new Scan();
    scan.addFamily(familyNameBytes);
    ResultScanner scanner = table.getScanner(scan);
    int index = 0;
    for (Result result : scanner) {
        String vals[] = data[index].toString().split(",");
        for (int i = 1; i < vals.length; i++) {
            String pair[] = vals[i].split(":");
            assertTrue(result.containsColumn(familyNameBytes, Bytes.toBytes(pair[0])));
            assertEquals(pair[1], Bytes.toString(result.getValue(familyNameBytes, Bytes.toBytes(pair[0]))));
        }
        index++;
    }
    // test if load count is the same
    assertEquals(data.length, index);
    // test if scratch directory was erased
    assertFalse(FileSystem.get(job).exists(interPath));
}
From source file:org.apache.hcatalog.hbase.TestHBaseDirectOutputFormat.java
License:Apache License
@Test
public void directOutputFormatTest() throws IOException, ClassNotFoundException, InterruptedException {
    String testName = "directOutputFormatTest";
    Path methodTestDir = new Path(getTestDir(), testName);

    String tableName = newTableName(testName).toLowerCase();
    String familyName = "my_family";
    byte[] familyNameBytes = Bytes.toBytes(familyName);

    // include hbase config in conf file
    Configuration conf = new Configuration(allConf);
    conf.set(HCatConstants.HCAT_KEY_HIVE_CONF, HCatUtil.serialize(allConf.getAllProperties()));

    // create table
    createTable(tableName, new String[] { familyName });

    String data[] = { "1,english:ONE,spanish:UNO", "2,english:ONE,spanish:DOS",
            "3,english:ONE,spanish:TRES" };

    // input/output settings
    Path inputPath = new Path(methodTestDir, "mr_input");
    getFileSystem().mkdirs(inputPath);
    FSDataOutputStream os = getFileSystem().create(new Path(inputPath, "inputFile.txt"));
    for (String line : data)
        os.write(Bytes.toBytes(line + "\n"));
    os.close();

    // create job
    JobConf job = new JobConf(conf);
    job.setJobName(testName);
    job.setWorkingDirectory(new Path(methodTestDir, "mr_work"));
    job.setJarByClass(this.getClass());
    job.setMapperClass(MapWrite.class);

    job.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    org.apache.hadoop.mapred.TextInputFormat.setInputPaths(job, inputPath);

    job.setOutputFormat(HBaseDirectOutputFormat.class);
    job.set(TableOutputFormat.OUTPUT_TABLE, tableName);
    job.set(HBaseConstants.PROPERTY_OUTPUT_TABLE_NAME_KEY, tableName);

    // manually create transaction
    RevisionManager rm = HBaseRevisionManagerUtil.getOpenedRevisionManager(conf);
    try {
        OutputJobInfo outputJobInfo = OutputJobInfo.create("default", tableName, null);
        Transaction txn = rm.beginWriteTransaction(tableName, Arrays.asList(familyName));
        outputJobInfo.getProperties().setProperty(HBaseConstants.PROPERTY_WRITE_TXN_KEY,
                HCatUtil.serialize(txn));
        job.set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(outputJobInfo));
    } finally {
        rm.close();
    }

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(HCatRecord.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(HCatRecord.class);
    job.setNumReduceTasks(0);

    RunningJob runJob = JobClient.runJob(job);
    runJob.waitForCompletion();
    assertTrue(runJob.isSuccessful());

    // verify
    HTable table = new HTable(conf, tableName);
    Scan scan = new Scan();
    scan.addFamily(familyNameBytes);
    ResultScanner scanner = table.getScanner(scan);
    int index = 0;
    for (Result result : scanner) {
        String vals[] = data[index].toString().split(",");
        for (int i = 1; i < vals.length; i++) {
            String pair[] = vals[i].split(":");
            assertTrue(result.containsColumn(familyNameBytes, Bytes.toBytes(pair[0])));
            assertEquals(pair[1], Bytes.toString(result.getValue(familyNameBytes, Bytes.toBytes(pair[0]))));
        }
        index++;
    }
    assertEquals(data.length, index);
}
From source file:org.apache.hcatalog.hbase.TestHBaseInputFormat.java
License:Apache License
@Test
public void TestHBaseInputFormatProjectionReadMR() throws Exception {
    String tableName = newTableName("mytable");
    String tableQuery = "CREATE TABLE " + tableName
            + "(key string, testqualifier1 string, testqualifier2 string) STORED BY "
            + "'org.apache.hcatalog.hbase.HBaseHCatStorageHandler'"
            + "TBLPROPERTIES ('hbase.columns.mapping'=':key,"
            + "testFamily:testQualifier1,testFamily:testQualifier2')";

    CommandProcessorResponse responseTwo = hcatDriver.run(tableQuery);
    assertEquals(0, responseTwo.getResponseCode());

    HBaseAdmin hAdmin = new HBaseAdmin(getHbaseConf());
    boolean doesTableExist = hAdmin.tableExists(tableName);
    assertTrue(doesTableExist);

    populateHBaseTable(tableName, 5);

    Configuration conf = new Configuration(hcatConf);
    conf.set(HCatConstants.HCAT_KEY_HIVE_CONF, HCatUtil.serialize(getHiveConf().getAllProperties()));

    // output settings
    Path outputDir = new Path(getTestDir(), "mapred/testHBaseTableProjectionReadMR");
    FileSystem fs = getFileSystem();
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // create job
    JobConf job = new JobConf(conf);
    job.setJobName("hbase-scan-column");
    job.setJarByClass(this.getClass());
    job.setMapperClass(MapReadProjectionHTable.class);
    job.setInputFormat(HBaseInputFormat.class);

    // Configure projection schema
    job.set(HCatConstants.HCAT_KEY_OUTPUT_SCHEMA, HCatUtil.serialize(getProjectionSchema()));
    Job newJob = new Job(job);
    HCatInputFormat.setInput(newJob, MetaStoreUtils.DEFAULT_DATABASE_NAME, tableName);
    String inputJobString = newJob.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO);
    InputJobInfo info = (InputJobInfo) HCatUtil.deserialize(inputJobString);
    job.set(HCatConstants.HCAT_KEY_JOB_INFO, inputJobString);
    for (PartInfo partinfo : info.getPartitions()) {
        for (Entry<String, String> entry : partinfo.getJobProperties().entrySet())
            job.set(entry.getKey(), entry.getValue());
    }
    assertEquals("testFamily:testQualifier1", job.get(TableInputFormat.SCAN_COLUMNS));

    job.setOutputFormat(org.apache.hadoop.mapred.TextOutputFormat.class);
    org.apache.hadoop.mapred.TextOutputFormat.setOutputPath(job, outputDir);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);

    RunningJob runJob = JobClient.runJob(job);
    runJob.waitForCompletion();
    assertTrue(runJob.isSuccessful());
    assertFalse(MapReadProjHTable.error);
    assertEquals(MapReadProjHTable.count, 1);

    String dropTableQuery = "DROP TABLE " + tableName;
    CommandProcessorResponse responseThree = hcatDriver.run(dropTableQuery);
    assertEquals(0, responseThree.getResponseCode());

    boolean isHbaseTableThere = hAdmin.tableExists(tableName);
    assertFalse(isHbaseTableThere);
}
From source file:org.apache.hcatalog.hbase.TestHCatHBaseInputFormat.java
License:Apache License
@Test
public void TestHBaseInputFormatProjectionReadMR() throws Exception {
    String tableName = newTableName("mytable");
    String tableQuery = "CREATE TABLE " + tableName
            + "(key string, testqualifier1 string, testqualifier2 string) STORED BY "
            + "'org.apache.hcatalog.hbase.HBaseHCatStorageHandler'"
            + "TBLPROPERTIES ('hbase.columns.mapping'=':key,"
            + "testFamily:testQualifier1,testFamily:testQualifier2')";

    CommandProcessorResponse responseTwo = hcatDriver.run(tableQuery);
    assertEquals(0, responseTwo.getResponseCode());

    HBaseAdmin hAdmin = new HBaseAdmin(getHbaseConf());
    boolean doesTableExist = hAdmin.tableExists(tableName);
    assertTrue(doesTableExist);

    populateHBaseTable(tableName, 5);

    Configuration conf = new Configuration(hcatConf);
    conf.set(HCatConstants.HCAT_KEY_HIVE_CONF, HCatUtil.serialize(getHiveConf().getAllProperties()));

    // output settings
    Path outputDir = new Path(getTestDir(), "mapred/testHBaseInputFormatProjectionReadMR");
    FileSystem fs = getFileSystem();
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }

    // create job
    JobConf job = new JobConf(conf);
    job.setJobName("hbase-scan-column");
    job.setJarByClass(this.getClass());
    job.setMapperClass(MapReadProjectionHTable.class);
    job.setInputFormat(HBaseInputFormat.class);

    // Configure projection schema
    job.set(HCatConstants.HCAT_KEY_OUTPUT_SCHEMA, HCatUtil.serialize(getProjectionSchema()));
    Job newJob = new Job(job);
    HCatInputFormat.setInput(newJob, MetaStoreUtils.DEFAULT_DATABASE_NAME, tableName);
    String inputJobString = newJob.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO);
    InputJobInfo info = (InputJobInfo) HCatUtil.deserialize(inputJobString);
    job.set(HCatConstants.HCAT_KEY_JOB_INFO, inputJobString);
    for (PartInfo partinfo : info.getPartitions()) {
        for (Entry<String, String> entry : partinfo.getJobProperties().entrySet())
            job.set(entry.getKey(), entry.getValue());
    }
    assertEquals("testFamily:testQualifier1", job.get(TableInputFormat.SCAN_COLUMNS));

    job.setOutputFormat(org.apache.hadoop.mapred.TextOutputFormat.class);
    org.apache.hadoop.mapred.TextOutputFormat.setOutputPath(job, outputDir);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);

    RunningJob runJob = JobClient.runJob(job);
    runJob.waitForCompletion();
    assertTrue(runJob.isSuccessful());
    assertFalse(MapReadProjectionHTable.error);
    assertEquals(1, MapReadProjectionHTable.count);

    String dropTableQuery = "DROP TABLE " + tableName;
    CommandProcessorResponse responseThree = hcatDriver.run(dropTableQuery);
    assertEquals(0, responseThree.getResponseCode());

    boolean isHbaseTableThere = hAdmin.tableExists(tableName);
    assertFalse(isHbaseTableThere);
}
From source file:org.apache.hcatalog.hcatmix.load.HadoopLoadGenerator.java
License:Apache License
/**
 * Prepare input directory/jobConf and launch the hadoop job, for load testing
 *
 * @param confFileName The properties file for the task, should be available in the classpath
 * @param conf
 * @return
 * @throws IOException
 * @throws MetaException
 * @throws TException
 */
public SortedMap<Long, ReduceResult> runLoadTest(String confFileName, Configuration conf)
        throws Exception, MetaException, TException {
    JobConf jobConf;
    if (conf != null) {
        jobConf = new JobConf(conf);
    } else {
        jobConf = new JobConf(new Configuration());
    }

    InputStream confFileIS;
    try {
        confFileIS = HCatMixUtils.getInputStream(confFileName);
    } catch (Exception e) {
        LOG.error("Couldn't load configuration file " + confFileName);
        throw e;
    }

    Properties props = new Properties();
    try {
        props.load(confFileIS);
    } catch (IOException e) {
        LOG.error("Couldn't load properties file: " + confFileName, e);
        throw e;
    }

    LOG.info("Loading configuration file: " + confFileName);
    addToJobConf(jobConf, props, Conf.MAP_RUN_TIME_MINUTES);
    addToJobConf(jobConf, props, Conf.STAT_COLLECTION_INTERVAL_MINUTE);
    addToJobConf(jobConf, props, Conf.THREAD_INCREMENT_COUNT);
    addToJobConf(jobConf, props, Conf.THREAD_INCREMENT_INTERVAL_MINUTES);
    addToJobConf(jobConf, props, Conf.THREAD_COMPLETION_BUFFER_MINUTES);

    int numMappers = Integer
            .parseInt(props.getProperty(Conf.NUM_MAPPERS.propName, "" + Conf.NUM_MAPPERS.defaultValue));
    Path inputDir = new Path(props.getProperty(Conf.INPUT_DIR.propName, Conf.INPUT_DIR.defaultValueStr));
    Path outputDir = new Path(props.getProperty(Conf.OUTPUT_DIR.propName, Conf.OUTPUT_DIR.defaultValueStr));

    jobConf.setJobName(JOB_NAME);
    jobConf.setNumMapTasks(numMappers);
    jobConf.setMapperClass(HCatMapper.class);
    jobConf.setJarByClass(HCatMapper.class);
    jobConf.setReducerClass(HCatReducer.class);
    jobConf.setMapOutputKeyClass(LongWritable.class);
    jobConf.setMapOutputValueClass(IntervalResult.class);

    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(ReduceResult.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.set(Conf.TASK_CLASS_NAMES.getJobConfKey(),
            props.getProperty(Conf.TASK_CLASS_NAMES.propName, Conf.TASK_CLASS_NAMES.defaultValueStr));

    fs = FileSystem.get(jobConf);
    Path jarRoot = new Path("/tmp/hcatmix_jar_" + new Random().nextInt());
    HadoopUtils.uploadClasspathAndAddToJobConf(jobConf, jarRoot);
    fs.deleteOnExit(jarRoot);

    FileInputFormat.setInputPaths(jobConf, createInputFiles(inputDir, numMappers));
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }
    FileOutputFormat.setOutputPath(jobConf, outputDir);

    // Set up delegation token required for hiveMetaStoreClient in map task
    HiveConf hiveConf = new HiveConf(HadoopLoadGenerator.class);
    HiveMetaStoreClient hiveClient = new HiveMetaStoreClient(hiveConf);
    String tokenStr = hiveClient.getDelegationToken(UserGroupInformation.getCurrentUser().getUserName(),
            "mapred");
    Token<? extends AbstractDelegationTokenIdentifier> token = new Token<DelegationTokenIdentifier>();
    token.decodeFromUrlString(tokenStr);
    token.setService(new Text(METASTORE_TOKEN_SIGNATURE));
    jobConf.getCredentials().addToken(new Text(METASTORE_TOKEN_KEY), token);

    // Submit the job, once the job is complete see output
    LOG.info("Submitted hadoop job");
    RunningJob j = JobClient.runJob(jobConf);
    LOG.info("JobID is: " + j.getJobName());

    if (!j.isSuccessful()) {
        throw new IOException("Job failed");
    }
    return readResult(outputDir, jobConf);
}
From source file:org.apache.hive.hcatalog.hbase.TestHiveHBaseTableOutputFormat.java
License:Apache License
@Test
public void directOutputFormatTest() throws IOException, ClassNotFoundException, InterruptedException {
    String testName = "directOutputFormatTest";
    Path methodTestDir = new Path(getTestDir(), testName);

    String tableName = newTableName(testName).toLowerCase();
    String familyName = "my_family";
    byte[] familyNameBytes = Bytes.toBytes(familyName);

    // include hbase config in conf file
    Configuration conf = new Configuration(allConf);
    conf.set(HCatConstants.HCAT_KEY_HIVE_CONF, HCatUtil.serialize(allConf.getAllProperties()));

    // create table
    createTable(tableName, new String[] { familyName });

    String data[] = { "1,english:ONE,spanish:UNO", "2,english:TWO,spanish:DOS",
            "3,english:THREE,spanish:TRES" };

    // input/output settings
    Path inputPath = new Path(methodTestDir, "mr_input");
    getFileSystem().mkdirs(inputPath);
    FSDataOutputStream os = getFileSystem().create(new Path(inputPath, "inputFile.txt"));
    for (String line : data)
        os.write(Bytes.toBytes(line + "\n"));
    os.close();

    // create job
    JobConf job = new JobConf(conf);
    job.setJobName(testName);
    job.setWorkingDirectory(new Path(methodTestDir, "mr_work"));
    job.setJarByClass(this.getClass());
    job.setMapperClass(MapWrite.class);

    job.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    org.apache.hadoop.mapred.TextInputFormat.setInputPaths(job, inputPath);

    // why we need to set all the 3 properties??
    job.setOutputFormat(HiveHBaseTableOutputFormat.class);
    job.set(HBaseSerDe.HBASE_TABLE_NAME, tableName);
    job.set(TableOutputFormat.OUTPUT_TABLE, tableName);
    job.set(HCatConstants.HCAT_DEFAULT_TOPIC_PREFIX + ".hbase.mapreduce.outputTableName", tableName);

    try {
        OutputJobInfo outputJobInfo = OutputJobInfo.create("default", tableName, null);
        job.set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(outputJobInfo));
    } catch (Exception ex) {
        throw new IOException("Serialization error " + ex.getMessage(), ex);
    }

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(HCatRecord.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(HCatRecord.class);
    job.setNumReduceTasks(0);

    System.getProperty("java.classpath");
    RunningJob runJob = JobClient.runJob(job);
    runJob.waitForCompletion();
    assertTrue(runJob.isSuccessful());

    // verify
    HTable table = new HTable(conf, tableName);
    Scan scan = new Scan();
    scan.addFamily(familyNameBytes);
    ResultScanner scanner = table.getScanner(scan);
    int index = 0;
    for (Result result : scanner) {
        String vals[] = data[index].toString().split(",");
        for (int i = 1; i < vals.length; i++) {
            String pair[] = vals[i].split(":");
            assertTrue(result.containsColumn(familyNameBytes, Bytes.toBytes(pair[0])));
            assertEquals(pair[1], Bytes.toString(result.getValue(familyNameBytes, Bytes.toBytes(pair[0]))));
        }
        index++;
    }
    assertEquals(data.length, index);
}
From source file:org.apache.mahout.avro.text.mapred.AvroDocumentProcessor.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf();

    if (args.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        return 0;
    }

    conf.setStrings("io.serializations",
            new String[] { WritableSerialization.class.getName(), AvroSpecificSerialization.class.getName(),
                    AvroReflectSerialization.class.getName(), AvroGenericSerialization.class.getName() });

    AvroComparator.setSchema(AvroDocument._SCHEMA); //TODO: must be done in mapper, reducer configure method.

    conf.setClass("mapred.output.key.comparator.class", AvroComparator.class, RawComparator.class);

    conf.setJarByClass(AvroDocumentProcessor.class);
    conf.setMapperClass(ProcessorMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputKeyClass(AvroDocument.class);
    conf.setOutputValueClass(NullWritable.class);

    conf.setInputFormat(AvroInputFormat.class);
    conf.setOutputFormat(AvroOutputFormat.class);

    AvroInputFormat.setAvroInputClass(conf, AvroDocument.class);
    AvroOutputFormat.setAvroOutputClass(conf, AvroDocument.class);

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

    FileSystem fs = FileSystem.get(conf);
    fs.delete(output, true);

    FileInputFormat.addInputPath(conf, input);
    FileOutputFormat.setOutputPath(conf, output);

    RunningJob job = JobClient.runJob(conf);
    job.waitForCompletion();

    return job.isComplete() ? 0 : 1;
}
From source file:org.apache.mahout.avro.text.mapred.AvroDocumentsWordCount.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf();

    if (args.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        return 0;
    }

    conf.setStrings("io.serializations",
            new String[] { WritableSerialization.class.getName(), AvroSpecificSerialization.class.getName(),
                    AvroReflectSerialization.class.getName(), AvroGenericSerialization.class.getName() });

    conf.setJarByClass(AvroDocumentsWordCount.class);
    conf.setMapperClass(TokenizerMapper.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setReducerClass(IntSumReducer.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setInputFormat(AvroInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

    FileSystem fs = FileSystem.get(conf);
    fs.delete(output, true);

    AvroInputFormat.setAvroInputClass(conf, AvroDocument.class);
    FileInputFormat.addInputPath(conf, input);
    FileOutputFormat.setOutputPath(conf, output);

    RunningJob job = JobClient.runJob(conf);
    job.waitForCompletion();

    return job.isSuccessful() ? 1 : 0;
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesTfIdfDriver.java
License:Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesWeightSummerDriver.class);
    conf.setJobName("TfIdf Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output, "trainer-termDocCount"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-wordFreq"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-featureCount"));
    Path outPath = new Path(output, "trainer-tfIdf");
    FileOutputFormat.setOutputPath(conf, outPath);

    // conf.setNumMapTasks(100);
    conf.setJarByClass(BayesTfIdfDriver.class);
    conf.setMapperClass(BayesTfIdfMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesTfIdfReducer.class);
    conf.setReducerClass(BayesTfIdfReducer.class);
    conf.setOutputFormat(BayesTfIdfOutputFormat.class);

    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Dont ever forget this. People should keep track of how hadoop conf
    // parameters and make or break a piece of code

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    HadoopUtil.overwriteOutput(outPath);

    Path interimFile = new Path(output, "trainer-docCount/part-*");

    Map<String, Double> labelDocumentCounts = SequenceFileModelReader.readLabelDocumentCounts(dfs, interimFile,
            conf);

    DefaultStringifier<Map<String, Double>> mapStringifier = new DefaultStringifier<Map<String, Double>>(conf,
            GenericsUtil.getClass(labelDocumentCounts));

    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
    log.info("Counts of documents in Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelDocumentCountString);
    log.info("{}", c);

    conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
    log.info(params.print());

    if (params.get("dataSource").equals("hbase")) {
        String tableName = output.toString();
        HBaseConfiguration hc = new HBaseConfiguration(new Configuration());
        HTableDescriptor ht = new HTableDescriptor(tableName);
        HColumnDescriptor hcd = new HColumnDescriptor(BayesConstants.HBASE_COLUMN_FAMILY + ':');
        hcd.setBloomfilter(true);
        hcd.setInMemory(true);
        hcd.setMaxVersions(1);
        hcd.setBlockCacheEnabled(true);
        ht.addFamily(hcd);

        log.info("Connecting to hbase...");
        HBaseAdmin hba = new HBaseAdmin(hc);
        log.info("Creating Table {}", output);

        if (hba.tableExists(tableName)) {
            hba.disableTable(tableName);
            hba.deleteTable(tableName);
            hba.majorCompact(".META.");
        }
        hba.createTable(ht);
        conf.set("output.table", tableName);
    }
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}