List of usage examples for org.apache.hadoop.mapreduce.Job#setOutputFormatClass
public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException
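Before the project-specific examples below, here is a minimal, self-contained driver sketch of the typical call site: the output format class is set on the Job together with matching output key/value classes and an output path, before the job is submitted. This sketch is not taken from any of the projects listed; the class name and paths are illustrative assumptions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setOutputFormatClass example");
        job.setJarByClass(OutputFormatExample.class);

        // No mapper/reducer is set, so the identity Mapper and Reducer run;
        // TextInputFormat produces (LongWritable offset, Text line) pairs,
        // and the output key/value classes below match those types.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class); // writes tab-separated "key<TAB>value" lines
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Illustrative paths; the output directory must not already exist.
        FileInputFormat.setInputPaths(job, new Path("/tmp/example-input"));
        FileOutputFormat.setOutputPath(job, new Path("/tmp/example-output"));

        // setOutputFormatClass must be called before submission; afterwards it
        // throws IllegalStateException, as the signature above notes.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}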
From source file:co.nubetech.hiho.mapreduce.lib.db.apache.DBOutputFormat.java
License:Apache License
private static DBConfiguration setOutput(Job job, JobConf jobConf, String tableName) throws IOException {
    job.setOutputFormatClass(DBOutputFormat.class);
    jobConf.setReduceSpeculativeExecution(false);

    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
    dbConf.setOutputTableName(tableName);
    return dbConf;
}
From source file:co.nubetech.hiho.mapreduce.lib.db.GenericDBOutputFormat.java
License:Apache License
public static void setOutput(Job job, String tableName, String columnNames) throws IOException {
    job.setOutputFormatClass(GenericDBOutputFormat.class);
    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
    dbConf.setOutputTableName(tableName);
    dbConf.setOutputFieldNames(columnNames);

    String dbDriver = job.getConfiguration().get(DBConfiguration.DRIVER_CLASS_PROPERTY);
    String connString = job.getConfiguration().get(DBConfiguration.URL_PROPERTY);
    String username = job.getConfiguration().get(DBConfiguration.USERNAME_PROPERTY);
    String password = job.getConfiguration().get(DBConfiguration.PASSWORD_PROPERTY);
    Connection conn;
    PreparedStatement stmt;
    try {
        Class.forName(dbDriver).newInstance();
        conn = DriverManager.getConnection(connString, username, password);
        String query = "select " + columnNames + " from " + tableName;
        stmt = conn.prepareStatement(query);
        ResultSetMetaData meta = stmt.getMetaData();
        ArrayList<ColumnInfo> columnInfo = populateColumnInfo(meta);
        String jsonString = getJsonStringOfColumnInfo(columnInfo);
        job.getConfiguration().set(HIHOConf.COLUMN_INFO, jsonString);
        logger.debug("columnInfo is: " + job.getConfiguration().get(HIHOConf.COLUMN_INFO));
        stmt.close();
        conn.close();
    } catch (Exception e) {
        e.printStackTrace();
        throw new IOException(e);
    }
}
From source file:co.nubetech.hiho.merge.MergeJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    populateConfiguration(args);
    try {
        checkMandatoryConfs();
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }

    Class inputFormatClass = Class.forName(inputFormat);
    Class outputFormatClass = Class.forName(outputFormat);
    Class inputKeyClass = Class.forName(inputKeyClassName);
    Class inputValueClass = Class.forName(inputValueClassName);

    Configuration conf = getConf();
    conf.set(HIHOConf.MERGE_OLD_PATH, oldPath);
    conf.set(HIHOConf.MERGE_NEW_PATH, newPath);

    Job job = new Job(conf);
    job.setJobName("Merge job");
    job.setJarByClass(MergeJob.class);

    if (mergeBy.equals("key")) {
        job.setMapperClass(MergeKeyMapper.class);
        job.setReducerClass(MergeKeyReducer.class);
    } else if (mergeBy.equals("value")) {
        job.setMapperClass(MergeValueMapper.class);
        job.setReducerClass(MergeValueReducer.class);
    }

    job.setInputFormatClass(inputFormatClass);
    DelimitedTextInputFormat.setProperties(job, delimiter, column);
    job.setMapOutputKeyClass(HihoTuple.class);
    job.setMapOutputValueClass(HihoValue.class);

    job.setOutputKeyClass(inputKeyClass);
    job.setOutputValueClass(inputValueClass);

    FileInputFormat.setInputPaths(job, oldPath + "," + newPath);
    job.setOutputFormatClass(outputFormatClass);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    try {
        logger.debug("Output format class is " + job.getOutputFormatClass());
        logger.debug("Class is " + ReflectionUtils
                .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
        job.waitForCompletion(false);
        if (job.isComplete()) {
            Counters counters = job.getCounters();
            totalRecordsOld = counters.findCounter(MergeRecordCounter.TOTAL_RECORDS_OLD).getValue();
            totalRecordsNew = counters.findCounter(MergeRecordCounter.TOTAL_RECORDS_NEW).getValue();
            badRecords = counters.findCounter(MergeRecordCounter.BAD_RECORD).getValue();
            output = counters.findCounter(MergeRecordCounter.OUTPUT).getValue();
            logger.info("Total old records read are: " + totalRecordsOld);
            logger.info("Total new records read are: " + totalRecordsNew);
            logger.info("Bad Records are: " + badRecords);
            logger.info("Output records are: " + output);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return 0;
}
From source file:co.nubetech.hiho.similarity.ngram.NGramJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    populateConfiguration(args);
    try {
        checkMandatoryConfs();
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }

    Job job = new Job(conf);
    job.setJobName("NGram job");
    job.setJarByClass(NGramJob.class);

    Class inputFormatClass = Class.forName("org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat");
    Class outputFormatClass = Class.forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat");
    // org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
    // org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
    Class inputKeyClass = Class.forName("org.apache.hadoop.io.Text");
    Class inputValueClass = Class.forName("org.apache.hadoop.io.Text");
    Class outputKeyClass = Class.forName("co.nubetech.hiho.similarity.ngram.ValuePair");
    Class outputValueClass = Class.forName("org.apache.hadoop.io.IntWritable");

    job.setMapperClass(NGramMapper.class);
    job.setReducerClass(NGramReducer.class);

    job.setInputFormatClass(inputFormatClass);
    job.setMapOutputKeyClass(inputKeyClass);
    job.setMapOutputValueClass(inputValueClass);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    job.setOutputFormatClass(outputFormatClass);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path("outputOfNGramJob"));

    int ret = 0;
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:co.nubetech.hiho.similarity.ngram.ScoreJob.java
License:Apache License
@Override
public int run(String[] arg0) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf);
    job.setJobName("Score job");
    job.setJarByClass(ScoreJob.class);

    Class inputFormatClass = Class.forName("org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat");
    Class outputFormatClass = Class.forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat");
    // org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
    // org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
    Class inputKeyClass = Class.forName("co.nubetech.hiho.similarity.ngram.ValuePair");
    Class inputValueClass = Class.forName("org.apache.hadoop.io.IntWritable");
    Class outputKeyClass = Class.forName("co.nubetech.hiho.similarity.ngram.ValuePair");
    Class outputValueClass = Class.forName("org.apache.hadoop.io.LongWritable");

    job.setMapperClass(ScoreMapper.class);
    job.setReducerClass(ScoreReducer.class);

    job.setInputFormatClass(inputFormatClass);
    job.setMapOutputKeyClass(inputKeyClass);
    job.setMapOutputValueClass(inputValueClass);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    job.setOutputFormatClass(outputFormatClass);

    FileInputFormat.setInputPaths(job, "outputOfNGramJob");
    FileOutputFormat.setOutputPath(job, new Path("outputOfScoreJob"));

    int ret = 0;
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:com.abel.hwfs.custom.output.SetSizeDBOutputFormat.java
License:Apache License
private static MyDBConfiguration setOutput(Job job, String tableName) throws IOException {
    job.setOutputFormatClass(SetSizeDBOutputFormat.class);
    job.setReduceSpeculativeExecution(false);

    MyDBConfiguration dbConf = new MyDBConfiguration(job.getConfiguration());
    dbConf.setOutputTableName(tableName);
    return dbConf;
}
From source file:com.accumulobook.advanced.mapreduce.MapReduceFilesExample.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(this.getConf());

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCount.WordCountMapper.class);
    job.setCombinerClass(WordCount.WordCountCombiner.class);
    job.setReducerClass(WordCount.WordCountReducer.class);

    // clone the articles table
    ZooKeeperInstance inst = new ZooKeeperInstance(args[0], args[1]);
    Connector conn = inst.getConnector(args[2], new PasswordToken(args[3]));
    conn.tableOperations().clone(WikipediaConstants.ARTICLES_TABLE, WikipediaConstants.ARTICLES_TABLE_CLONE,
            true, Collections.EMPTY_MAP, Collections.EMPTY_SET);

    // take cloned table offline, waiting until the operation is complete
    boolean wait = true;
    conn.tableOperations().offline(WikipediaConstants.ARTICLES_TABLE_CLONE, wait);

    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);

    // input
    job.setInputFormatClass(AccumuloInputFormat.class);
    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE_CLONE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));
    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // configure to use underlying RFiles
    AccumuloInputFormat.setOfflineTableScan(job, true);

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);
    BatchWriterConfig bwConfig = new BatchWriterConfig();
    AccumuloOutputFormat.setBatchWriterOptions(job, bwConfig);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    job.waitForCompletion(true);
    //job.submit();
    return 0;
}
From source file:com.accumulobook.advanced.mapreduce.WordCount.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountCombiner.class);
    job.setReducerClass(WordCountReducer.class);

    // input
    job.setInputFormatClass(AccumuloInputFormat.class);
    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);
    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));
    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);
    BatchWriterConfig config = new BatchWriterConfig();
    AccumuloOutputFormat.setBatchWriterOptions(job, config);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    job.submit();
    return 0;
}
From source file:com.ailk.oci.ocnosql.tools.load.single.SingleColumnImportTsv.java
License:Apache License
/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This
 * <ul>
 * <li>Inspects the table to configure a total order partitioner</li>
 * <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
 * <li>Sets the number of reduce tasks to match the current number of regions</li>
 * <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
 * <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
 * PutSortReducer)</li>
 * </ul>
 * The user should be sure to set the map output value class to either KeyValue or Put before
 * running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    Class<? extends Partitioner> topClass;
    try {
        topClass = getTotalOrderPartitionerClass();
    } catch (ClassNotFoundException e) {
        throw new IOException("Failed getting TotalOrderPartitioner", e);
    }
    // partitioner
    job.setPartitionerClass(topClass);
    // Set the key class for the job output data
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    // Set the value class for job outputs
    job.setOutputValueClass(KeyValue.class);
    // output format: HFile
    job.setOutputFormatClass(HFileOutputFormat2.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(SingleColumnReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    // get the start keys of the current regions
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " + "to match current region count");
    // one reduce task per region
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_" + UUID.randomUUID());
    LOG.info("Writing partition information to " + partitionsPath);
    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);

    URI cacheUri;
    try {
        // Below we make explicit reference to the bundled TOP. It's cheating.
        // We assume the definition in the hbase-bundled TOP is as it is in
        // hadoop (whether 0.20 or 0.22, etc.)
        /*
        cacheUri = new URI(partitionsPath.toString() + "#" +
            org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner.DEFAULT_PATH);
        */
        cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    // Set compression algorithms based on column families
    configureCompression(table, conf);

    TableMapReduceUtil.addDependencyJars(job);
    LOG.info("Incremental table output configured.");
}
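The javadoc above requires the map output value class to be set to KeyValue or Put before configureIncrementalLoad is called. The actual SingleColumnImportTsv driver is not shown here, so the following caller sketch is hypothetical: the mapper class, table name, and paths are placeholders, and only the expected call order is illustrated.

// Hypothetical driver, not part of SingleColumnImportTsv; "my_table" and
// MyTsvToPutMapper are placeholders.
public static void runBulkLoadJob(Configuration conf, String inputDir, String outputDir) throws Exception {
    Job job = new Job(conf, "hfile bulk load example");
    job.setJarByClass(SingleColumnImportTsv.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(MyTsvToPutMapper.class);              // placeholder mapper emitting (ImmutableBytesWritable, Put)
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(Put.class);                   // must be Put or KeyValue before the call below

    FileInputFormat.setInputPaths(job, new Path(inputDir));
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    HTable table = new HTable(conf, "my_table");
    // Sets the total order partitioner, the sort reducer, HFileOutputFormat2,
    // and one reduce task per region of the target table.
    configureIncrementalLoad(job, table);

    job.waitForCompletion(true);
}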
From source file:com.alexholmes.hadooputils.combine.avro.mapreduce.CombineAvroKeyValueInputFormatTest.java
License:Apache License
@Test
public void testKeyValueInput() throws ClassNotFoundException, IOException, InterruptedException {
    // Create a test input file.
    File inputFile = createInputFile();

    // Configure the job input.
    Job job = new Job();
    FileInputFormat.setInputPaths(job, new Path(inputFile.getAbsolutePath()));
    job.setInputFormatClass(CombineAvroKeyValueInputFormat.class);
    AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.INT));
    AvroJob.setInputValueSchema(job, Schema.create(Schema.Type.STRING));

    // Configure a mapper.
    job.setMapperClass(IndexMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Configure a reducer.
    job.setReducerClass(IndexReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(AvroValue.class);
    AvroJob.setOutputValueSchema(job, Schema.createArray(Schema.create(Schema.Type.INT)));

    // Configure the output format.
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    Path outputPath = new Path(mTempDir.getRoot().getPath(), "out-index");
    FileOutputFormat.setOutputPath(job, outputPath);

    // Run the job.
    assertTrue(job.waitForCompletion(true));

    // Verify that the output Avro container file has the expected data.
    File avroFile = new File(outputPath.toString(), "part-r-00000.avro");
    DatumReader<GenericRecord> datumReader = new SpecificDatumReader<GenericRecord>(AvroKeyValue
            .getSchema(Schema.create(Schema.Type.STRING), Schema.createArray(Schema.create(Schema.Type.INT))));
    DataFileReader<GenericRecord> avroFileReader = new DataFileReader<GenericRecord>(avroFile, datumReader);

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> appleRecord = new AvroKeyValue<CharSequence, List<Integer>>(
            avroFileReader.next());
    assertNotNull(appleRecord.get());
    assertEquals("apple", appleRecord.getKey().toString());
    List<Integer> appleDocs = appleRecord.getValue();
    assertEquals(3, appleDocs.size());
    assertTrue(appleDocs.contains(1));
    assertTrue(appleDocs.contains(2));
    assertTrue(appleDocs.contains(3));

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> bananaRecord = new AvroKeyValue<CharSequence, List<Integer>>(
            avroFileReader.next());
    assertNotNull(bananaRecord.get());
    assertEquals("banana", bananaRecord.getKey().toString());
    List<Integer> bananaDocs = bananaRecord.getValue();
    assertEquals(2, bananaDocs.size());
    assertTrue(bananaDocs.contains(1));
    assertTrue(bananaDocs.contains(2));

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> carrotRecord = new AvroKeyValue<CharSequence, List<Integer>>(
            avroFileReader.next());
    assertEquals("carrot", carrotRecord.getKey().toString());
    List<Integer> carrotDocs = carrotRecord.getValue();
    assertEquals(1, carrotDocs.size());
    assertTrue(carrotDocs.contains(1));

    assertFalse(avroFileReader.hasNext());
    avroFileReader.close();
}