List of usage examples for org.apache.hadoop.mapreduce.Job#setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
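Before the harvested examples below, here is a minimal, self-contained driver sketch (class, mapper, and reducer names are illustrative and not taken from any of the sources listed). It shows the usual reason for calling setMapOutputValueClass: the map output value type (IntWritable) differs from the final output value type (LongWritable) set via setOutputValueClass. When the types match, the call can be omitted, since the map output value class defaults to the job output value class; the setter throws IllegalStateException if invoked after the job has been submitted.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Illustrative demo class, not part of any example below.
public class MapOutputValueClassDemo {

    // Emits <word, 1>: the map output value type is IntWritable.
    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    word.set(token);
                    context.write(word, ONE);
                }
            }
        }
    }

    // Sums counts into a LongWritable: the final output value type differs from the map output value type.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setMapOutputValueClass demo");
        job.setJarByClass(MapOutputValueClassDemo.class);
        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(SumReducer.class);

        // Map output types differ from the final output types, so declare them explicitly.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}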
From source file:com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features.NaiveBayesClassifierMapReduce_Continuous_Features.java
/**
 * @param args
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    int number_of_classes = 1;
    int number_of_features = 1;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Job job = new Job(conf, "NaiveBayesClassifierMapReduce_Continuous_Features");
    job.setJarByClass(NaiveBayesClassifierMapReduce_Continuous_Features.class);
    conf = job.getConfiguration(); // This line is mandatory.

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(MapArrayWritable.class);

    job.setMapperClass(NaiveBayesClassifierMap_Continuous_Features.class);
    job.setReducerClass(NaiveBayesClassifierReduce_Continuous_Features.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(1);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    Path out = new Path(args[1]);
    if (fs.exists(out)) {
        fs.delete(out, true);
    }
    FileOutputFormat.setOutputPath(job, out);

    number_of_classes = Integer.parseInt(args[2]);
    number_of_features = Integer.parseInt(args[3]);
    conf.setInt("number_of_classes", number_of_classes);
    conf.setInt("number_of_features", number_of_features);

    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:com.mongodb.hadoop.util.MongoTool.java
License:Apache License
private int runMapReduceJob(final Configuration conf) throws IOException {
    final Job job = Job.getInstance(conf, getJobName());
    /**
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here.
     * They override any XML level values.
     * Note that -D<space> is important - no space will
     * not work as it gets picked up by Java itself.
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);

    LOG.debug("Mapper Class: " + mapper);
    LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
    job.setMapperClass(mapper);
    Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MongoConfigUtil.getReducer(conf));

    job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely e.g. print debug output.
     * Only works with foreground jobs.
     */
    final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground (wait for completion) or background?
     */
    final boolean background = MongoConfigUtil.isJobBackground(conf);
    try {
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            job.submit();
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results. {Verbose? "
                    + verbose + "}");
            return job.waitForCompletion(true) ? 0 : 1;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}
From source file:com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java
License:Apache License
/**
 * Configures the MapReduce mapper for the job.
 *
 * @param job The Hadoop MR job.
 * @throws IOException If there is an error.
 */
protected void configureMapper(Job job) throws IOException {
    // Set the map class in the job configuration.
    final FijiMapper<?, ?, ?, ?> mapper = getMapper();
    if (null == mapper) {
        throw new JobConfigurationException("Must specify a mapper");
    }
    if (mapper instanceof Configurable) {
        ((Configurable) mapper).setConf(job.getConfiguration());
    }
    job.setMapperClass(((Mapper<?, ?, ?, ?>) mapper).getClass());

    // Set the map output key and map output value types in the job configuration.
    job.setMapOutputKeyClass(mapper.getOutputKeyClass());
    job.setMapOutputValueClass(mapper.getOutputValueClass());

    configureAvro(job, mapper);
    configureHTableInput(job, mapper);
}
From source file:com.mozilla.hadoop.hbase.mapreduce.MultiScanTableMapReduceUtil.java
License:Apache License
/**
 * Use this before submitting a TableMap job. It will appropriately set up
 * the job.
 *
 * @param table
 *            The table name to read from.
 * @param scans
 *            The scan instances with the columns, time range etc.
 * @param mapper
 *            The mapper class to use.
 * @param outputKeyClass
 *            The class of the output key.
 * @param outputValueClass
 *            The class of the output value.
 * @param job
 *            The current job to adjust.
 * @throws IOException
 *             When setting up the details fails.
 */
@SuppressWarnings("rawtypes")
public static void initMultiScanTableMapperJob(final String table, final Scan[] scans,
        final Class<? extends TableMapper> mapper, final Class<? extends WritableComparable> outputKeyClass,
        final Class<? extends Writable> outputValueClass, final Job job) throws IOException {
    job.setInputFormatClass(MultiScanTableInputFormat.class);
    if (outputValueClass != null) {
        job.setMapOutputValueClass(outputValueClass);
    }
    if (outputKeyClass != null) {
        job.setMapOutputKeyClass(outputKeyClass);
    }
    job.setMapperClass(mapper);
    job.getConfiguration().set(MultiScanTableInputFormat.INPUT_TABLE, table);
    job.getConfiguration().set(MultiScanTableInputFormat.SCANS, convertScanArrayToString(scans));
}
From source file:com.mozilla.socorro.hadoop.CrashCountToHbase.java
License:LGPL
/**
 * @param args
 * @return
 * @throws IOException
 */
public Job initJob(String[] args) throws IOException {
    Job job = new Job(getConf());
    job.setJobName(NAME);
    job.setJarByClass(CrashCountToHbase.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));

    job.setMapperClass(CrashCountToHBaseMapper.class);
    job.setReducerClass(CrashCountToHBaseReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job;
}
From source file:com.mozilla.socorro.hadoop.DumpSizeTrends.java
License:LGPL
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 */
public Job initJob(String[] args) throws IOException, ParseException {
    conf.set("mapred.child.java.opts", "-Xmx1024m");
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);

    Map<byte[], byte[]> columns = new HashMap<byte[], byte[]>();
    columns.put(RAW_DATA_BYTES, DUMP_BYTES);
    columns.put(META_DATA_BYTES, JSON_BYTES);
    columns.put(PROCESSED_DATA_BYTES, JSON_BYTES);

    Job job = CrashReportJob.initJob(NAME, getConf(), DumpSizeTrends.class, DumpSizeTrendsMapper.class, null,
            DumpSizeTrendsReducer.class, columns, Text.class, Text.class, new Path(args[0]));
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    return job;
}
From source file:com.msd.gin.halyard.tools.HalyardBulkLoad.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkload [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + SKIP_INVALID_PROPERTY + "=true] [-D" + SPLIT_BITS_PROPERTY + "=8] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_path(s)> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
            RDFFormat.class, RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);

    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + args[1] + " -> " + args[2]);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    Map<String, Integer> contextSplitsMap = new HashMap<>();
    for (Map.Entry<String, String> me : getConf().getValByRegex(CONTEXT_SPLIT_REGEXP).entrySet()) {
        int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
        StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
        while (stk.hasMoreTokens()) {
            contextSplitsMap.put(stk.nextToken(), splits);
        }
    }
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true,
            getConf().getInt(SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}
From source file:com.msd.gin.halyard.tools.HalyardBulkUpdate.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkupdate [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_file_with_SPARQL_queries> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    getConf().setStrings(TABLE_NAME_PROPERTY, args[2]);

    Job job = Job.getInstance(getConf(), "HalyardBulkUpdate -> " + args[1] + " -> " + args[2]);
    NLineInputFormat.setNumLinesPerSplit(job, 1);
    job.setJarByClass(HalyardBulkUpdate.class);
    job.setMapperClass(SPARQLMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(NLineInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], false, 0, null)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Update Completed..");
            return 0;
        }
    }
    return -1;
}
From source file:com.msd.gin.halyard.tools.HalyardHiveLoad.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: hiveload -D" + RDF_MIME_TYPE_PROPERTY + "='application/ld+json' [-D"
                + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D" + HIVE_DATA_COLUMN_INDEX_PROPERTY + "=3] [-D"
                + BASE_URI_PROPERTY + "='http://my_base_uri/'] [-D" + HalyardBulkLoad.SPLIT_BITS_PROPERTY
                + "=8] [-D" + HalyardBulkLoad.DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D"
                + HalyardBulkLoad.OVERRIDE_CONTEXT_PROPERTY
                + "=true] <hive_table_name> <output_path> <hbase_table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
            RDFFormat.class, RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);

    Job job = Job.getInstance(getConf(), "HalyardHiveLoad -> " + args[1] + " -> " + args[2]);
    int i = args[0].indexOf('.');
    HCatInputFormat.setInput(job, i > 0 ? args[0].substring(0, i) : null, args[0].substring(i + 1));
    job.setJarByClass(HalyardHiveLoad.class);
    job.setMapperClass(HiveMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    Map<String, Integer> contextSplitsMap = new HashMap<>();
    for (Map.Entry<String, String> me : getConf().getValByRegex(HalyardBulkLoad.CONTEXT_SPLIT_REGEXP)
            .entrySet()) {
        int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
        StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
        while (stk.hasMoreTokens()) {
            contextSplitsMap.put(stk.nextToken(), splits);
        }
    }
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true,
            getConf().getInt(HalyardBulkLoad.SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}
From source file:com.msd.gin.halyard.tools.HalyardParallelExport.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(newOption("h", null, "Prints this help"));
    options.addOption(newOption("v", null, "Prints version"));
    options.addOption(newOption("s", "source_htable", "Source HBase table with Halyard RDF store"));
    options.addOption(newOption("q", "sparql_query",
            "SPARQL tuple or graph query with use of '" + PARALLEL_SPLIT_FUNCTION_URI + "' function"));
    options.addOption(newOption("t", "target_url",
            "file://<path>/<file_name>{0}.<ext> or hdfs://<path>/<file_name>{0}.<ext> or jdbc:<jdbc_connection>/<table_name>"));
    options.addOption(newOption("p", "property=value", "JDBC connection properties"));
    options.addOption(newOption("l", "driver_classpath", "JDBC driver classpath delimited by ':'"));
    options.addOption(newOption("c", "driver_class", "JDBC driver class name"));
    try {
        CommandLine cmd = new PosixParser().parse(options, args);
        if (args.length == 0 || cmd.hasOption('h')) {
            printHelp(options);
            return -1;
        }
        if (cmd.hasOption('v')) {
            Properties p = new Properties();
            try (InputStream in = HalyardExport.class
                    .getResourceAsStream("/META-INF/maven/com.msd.gin.halyard/hbasesail/pom.properties")) {
                if (in != null) p.load(in);
            }
            System.out.println("Halyard Parallel Export version " + p.getProperty("version", "unknown"));
            return 0;
        }
        if (!cmd.getArgList().isEmpty())
            throw new ExportException("Unknown arguments: " + cmd.getArgList().toString());
        for (char c : "sqt".toCharArray()) {
            if (!cmd.hasOption(c))
                throw new ExportException("Missing mandatory option: " + c);
        }
        for (char c : "sqtlc".toCharArray()) {
            String s[] = cmd.getOptionValues(c);
            if (s != null && s.length > 1)
                throw new ExportException("Multiple values for option: " + c);
        }
        String source = cmd.getOptionValue('s');
        String query = cmd.getOptionValue('q');
        if (!query.contains(PARALLEL_SPLIT_FUNCTION_NAME)) {
            throw new ExportException("Parallel export SPARQL query must contain '"
                    + PARALLEL_SPLIT_FUNCTION_URI + "' function.");
        }
        String target = cmd.getOptionValue('t');
        if ((target.startsWith("file:") || target.startsWith("hdfs:")) && !target.contains("{0}")) {
            throw new ExportException(
                    "Parallel export file target must contain '{0}' counter in the file path or name.");
        }
        getConf().set(SOURCE, source);
        getConf().set(QUERY, query);
        getConf().set(TARGET, target);
        String driver = cmd.getOptionValue('c');
        if (driver != null) {
            getConf().set(JDBC_DRIVER, driver);
        }
        String props[] = cmd.getOptionValues('p');
        if (props != null) {
            for (int i = 0; i < props.length; i++) {
                props[i] = Base64.encodeBase64String(props[i].getBytes(UTF8));
            }
            getConf().setStrings(JDBC_PROPERTIES, props);
        }
        TableMapReduceUtil.addDependencyJars(getConf(), HalyardExport.class, NTriplesUtil.class, Rio.class,
                AbstractRDFHandler.class, RDFFormat.class, RDFParser.class, HTable.class,
                HBaseConfiguration.class, AuthenticationProtos.class, Trace.class);
        HBaseConfiguration.addHbaseResources(getConf());
        Job job = Job.getInstance(getConf(), "HalyardParallelExport " + source + " -> " + target);
        String cp = cmd.getOptionValue('l');
        if (cp != null) {
            String jars[] = cp.split(":");
            for (int i = 0; i < jars.length; i++) {
                File f = new File(jars[i]);
                if (!f.isFile())
                    throw new ExportException("Invalid JDBC driver classpath element: " + jars[i]);
                job.addFileToClassPath(new Path(f.toURI()));
                jars[i] = f.getName();
            }
            job.getConfiguration().setStrings(JDBC_CLASSPATH, jars);
        }
        job.setJarByClass(HalyardParallelExport.class);
        job.setMaxMapAttempts(1);
        job.setMapperClass(ParallelExportMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Void.class);
        job.setNumReduceTasks(0);
        job.setInputFormatClass(IndexedInputFormat.class);
        job.setOutputFormatClass(NullOutputFormat.class);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            LOG.info("Parallel Export Completed..");
            return 0;
        }
        return -1;
    } catch (RuntimeException exp) {
        System.out.println(exp.getMessage());
        printHelp(options);
        throw exp;
    }
}