List of usage examples for org.apache.hadoop.mapred JobConf setInputFormat
public void setInputFormat(Class<? extends InputFormat> theClass)
From source file:co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.java
License:Apache License
/** * setInput() takes a custom query and a separate "bounding query" to use * instead of the custom "count query" used by DBInputFormat. */// ww w .j a v a 2s .c o m public static void setInput(JobConf job, String inputQuery, String inputBoundingQuery, ArrayList params) throws IOException { DBInputFormat.setInput(job, GenericDBWritable.class, inputQuery, ""); if (inputBoundingQuery != null) { job.set(DBConfiguration.INPUT_BOUNDING_QUERY, inputBoundingQuery); } if (params != null) { DefaultStringifier<ArrayList> stringifier = new DefaultStringifier<ArrayList>(job, ArrayList.class); job.set(HIHOConf.QUERY_PARAMS, stringifier.toString(params)); logger.debug("Converted params and saved them into config"); } job.setInputFormat(DBQueryInputFormat.class); }
From source file:co.nubetech.hiho.job.DBQueryInputJob.java
License:Apache License
public void runJobs(Configuration conf, int jobCounter) throws IOException { try {/*from w w w . j a v a 2 s. c o m*/ checkMandatoryConfs(conf); } catch (HIHOException e1) { e1.printStackTrace(); throw new IOException(e1); } Job job = new Job(conf); for (Entry<String, String> entry : conf) { logger.warn("key, value " + entry.getKey() + "=" + entry.getValue()); } // logger.debug("Number of maps " + // conf.getInt("mapred.map.tasks", 1)); // conf.setInt(JobContext.NUM_MAPS, // conf.getInt("mapreduce.job.maps", 1)); // job.getConfiguration().setInt("mapred.map.tasks", 4); job.getConfiguration().setInt(MRJobConfig.NUM_MAPS, conf.getInt(HIHOConf.NUMBER_MAPPERS, 1)); logger.warn("Number of maps " + conf.getInt(MRJobConfig.NUM_MAPS, 1)); job.setJobName("Import job"); job.setJarByClass(DBQueryInputJob.class); String strategy = conf.get(HIHOConf.INPUT_OUTPUT_STRATEGY); OutputStrategyEnum os = OutputStrategyEnum.value(strategy); if (os == null) { throw new IllegalArgumentException("Wrong value of output strategy. Please correct"); } if (os != OutputStrategyEnum.AVRO) { switch (os) { case DUMP: { // job.setMapperClass(DBImportMapper.class); break; } /* * case AVRO: { job.setMapperClass(DBInputAvroMapper.class); // * need avro in cp // job.setJarByClass(Schema.class); // need * jackson which is needed by avro - ugly! 
// * job.setJarByClass(ObjectMapper.class); * job.setMapOutputKeyClass(NullWritable.class); * job.setMapOutputValueClass(AvroValue.class); * job.setOutputKeyClass(NullWritable.class); * job.setOutputValueClass(AvroValue.class); * job.setOutputFormatClass(AvroOutputFormat.class); * * AvroOutputFormat.setOutputPath(job, new * Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH))); break; } */ case DELIMITED: { job.setMapperClass(DBInputDelimMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setOutputFormatClass(NoKeyOnlyValueOutputFormat.class); NoKeyOnlyValueOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH))); } case JSON: { // job.setMapperClass(DBImportJsonMapper.class); // job.setJarByClass(ObjectMapper.class); break; } default: { job.setMapperClass(DBInputDelimMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setOutputFormatClass(NoKeyOnlyValueOutputFormat.class); NoKeyOnlyValueOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH))); break; } } String inputQuery = conf.get(DBConfiguration.INPUT_QUERY); String inputBoundingQuery = conf.get(DBConfiguration.INPUT_BOUNDING_QUERY); logger.debug("About to set the params"); DBQueryInputFormat.setInput(job, inputQuery, inputBoundingQuery, params); logger.debug("Set the params"); job.setNumReduceTasks(0); try { // job.setJarByClass(Class.forName(conf.get( // org.apache.hadoop.mapred.lib.db.DBConfiguration.DRIVER_CLASS_PROPERTY))); logger.debug("OUTPUT format class is " + job.getOutputFormatClass()); /* * org.apache.hadoop.mapreduce.OutputFormat<?, ?> output = * ReflectionUtils.newInstance(job.getOutputFormatClass(), * job.getConfiguration()); output.checkOutputSpecs(job); */ logger.debug("Class is " + ReflectionUtils 
.newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName()); job.waitForCompletion(false); if (conf.get(HIHOConf.INPUT_OUTPUT_LOADTO) != null) { generateHiveScript(conf, job, jobCounter); generatePigScript(conf, job); } } /* * catch (HIHOException h) { h.printStackTrace(); } */ catch (Exception e) { e.printStackTrace(); } catch (HIHOException e) { e.printStackTrace(); } } // avro to be handled differently, thanks to all the incompatibilities // in the apis. else { String inputQuery = conf.get(DBConfiguration.INPUT_QUERY); String inputBoundingQuery = conf.get(DBConfiguration.INPUT_BOUNDING_QUERY); logger.debug("About to set the params"); // co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.setInput(job, // inputQuery, inputBoundingQuery, params); logger.debug("Set the params"); JobConf jobConf = new JobConf(conf); try { GenericDBWritable queryWritable = getDBWritable(jobConf); Schema pair = DBMapper.getPairSchema(queryWritable.getColumns()); AvroJob.setMapOutputSchema(jobConf, pair); GenericRecordAvroOutputFormat.setOutputPath(jobConf, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH))); co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.setInput(jobConf, inputQuery, inputBoundingQuery, params); jobConf.setInputFormat(co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.class); jobConf.setMapperClass(DBInputAvroMapper.class); jobConf.setMapOutputKeyClass(NullWritable.class); jobConf.setMapOutputValueClass(AvroValue.class); jobConf.setOutputKeyClass(NullWritable.class); jobConf.setOutputValueClass(Text.class); jobConf.setOutputFormat(GenericRecordAvroOutputFormat.class); jobConf.setJarByClass(DBQueryInputJob.class); jobConf.setStrings("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization,org.apache.avro.mapred.AvroSerialization"); jobConf.setNumReduceTasks(0); /* * jobConf.setOutputFormat(org.apache.hadoop.mapred. 
* SequenceFileOutputFormat.class); * org.apache.hadoop.mapred.SequenceFileOutputFormat * .setOutputPath(jobConf, new * Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH))); */ JobClient.runJob(jobConf); } catch (Throwable e) { e.printStackTrace(); } } }
From source file:com.acme.extensions.data.SeedingHadoopAdapter.java
License:Apache License
@Override protected void set(JobConf conf) throws Exception { super.set(conf); // set the input format to this adapter conf.setInputFormat(this.getClass()); conf.set("seed", String.valueOf(seed)); }
From source file:com.aerospike.hadoop.examples.wordcountinput.WordCountInput.java
License:Apache License
/**
 * Configures and runs the "AerospikeWordCountInput" job.
 *
 * @param args command-line arguments; {@code args[0]} is the output path
 * @return 0 once the job has run, or 1 if the output path argument is missing
 * @throws Exception if job submission or execution fails
 */
public int run(final String[] args) throws Exception {
    log.info("run starting");

    // Fail with an exit code rather than an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 1) {
        log.error("missing required argument: output path");
        return 1;
    }

    final Configuration conf = getConf();

    JobConf job = new JobConf(conf, WordCountInput.class);
    job.setJobName("AerospikeWordCountInput");

    job.setInputFormat(AerospikeInputFormat.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormat(TextOutputFormat.class);

    FileOutputFormat.setOutputPath(job, new Path(args[0]));

    JobClient.runJob(job);

    log.info("finished");
    return 0;
}
From source file:com.alexholmes.hadooputils.combine.avro.mapred.CombineAvroInputFormatTest.java
License:Apache License
@SuppressWarnings("deprecation") public void testJob() throws Exception { JobConf job = new JobConf(); String dir = System.getProperty("test.dir", ".") + "/mapred"; Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesFile();// w w w .ja va 2s . co m job.setJobName("wordcount"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); job.setInputFormat(CombineAvroInputFormat.class); FileInputFormat.setInputPaths(job, new Path(dir + "/in")); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, true); WordCountUtil.setMeta(job); JobClient.runJob(job); WordCountUtil.validateCountsFile(); }
From source file:com.alexholmes.hadooputils.combine.seqfile.mapred.CombineSequenceFileJob.java
License:Apache License
/** * The driver for the MapReduce job./* w w w. j a va 2 s. c om*/ * * @param conf configuration * @param inputDirAsString input directory in CSV-form * @param outputDirAsString output directory * @return true if the job completed successfully * @throws java.io.IOException if something went wrong * @throws java.net.URISyntaxException if a URI wasn't correctly formed */ public boolean runJob(final Configuration conf, final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException { JobConf job = new JobConf(conf); job.setJarByClass(CombineSequenceFileJob.class); job.setJobName("seqfilecombiner"); job.setNumReduceTasks(0); job.setMapperClass(IdentityMapper.class); job.setInputFormat(CombineSequenceFileInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); FileInputFormat.setInputPaths(job, inputDirAsString); FileOutputFormat.setOutputPath(job, new Path(outputDirAsString)); Date startTime = new Date(); System.out.println("Job started: " + startTime); RunningJob jobResult = JobClient.runJob(job); Date endTime = new Date(); System.out.println("Job ended: " + endTime); System.out.println("The job took " + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds."); return jobResult.isSuccessful(); }
From source file:com.alexholmes.hadooputils.sort.Sort.java
License:Apache License
/** * The driver for the sort MapReduce job. * * @param jobConf sort configuration * @param numMapTasks number of map tasks * @param numReduceTasks number of reduce tasks * @param sampler sampler, if required * @param codecClass the compression codec for compressing final outputs * @param mapCodecClass the compression codec for compressing intermediary map outputs * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes * for the job output files * @param inputDirAsString input directory in CSV-form * @param outputDirAsString output directory * @return true if the job completed successfully * @throws IOException if something went wrong * @throws URISyntaxException if a URI wasn't correctly formed *///from w w w .j av a2 s . co m public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks, final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass, final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes, final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException { jobConf.setJarByClass(Sort.class); jobConf.setJobName("sorter"); JobClient client = new JobClient(jobConf); ClusterStatus cluster = client.getClusterStatus(); if (numMapTasks != null) { jobConf.setNumMapTasks(numMapTasks); } if (numReduceTasks != null) { jobConf.setNumReduceTasks(numReduceTasks); } else { int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9); String sortReduces = jobConf.get("test.sort.reduces_per_host"); if (sortReduces != null) { numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces); } // Set user-supplied (possibly default) job configs jobConf.setNumReduceTasks(numReduces); } jobConf.setMapperClass(IdentityMapper.class); jobConf.setReducerClass(SortReduce.class); jobConf.setInputFormat(SortInputFormat.class); jobConf.setMapOutputKeyClass(Text.class); 
jobConf.setMapOutputValueClass(Text.class); jobConf.setOutputKeyClass(Text.class); jobConf.setOutputValueClass(Text.class); if (mapCodecClass != null) { jobConf.setMapOutputCompressorClass(mapCodecClass); } if (codecClass != null) { jobConf.setBoolean("mapred.output.compress", true); jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class); } FileInputFormat.setInputPaths(jobConf, inputDirAsString); FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString)); if (sampler != null) { System.out.println("Sampling input to effect total-order sort..."); jobConf.setPartitionerClass(TotalOrderPartitioner.class); Path inputDir = FileInputFormat.getInputPaths(jobConf)[0]; FileSystem fileSystem = FileSystem.get(jobConf); if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) { inputDir = inputDir.getParent(); } inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf)); Path partitionFile = new Path(inputDir, "_sortPartitioning"); TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile); InputSampler.writePartitionFile(jobConf, sampler); URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning"); DistributedCache.addCacheFile(partitionUri, jobConf); DistributedCache.createSymlink(jobConf); } System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from " + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf) + " with " + jobConf.getNumReduceTasks() + " reduces."); Date startTime = new Date(); System.out.println("Job started: " + startTime); jobResult = JobClient.runJob(jobConf); Date endTime = new Date(); System.out.println("Job ended: " + endTime); System.out.println("The job took " + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds."); if (jobResult.isSuccessful()) { if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) { new LzoIndexer(jobConf).index(new 
Path(outputDirAsString)); } return true; } return false; }
From source file:com.benchmark.mapred.Join.java
License:Apache License
/** * The main driver for sort program./*from w w w . j av a 2 s . co m*/ * Invoke this method to submit the map/reduce job. * @throws IOException When there is communication problems with the * job tracker. */ public int run(String[] args) throws Exception { JobConf jobConf = new JobConf(getConf(), Sort.class); jobConf.setJobName("join"); jobConf.setMapperClass(IdentityMapper.class); jobConf.setReducerClass(IdentityReducer.class); JobClient client = new JobClient(jobConf); ClusterStatus cluster = client.getClusterStatus(); int num_maps = cluster.getTaskTrackers() * jobConf.getInt("test.sort.maps_per_host", 10); int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9); String sort_reduces = jobConf.get("test.sort.reduces_per_host"); if (sort_reduces != null) { num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces); } Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class; Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class; Class<? extends WritableComparable> outputKeyClass = BytesWritable.class; Class<? 
extends Writable> outputValueClass = TupleWritable.class; String op = "inner"; List<String> otherArgs = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { num_maps = Integer.parseInt(args[++i]); } else if ("-r".equals(args[i])) { num_reduces = Integer.parseInt(args[++i]); } else if ("-inFormat".equals(args[i])) { inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class); } else if ("-outFormat".equals(args[i])) { outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class); } else if ("-outKey".equals(args[i])) { outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class); } else if ("-outValue".equals(args[i])) { outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class); } else if ("-joinOp".equals(args[i])) { op = args[++i]; } else { otherArgs.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); // exits } } // Set user-supplied (possibly default) job configs jobConf.setNumMapTasks(num_maps); jobConf.setNumReduceTasks(num_reduces); if (otherArgs.size() < 2) { System.out.println("ERROR: Wrong number of parameters: "); return printUsage(); } FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.remove(otherArgs.size() - 1))); List<Path> plist = new ArrayList<Path>(otherArgs.size()); for (String s : otherArgs) { plist.add(new Path(s)); } jobConf.setInputFormat(CompositeInputFormat.class); jobConf.set("mapred.join.expr", CompositeInputFormat.compose(op, inputFormatClass, plist.toArray(new Path[0]))); jobConf.setOutputFormat(outputFormatClass); jobConf.setOutputKeyClass(outputKeyClass); jobConf.setOutputValueClass(outputValueClass); Date startTime = new Date(); System.out.println("Job started: " + startTime); 
JobClient.runJob(jobConf); Date end_time = new Date(); System.out.println("Job ended: " + end_time); System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds."); return 0; }
From source file:com.benchmark.mapred.MultiFileWordCount.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length < 2) { printUsage();// w w w. ja v a 2s.c o m return 1; } JobConf job = new JobConf(getConf(), MultiFileWordCount.class); job.setJobName("MultiFileWordCount"); //set the InputFormat of the job to our InputFormat job.setInputFormat(MyInputFormat.class); // the keys are words (strings) job.setOutputKeyClass(Text.class); // the values are counts (ints) job.setOutputValueClass(LongWritable.class); //use the defined mapper job.setMapperClass(MapClass.class); //use the WordCount Reducer job.setCombinerClass(LongSumReducer.class); job.setReducerClass(LongSumReducer.class); FileInputFormat.addInputPaths(job, args[0]); FileOutputFormat.setOutputPath(job, new Path(args[1])); JobClient.runJob(job); return 0; }
From source file:com.benchmark.mapred.PiEstimator.java
License:Apache License
/** * Run a map/reduce job for estimating Pi. * * @return the estimated value of Pi/*from www . ja v a 2s.c o m*/ */ public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf) throws IOException { //setup job conf jobConf.setJobName(PiEstimator.class.getSimpleName()); jobConf.setInputFormat(SequenceFileInputFormat.class); jobConf.setOutputKeyClass(BooleanWritable.class); jobConf.setOutputValueClass(LongWritable.class); jobConf.setOutputFormat(SequenceFileOutputFormat.class); jobConf.setMapperClass(PiMapper.class); jobConf.setNumMapTasks(numMaps); jobConf.setReducerClass(PiReducer.class); jobConf.setNumReduceTasks(1); // turn off speculative execution, because DFS doesn't handle // multiple writers to the same file. jobConf.setSpeculativeExecution(false); //setup input/output directories //final Path inDir = new Path(TMP_DIR, "in"); final Path inDir = new Path("/home/hadoop1/tmp_dir", "in"); System.out.println("inDir =" + inDir.toString()); //final Path outDir = new Path(TMP_DIR, "out"); final Path outDir = new Path("/home/hadoop1/tmp_dir", "out"); System.out.println("outDir =" + outDir.toString()); FileInputFormat.setInputPaths(jobConf, inDir); FileOutputFormat.setOutputPath(jobConf, outDir); final FileSystem fs = FileSystem.get(jobConf); if (fs.exists(TMP_DIR)) { throw new IOException( "Tmp directory " + fs.makeQualified(TMP_DIR) + " already exists. 
Please remove it first."); } if (!fs.mkdirs(inDir)) { throw new IOException("Cannot create input directory " + inDir); } try { //generate an input file for each map task for (int i = 0; i < numMaps; ++i) { final Path file = new Path(inDir, "part" + i); final LongWritable offset = new LongWritable(i * numPoints); final LongWritable size = new LongWritable(numPoints); final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobConf, file, LongWritable.class, LongWritable.class, CompressionType.NONE); try { writer.append(offset, size); } finally { writer.close(); } System.out.println("Wrote input for Map #" + i); } //start a map/reduce job System.out.println("Starting Job"); final long startTime = System.currentTimeMillis(); JobClient.runJob(jobConf); final double duration = (System.currentTimeMillis() - startTime) / 1000.0; System.out.println("Job Finished in " + duration + " seconds"); //read outputs Path inFile = new Path(outDir, "reduce-out"); LongWritable numInside = new LongWritable(); LongWritable numOutside = new LongWritable(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf); try { reader.next(numInside, numOutside); } finally { reader.close(); } //compute estimated value return BigDecimal.valueOf(4).setScale(20).multiply(BigDecimal.valueOf(numInside.get())) .divide(BigDecimal.valueOf(numMaps)).divide(BigDecimal.valueOf(numPoints)); } finally { fs.delete(TMP_DIR, true); } }