List of usage examples for the org.apache.hadoop.mapreduce.Job constructor
Job(Configuration conf) throws IOException
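Before the per-project examples below, here is a minimal, self-contained sketch of the pattern they all share: build a Configuration, pass it to the Job constructor, configure the job, and wait for completion. It is not taken from any of the listed projects; the class name and the input/output paths passed via args are placeholders. Note that Job(Configuration) is deprecated in newer Hadoop releases in favor of Job.getInstance(Configuration), but it is the constructor these examples use.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobConstructorExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Construct the job from an existing Configuration (deprecated in favor of Job.getInstance).
        Job job = new Job(conf);
        job.setJobName("identity-passthrough");
        job.setJarByClass(JobConstructorExample.class);
        // No mapper/reducer classes are set: the default Mapper and Reducer pass records through.
        // The default TextInputFormat produces LongWritable offsets and Text lines.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}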
From source file:com.savy3.nonequijoin.MapOutputSampler.java
License:Apache License
/**
 * Driver for InputSampler from the command line. Configures a JobConf
 * instance and calls {@link #writePartitionFile}.
 */
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    ArrayList<String> otherArgs = new ArrayList<String>();
    Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-inFormat".equals(args[i])) {
                job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
            } else if ("-keyClass".equals(args[i])) {
                job.setMapOutputKeyClass(Class.forName(args[++i]).asSubclass(WritableComparable.class));
            } else if ("-splitSample".equals(args[i])) {
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new SplitSampler<K, V>(numSamples, maxSplits);
            } else if ("-splitRandom".equals(args[i])) {
                System.out.println("Random sampling");
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else if ("-splitInterval".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new IntervalSampler<K, V>(pcnt, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    if (job.getNumReduceTasks() <= 1) {
        System.err.println("Sampler requires more than one reducer");
        return printUsage();
    }
    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }
    if (null == sampler) {
        sampler = new RandomSampler<K, V>(0.1, 10000, 10);
    }
    System.out.println("before paths");
    Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
    TotalOrderPartitioner.setPartitionFile(getConf(), outf);
    for (String s : otherArgs) {
        FileInputFormat.addInputPath(job, new Path(s));
    }
    MapOutputSampler.<K, V>writePartitionFile(job, sampler);
    return 0;
}
From source file:com.sematext.hbase.hut.TestHBaseHut.java
License:Apache License
public static void processUpdatesWithMrJob(Configuration configuration, HTable hTable, int mapBufferSize,
        long mapBufferSizeInBytes, int minRecordsToCompact, UpdateProcessor up)
        throws IOException, InterruptedException, ClassNotFoundException {
    System.out.println("Table contents BEFORE processing with MR job:");
    System.out.println(DebugUtil.getContentAsText(hTable));

    configuration.set("hut.mr.buffer.size", String.valueOf(mapBufferSize));
    configuration.set("hut.mr.buffer.size.bytes", String.valueOf(mapBufferSizeInBytes));
    configuration.set("hut.processor.minRecordsToCompact", String.valueOf(minRecordsToCompact));

    Job job = new Job(configuration);
    UpdatesProcessingMrJob.initJob(Bytes.toString(hTable.getTableName()), new Scan(), up, job);

    boolean success = job.waitForCompletion(true);
    Assert.assertTrue(success);

    System.out.println("Table contents AFTER processing with MR job:");
    System.out.println(DebugUtil.getContentAsText(hTable));
}
From source file:com.sematext.hbase.hut.TestHBaseHut.java
License:Apache License
@Test
public void testPartialUpdatesProcessingMrJob() throws IOException, InterruptedException, ClassNotFoundException {
    try {
        // Writing data
        for (int i = 0; i < 15; i++) {
            byte[] company;
            if (i % 2 == 0) {
                company = FORD;
            } else {
                company = CHRYSLER;
            }
            recordSale(hTable, company, i);
            Thread.sleep(200);
        }
        recordSale(hTable, TOYOTA, 23);

        System.out.println(DebugUtil.getContentAsText(hTable));

        Configuration configuration = testingUtility.getConfiguration();
        configuration.set("hut.mr.buffer.size", String.valueOf(10));
        configuration.set("hut.processor.tsMod", String.valueOf(300));

        Job job = new Job(configuration);
        UpdatesProcessingMrJob.initJob(TABLE_NAME, new Scan(), new StockSaleUpdateProcessor(), job);

        boolean success = job.waitForCompletion(true);
        Assert.assertTrue(success);

        // TODO: add code verification of proper partial compaction instead of manually observing in output
        System.out.println(DebugUtil.getContentAsText(hTable));

        StockSaleUpdateProcessor updateProcessor = new StockSaleUpdateProcessor();
        verifyLastSalesWithCompation(hTable, updateProcessor, FORD, new int[] { 14, 12, 10, 8, 6 });
        verifyLastSalesWithCompation(hTable, updateProcessor, CHRYSLER, new int[] { 13, 11, 9, 7, 5 });
        verifyLastSalesWithCompation(hTable, updateProcessor, TOYOTA, new int[] { 23 });
    } finally {
        // TODO: do we really need try/finally block here?
        testingUtility.shutdownMiniMapReduceCluster();
    }
}
From source file:com.sequenceiq.yarntest.mr.QuasiMonteCarlo.java
License:Apache License
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static JobID submitPiEstimationMRApp(String jobName, int numMaps, long numPoints, Path tmpDir,
        Configuration conf) throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(conf);
    // setup job conf
    job.setJobName(jobName);
    job.setJarByClass(QuasiMonteCarlo.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(QmcMapper.class);
    job.setReducerClass(QmcReducer.class);
    job.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    job.setSpeculativeExecution(false);

    // setup input/output directories
    final Path inDir = new Path(tmpDir, "in");
    final Path outDir = new Path(tmpDir, "out");
    FileInputFormat.setInputPaths(job, inDir);
    FileOutputFormat.setOutputPath(job, outDir);

    final FileSystem fs = FileSystem.get(conf);
    if (fs.exists(tmpDir)) {
        fs.delete(tmpDir, true);
        // throw new IOException("Tmp directory " + fs.makeQualified(tmpDir)
        //     + " already exists. Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    // try {
    // generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
        final Path file = new Path(inDir, "part" + i);
        final LongWritable offset = new LongWritable(i * numPoints);
        final LongWritable size = new LongWritable(numPoints);
        final SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, LongWritable.class,
                LongWritable.class, CompressionType.NONE);
        try {
            writer.append(offset, size);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i);
    }

    // start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    job.submit();
    // final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
    // System.out.println("Job Finished in " + duration + " seconds");
    return job.getJobID();
    // } finally {
    //     fs.delete(tmpDir, true);
    // }
}
From source file:com.shmsoft.dmass.main.MRFreeEedProcess.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // inventory dir holds all package (zip) files resulting from stage
    String projectFileName = args[0];
    String outputPath = args[1];
    logger.info("Running Hadoop job");
    logger.info("Input project file = " + projectFileName);
    logger.info("Output path = " + outputPath);

    // Hadoop configuration class
    Configuration configuration = getConf();
    // No speculative execution! Do not process the same file twice
    configuration.set("mapred.reduce.tasks.speculative.execution", "false");

    // TODO even in local mode, the first argument should not be the inventory
    // but write a complete project file instead
    Project project = Project.getProject();
    if (project == null || project.isEmpty()) {
        // configure Hadoop input files
        System.out.println("Reading project file " + projectFileName);
        project = new Project().loadFromFile(new File(projectFileName));
        Project.setProject(project);
    }
    project.setProperty(ParameterProcessing.OUTPUT_DIR_HADOOP, outputPath);

    // send complete project information to all mappers and reducers
    configuration.set(ParameterProcessing.PROJECT, project.toString());

    Settings.load();
    configuration.set(ParameterProcessing.SETTINGS_STR, Settings.getSettings().toString());
    configuration.set(ParameterProcessing.METADATA_FILE,
            Files.toString(new File(ColumnMetadata.metadataNamesFile), Charset.defaultCharset()));

    Job job = new Job(configuration);
    job.setJarByClass(MRFreeEedProcess.class);
    job.setJobName("MRFreeEedProcess");

    // Hadoop processes key-value pairs
    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(MapWritable.class);

    // set map and reduce classes
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    // Hadoop TextInputFormat class
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // String delim = "\u0001";
    // configuration.set("mapred.textoutputformat.separator", delim);
    // configuration.set("mapreduce.output.textoutputformat.separator", delim);

    logger.debug("project.isEnvHadoop() = {} ", project.isEnvHadoop());
    String inputPath = projectFileName;
    if (project.isEnvHadoop() || Settings.getSettings().isHadoopDebug()) {
        inputPath = formInputPath(project);
    }

    logger.debug("Ready to run, inputPath = {}, outputPath = {}", inputPath, outputPath);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    SHMcloudLogging.init(false);

    if (Settings.getSettings().isHadoopDebug()) {
        if (new File(outputPath).exists()) {
            Util.deleteDirectory(new File(outputPath));
        }
    }

    SolrIndex.getInstance().init();

    boolean success = job.waitForCompletion(true);
    if (project.isEnvHadoop() && project.isFsS3()) {
        transferResultsToS3(outputPath);
    }
    SolrIndex.getInstance().destroy();
    return success ? 0 : 1;
}
From source file:com.shopzilla.hadoop.mapreduce.MiniMRClusterContextMRTest.java
License:Apache License
@Test
public void testWordCount() throws Exception {
    Path input = new Path("/user/test/keywords_data");
    Path output = new Path("/user/test/word_count");

    Job job = new Job(configuration);
    job.setJobName("Word Count Test");
    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(SumReducer.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    assertTrue("All files from /data classpath directory should have been copied into HDFS",
            miniMRClusterContext.getFileSystem().exists(input));

    job.waitForCompletion(true);
    assertTrue("Output file should have been created",
            miniMRClusterContext.getFileSystem().exists(output));

    final LinkedList<String> expectedLines = new LinkedList<String>();
    expectedLines.add("goodbye\t1");
    expectedLines.add("hello\t1");
    expectedLines.add("world\t2");

    miniMRClusterContext.processData(output, new Function<String, Void>() {
        @Override
        public Void apply(String line) {
            assertEquals(expectedLines.pop(), line);
            return null;
        }
    });
    assertEquals(0, expectedLines.size());
}
From source file:com.splout.db.benchmark.IdentityJob.java
License:Apache License
@Override
public int run(String[] params) throws Exception {
    // Validate params etc
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Identity Job");
    try {
        jComm.parse(params);
    } catch (ParameterException e) {
        System.err.println(e.getMessage());
        jComm.usage();
        System.exit(-1);
    }

    Path outP = new Path(outputPath);
    HadoopUtils.deleteIfExists(FileSystem.get(conf), outP);

    if (pangoolSchema == null) {
        // Use plain Hadoop API
        Job job = new Job(conf);
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outP);
        job.waitForCompletion(true);
    } else {
        if (groupBy == null) {
            System.err.println("If pangoolSchema is used, groupBy must also be used.");
            jComm.usage();
            System.exit(-1);
        }

        Schema schema = new Schema("sch", Fields.parse(pangoolSchema));
        Path inputP = new Path(inputPath);

        // Use Pangool API - parse CSV, etc
        TupleMRBuilder builder = new TupleMRBuilder(conf);
        TupleTextInputFormat parsingInputFormat = new TupleTextInputFormat(schema, skipHeading, false,
                separator.charAt(0), quotes.charAt(0), escape.charAt(0), FieldSelector.NONE, null);
        TupleTextOutputFormat outputFormat = new TupleTextOutputFormat(schema, false, separator.charAt(0),
                quotes.charAt(0), escape.charAt(0));

        builder.addIntermediateSchema(schema);
        builder.addInput(inputP, parsingInputFormat, new IdentityTupleMapper());
        builder.setGroupByFields(groupBy);
        builder.setOutput(outP, outputFormat, ITuple.class, NullWritable.class);
        builder.setTupleReducer(new IdentityTupleReducer());
        builder.setJarByClass(this.getClass());
        builder.createJob().waitForCompletion(true);
    }
    return 1;
}
From source file:com.splout.db.hadoop.SchemaSampler.java
License:Apache License
public static Schema sample(Configuration conf, Path input, InputFormat<ITuple, NullWritable> inputFormat)
        throws IOException, InterruptedException {
    Schema schema = null;

    // sample schema from input path given the provided InputFormat
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    FileInputFormat.setInputPaths(job, input);
    // get first inputSplit
    List<InputSplit> inputSplits = inputFormat.getSplits(job);
    if (inputSplits == null || inputSplits.size() == 0) {
        throw new IOException(
                "Given input format doesn't produce any input split. Can't sample first record. PATH: " + input);
    }
    InputSplit inputSplit = inputSplits.get(0);
    TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
    TaskAttemptContext attemptContext;
    try {
        attemptContext = TaskAttemptContextFactory.get(conf, attemptId);
    } catch (Exception e) {
        throw new IOException(e);
    }

    RecordReader<ITuple, NullWritable> rReader = inputFormat.createRecordReader(inputSplit, attemptContext);
    rReader.initialize(inputSplit, attemptContext);

    if (!rReader.nextKeyValue()) {
        throw new IOException(
                "Can't read first record of first input split of the given path [" + input + "].");
    }

    // finally get the sample schema
    schema = rReader.getCurrentKey().getSchema();
    log.info("Sampled schema from [" + input + "] : " + schema);
    rReader.close();

    return schema;
}
From source file:com.splout.db.hadoop.TupleSampler.java
License:Apache License
public long sample(TablespaceSpec tablespace, Configuration hadoopConf, long sampleSize, Path outFile)
        throws TupleSamplerException {
    // 1 - Determine Input Splits
    // 2 - Launch sampling with the selected method
    // 3 - Recovering results
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat = new HashMap<InputSplit, InputFormat<ITuple, NullWritable>>();
    Map<InputSplit, RecordProcessor> recordProcessorPerSplit = new HashMap<InputSplit, RecordProcessor>();
    Map<InputSplit, Map<String, String>> specificHadoopConfMap = new HashMap<InputSplit, Map<String, String>>();
    Map<InputSplit, TableSpec> splitToTableSpec = new HashMap<InputSplit, TableSpec>();
    Map<InputSplit, JavascriptEngine> splitToJsEngine = new HashMap<InputSplit, JavascriptEngine>();

    try {
        for (Table table : tablespace.getPartitionedTables()) {
            // Initialize JavaScript engine if needed
            JavascriptEngine jsEngine = null;
            TableSpec tableSpec = table.getTableSpec();
            if (tableSpec.getPartitionByJavaScript() != null) {
                try {
                    jsEngine = new JavascriptEngine(tableSpec.getPartitionByJavaScript());
                } catch (Throwable e) {
                    throw new RuntimeException(e);
                }
            }
            for (TableInput tableFile : table.getFiles()) {
                @SuppressWarnings("deprecation")
                Job job = new Job(hadoopConf);
                FileInputFormat.setInputPaths(job, tableFile.getPaths());
                if (options.getMaxInputSplitSize() != null) {
                    logger.info("Using max input split size: " + options.getMaxInputSplitSize());
                    FileInputFormat.setMaxInputSplitSize(job, options.getMaxInputSplitSize());
                }
                job.setInputFormatClass(FileInputFormat.class);
                if (tableFile.getSpecificHadoopInputFormatContext() != null) {
                    for (Map.Entry<String, String> specificHadoopConf : tableFile
                            .getSpecificHadoopInputFormatContext().entrySet()) {
                        job.getConfiguration().set(specificHadoopConf.getKey(), specificHadoopConf.getValue());
                    }
                }
                for (InputSplit split : tableFile.getFormat().getSplits(job)) {
                    if (tableFile.getSpecificHadoopInputFormatContext() != null) {
                        specificHadoopConfMap.put(split, tableFile.getSpecificHadoopInputFormatContext());
                    }
                    splitToFormat.put(split, tableFile.getFormat());
                    recordProcessorPerSplit.put(split, tableFile.getRecordProcessor());
                    splitToTableSpec.put(split, tableSpec);
                    splitToJsEngine.put(split, jsEngine);
                    splits.add(split);
                }
            }
        }

        long retrievedSamples;
        if (samplingType.equals(SamplingType.RANDOM)) {
            try {
                RandomSamplingOptions defOptions = (RandomSamplingOptions) options;
                // Default sampling method
                retrievedSamples = randomSampling(sampleSize, hadoopConf, outFile, splits, splitToTableSpec,
                        splitToFormat, specificHadoopConfMap, recordProcessorPerSplit, splitToJsEngine,
                        defOptions.getMaxSplitsToVisit());
            } catch (ClassCastException ef) {
                throw new RuntimeException("Invalid options class: " + options.getClass() + " Expected:"
                        + RandomSamplingOptions.class);
            }
        } else {
            // Reservoir sampling over full data
            retrievedSamples = fullScanSampling(tablespace, sampleSize, hadoopConf, outFile, splits.size());
        }
        return retrievedSamples;
    } catch (IOException e) {
        throw new TupleSamplerException(e);
    } catch (InterruptedException e) {
        throw new TupleSamplerException(e);
    }
}
From source file:com.talis.hadoop.rdf.collation.QuadsCollater.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    FileOutputFormat.setCompressOutput(job, true);

    job.setInputFormatClass(NQuadsInputFormat.class);
    job.setMapperClass(CollationMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(QuadWritable.class);

    job.setReducerClass(CollationReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : 1;
}