List of usage examples for org.apache.hadoop.mapreduce Job setOutputFormatClass
public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException
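Before the project-specific examples below, here is a minimal sketch of the typical call pattern. The job name, key/value classes, output format (TextOutputFormat) and output path are illustrative assumptions, not taken from any of the listed projects; setOutputFormatClass must be called before the job is submitted, otherwise it throws IllegalStateException.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {
    // Minimal sketch: job name, key/value classes and output path are hypothetical placeholders.
    public static Job createJob(Configuration conf) throws IOException {
        Job job = Job.getInstance(conf, "setOutputFormatClass-example");
        job.setJarByClass(OutputFormatExample.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Choose the OutputFormat before submission; calling this on a submitted job throws IllegalStateException.
        job.setOutputFormatClass(TextOutputFormat.class);
        // File-based output formats additionally require an output path.
        FileOutputFormat.setOutputPath(job, new Path("/tmp/output"));
        return job;
    }
}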
From source file:com.rockstor.compact.GenGarbageIndexTool.java
License:Apache License
private Job createSubmittableJob(Configuration conf) throws IOException {
    Job job = new Job(conf, NAME);
    job.setJarByClass(GenGarbageIndexTool.class);
    Scan scan = new Scan();
    TableMapReduceUtil.initTableMapperJob(GarbageChunkDB.TAB_NAME, scan, GarbageChunkMapper.class,
            ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
    TableMapReduceUtil.setScannerCaching(job, batchSize);
    job.setReducerClass(GarbageChunkReduce.class);
    job.setPartitionerClass(GarbageChunkPartition.class);
    job.setCombinerClass(GarbageChunkCombine.class);
    job.setNumReduceTasks(Compactor.getInstance().getReduceNum());
    job.setOutputFormatClass(NullOutputFormat.class);
    LOG.info("init job " + NAME + " finished!");
    return job;
}
From source file:com.rockstor.compact.RecoveryTool.java
License:Apache License
private Job createSubmittableJob(Configuration conf) throws IOException {
    Job job = new Job(conf, NAME);
    job.setJarByClass(RecoveryTool.class);
    job.setInputFormatClass(CompactDirInputFormat.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapperClass(RecoveryMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(NullOutputFormat.class);
    LOG.info("init job " + NAME + " OK!");
    return job;
}
From source file:com.sa.npopa.samples.hbase.myMR.java
License:Apache License
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    String tableName = args[0];
    Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
    job.setJarByClass(myMR.class);
    Scan scan = new Scan();
    scan.setCacheBlocks(false);
    scan.setBatch(10);
    //scan.setFilter(new FirstKeyOnlyFilter()); // need to find another filter like key only.
    scan.setFilter(new KeyOnlyFilter());
    job.setOutputFormatClass(NullOutputFormat.class);
    TableMapReduceUtil.initTableMapperJob(tableName, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);
    job.setNumReduceTasks(0);
    return job;
}
From source file:com.sa.npopa.samples.hbase.RowCounter.java
License:Apache License
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    String tableName = args[0];
    String startKey = null;
    String endKey = null;
    long startTime = 0;
    long endTime = 0;

    StringBuilder sb = new StringBuilder();

    final String rangeSwitch = "--range=";
    final String startTimeArgKey = "--starttime=";
    final String endTimeArgKey = "--endtime=";
    final String expectedCountArg = "--expected-count=";

    // First argument is table name, starting from second
    for (int i = 1; i < args.length; i++) {
        if (args[i].startsWith(rangeSwitch)) {
            String[] startEnd = args[i].substring(rangeSwitch.length()).split(",", 2);
            if (startEnd.length != 2 || startEnd[1].contains(",")) {
                printUsage("Please specify range in such format as \"--range=a,b\" "
                        + "or, with only one boundary, \"--range=,b\" or \"--range=a,\"");
                return null;
            }
            startKey = startEnd[0];
            endKey = startEnd[1];
            continue;
        }
        if (args[i].startsWith(startTimeArgKey)) {
            startTime = Long.parseLong(args[i].substring(startTimeArgKey.length()));
            continue;
        }
        if (args[i].startsWith(endTimeArgKey)) {
            endTime = Long.parseLong(args[i].substring(endTimeArgKey.length()));
            continue;
        }
        if (args[i].startsWith(expectedCountArg)) {
            conf.setLong(EXPECTED_COUNT_KEY, Long.parseLong(args[i].substring(expectedCountArg.length())));
            continue;
        }
        // if no switch, assume column names
        sb.append(args[i]);
        sb.append(" ");
    }
    if (endTime < startTime) {
        printUsage("--endtime=" + endTime + " needs to be greater than --starttime=" + startTime);
        return null;
    }

    Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
    job.setJarByClass(RowCounter.class);
    Scan scan = new Scan();
    scan.setCacheBlocks(false);
    if (startKey != null && !startKey.equals("")) {
        scan.setStartRow(Bytes.toBytes(startKey));
    }
    if (endKey != null && !endKey.equals("")) {
        scan.setStopRow(Bytes.toBytes(endKey));
    }
    if (sb.length() > 0) {
        for (String columnName : sb.toString().trim().split(" ")) {
            String family = StringUtils.substringBefore(columnName, ":");
            String qualifier = StringUtils.substringAfter(columnName, ":");
            if (StringUtils.isBlank(qualifier)) {
                scan.addFamily(Bytes.toBytes(family));
            } else {
                scan.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier));
            }
        }
    }
    scan.setFilter(new FirstKeyOnlyFilter());
    scan.setTimeRange(startTime, endTime == 0 ? HConstants.LATEST_TIMESTAMP : endTime);
    job.setOutputFormatClass(NullOutputFormat.class);
    TableMapReduceUtil.initTableMapperJob(tableName, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);
    job.setNumReduceTasks(0);
    return job;
}
From source file:com.savy3.nonequijoin.MapOutputSampler.java
License:Apache License
/**
 * Driver for InputSampler MapReduce Job
 */
public static void runMap(Job job, Path sampleInputPath)
        throws IOException, IllegalStateException, ClassNotFoundException, InterruptedException {
    LOG.info("Running a MapReduce Job on Sample Input File" + sampleInputPath.toString());

    Configuration conf = new Configuration();
    conf.setBoolean("mapreduce.job.ubertask.enable", true);
    conf.set("numSamples", "" + (job.getNumReduceTasks() - 1));

    Job sampleJob = new Job(conf);
    sampleJob.setMapperClass(job.getMapperClass());
    sampleJob.setReducerClass(SampleKeyReducer.class);
    sampleJob.setJarByClass(job.getMapperClass());
    sampleJob.setMapOutputKeyClass(job.getMapOutputKeyClass());
    sampleJob.setMapOutputValueClass(job.getMapOutputValueClass());
    sampleJob.setOutputKeyClass(job.getMapOutputKeyClass());
    sampleJob.setOutputValueClass(NullWritable.class);
    sampleJob.setInputFormatClass(SequenceFileInputFormat.class);
    sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileInputFormat.addInputPath(sampleJob, sampleInputPath);
    FileSystem fs = FileSystem.get(conf);

    Path out = new Path(sampleInputPath.getParent(), "mapOut");
    fs.delete(out, true);

    SequenceFileOutputFormat.setOutputPath(sampleJob, out);

    sampleJob.waitForCompletion(true);

    LOG.info("Sample MapReduce Job Output File" + out.toString());

    Path partFile = new Path(out, "part-r-00000");
    Path tmpFile = new Path("/_tmp");
    fs.delete(tmpFile, true);
    fs.rename(partFile, tmpFile);
    fs.delete(sampleInputPath.getParent(), true);
    fs.rename(new Path("/_tmp"), sampleInputPath.getParent());

    LOG.info("Sample partitioning file copied to location " + sampleInputPath.getParent().toString());
}
From source file:com.scaleoutsoftware.soss.hserver.examples.NamedMapWordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: wordcount <input map> <output map> <threshold>");
        System.exit(2);
    }

    final int threshold = new Integer(otherArgs[2]);

    NamedMap<IntWritable, Text> inputMap = NamedMapFactory.getMap(otherArgs[0],
            new WritableSerializer<IntWritable>(IntWritable.class), new WritableSerializer<Text>(Text.class));

    NamedMap<Text, IntWritable> outputMap = NamedMapFactory.getMap(otherArgs[1],
            new WritableSerializer<Text>(Text.class), new WritableSerializer<IntWritable>(IntWritable.class));

    //Create the invocation grid
    InvocationGrid grid = HServerJob.getInvocationGridBuilder("WordCountIG").addJar("wordcount.jar").load();

    //Create hServer job
    Job job = new HServerJob(conf, "word count", false, grid);
    job.setJarByClass(NamedMapWordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(NamedMapInputFormat.class);
    job.setOutputFormatClass(GridOutputFormat.class);

    //Set named maps as input and output
    NamedMapInputFormat.setNamedMap(job, inputMap);
    GridOutputFormat.setNamedMap(job, outputMap);

    //Execute job
    job.waitForCompletion(true);

    //Assign invocation grid to the map, so parallel operation can be performed
    outputMap.setInvocationGrid(grid);

    //Run query to find words that are used more than threshold frequency
    Iterable<Text> words = outputMap.executeParallelQuery(new UsageFrequencyCondition(threshold));

    //Unload the invocation grid
    grid.unload();

    //Output resulting words and their frequencies
    System.out.println("Following words were used more than " + threshold + " times:");
    for (Text word : words) {
        System.out.println("\"" + word.toString() + "\" was used " + outputMap.get(word) + " times.");
    }
}
From source file:com.sematext.hbase.hut.UpdatesProcessingMrJob.java
License:Apache License
/**
 * Use this before submitting a TableMap job. It will appropriately set up
 * the job.
 *
 * @param table The table name.
 * @param scan The scan with the columns to scan.
 * @param up update processor implementation
 * @param job The job configuration.
 * @throws java.io.IOException When setting up the job fails.
 */
@SuppressWarnings("unchecked")
public static void initJob(String table, Scan scan, UpdateProcessor up, Job job) throws IOException {
    TableMapReduceUtil.initTableMapperJob(table, scan, UpdatesProcessingMapper.class, null, null, job);
    job.setJarByClass(UpdatesProcessingMrJob.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    job.getConfiguration().set(UpdatesProcessingMapper.HTABLE_NAME_ATTR, table);
    job.getConfiguration().set(UpdatesProcessingMapper.HUT_PROCESSOR_CLASS_ATTR, up.getClass().getName());
    job.getConfiguration().set(UpdatesProcessingMapper.HUT_PROCESSOR_DETAILS_ATTR,
            convertUpdateProcessorToString(up));

    job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false"); // TODO: explain
}
From source file:com.sematext.hbase.wd.RowKeyDistributorTestBase.java
License:Apache License
private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
        int seekIntervalMinValue, int seekIntervalMaxValue)
        throws IOException, InterruptedException, ClassNotFoundException {
    int valuesCountInSeekInterval = writeTestData(origKeyPrefix, numValues, startWithValue,
            seekIntervalMinValue, seekIntervalMaxValue);

    // Reading data
    Configuration conf = testingUtility.getConfiguration();
    Job job = new Job(conf, "testMapReduceInternal()-Job");
    job.setJarByClass(this.getClass());
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // Substituting standard TableInputFormat which was set in TableMapReduceUtil.initTableMapperJob(...)
    job.setInputFormatClass(WdTableInputFormat.class);
    keyDistributor.addInfo(job.getConfiguration());

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue(succeeded);

    long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
    Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);
}
From source file:com.sequenceiq.yarntest.mr.QuasiMonteCarlo.java
License:Apache License
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static JobID submitPiEstimationMRApp(String jobName, int numMaps, long numPoints, Path tmpDir,
        Configuration conf) throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(conf);
    //setup job conf
    job.setJobName(jobName);
    job.setJarByClass(QuasiMonteCarlo.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(QmcMapper.class);
    job.setReducerClass(QmcReducer.class);
    job.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    job.setSpeculativeExecution(false);

    //setup input/output directories
    final Path inDir = new Path(tmpDir, "in");
    final Path outDir = new Path(tmpDir, "out");
    FileInputFormat.setInputPaths(job, inDir);
    FileOutputFormat.setOutputPath(job, outDir);

    final FileSystem fs = FileSystem.get(conf);
    if (fs.exists(tmpDir)) {
        fs.delete(tmpDir, true);
        // throw new IOException("Tmp directory " + fs.makeQualified(tmpDir)
        //     + " already exists. Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    // try {
    //generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
        final Path file = new Path(inDir, "part" + i);
        final LongWritable offset = new LongWritable(i * numPoints);
        final LongWritable size = new LongWritable(numPoints);
        final SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, LongWritable.class,
                LongWritable.class, CompressionType.NONE);
        try {
            writer.append(offset, size);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i);
    }

    //start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    job.submit();
    // final double duration = (System.currentTimeMillis() - startTime)/1000.0;
    // System.out.println("Job Finished in " + duration + " seconds");
    return job.getJobID();
    // } finally {
    //     fs.delete(tmpDir, true);
    // }
}
From source file:com.shmsoft.dmass.main.MRFreeEedProcess.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // inventory dir holds all package (zip) files resulting from stage
    String projectFileName = args[0];
    String outputPath = args[1];
    logger.info("Running Hadoop job");
    logger.info("Input project file = " + projectFileName);
    logger.info("Output path = " + outputPath);

    // Hadoop configuration class
    Configuration configuration = getConf();
    // No speculative execution! Do not process the same file twice
    configuration.set("mapred.reduce.tasks.speculative.execution", "false");

    // TODO even in local mode, the first argument should not be the inventory
    // but write a complete project file instead
    Project project = Project.getProject();
    if (project == null || project.isEmpty()) {
        // configure Hadoop input files
        System.out.println("Reading project file " + projectFileName);
        project = new Project().loadFromFile(new File(projectFileName));
        Project.setProject(project);
    }
    project.setProperty(ParameterProcessing.OUTPUT_DIR_HADOOP, outputPath);
    // send complete project information to all mappers and reducers
    configuration.set(ParameterProcessing.PROJECT, project.toString());

    Settings.load();
    configuration.set(ParameterProcessing.SETTINGS_STR, Settings.getSettings().toString());
    configuration.set(ParameterProcessing.METADATA_FILE,
            Files.toString(new File(ColumnMetadata.metadataNamesFile), Charset.defaultCharset()));

    Job job = new Job(configuration);
    job.setJarByClass(MRFreeEedProcess.class);
    job.setJobName("MRFreeEedProcess");

    // Hadoop processes key-value pairs
    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(MapWritable.class);

    // set map and reduce classes
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    // Hadoop TextInputFormat class
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // String delim = "\u0001";
    // configuration.set("mapred.textoutputformat.separator", delim);
    // configuration.set("mapreduce.output.textoutputformat.separator", delim);

    logger.debug("project.isEnvHadoop() = {} ", project.isEnvHadoop());
    String inputPath = projectFileName;
    if (project.isEnvHadoop() || Settings.getSettings().isHadoopDebug()) {
        inputPath = formInputPath(project);
    }

    logger.debug("Ready to run, inputPath = {}, outputPath = {}", inputPath, outputPath);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    SHMcloudLogging.init(false);

    if (Settings.getSettings().isHadoopDebug()) {
        if (new File(outputPath).exists()) {
            Util.deleteDirectory(new File(outputPath));
        }
    }

    SolrIndex.getInstance().init();

    boolean success = job.waitForCompletion(true);
    if (project.isEnvHadoop() && project.isFsS3()) {
        transferResultsToS3(outputPath);
    }

    SolrIndex.getInstance().destroy();

    return success ? 0 : 1;
}