List of usage examples for org.apache.hadoop.mapreduce Job setOutputFormatClass
public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException
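Before the collected examples, here is a minimal, self-contained driver sketch showing where setOutputFormatClass typically sits in job configuration. The class, mapper, and path names are illustrative assumptions, not taken from any project below.

// Minimal sketch (hypothetical class/paths): a map-only job that reads text and
// writes key TAB value lines via TextOutputFormat, selected with setOutputFormatClass.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {
    // Hypothetical pass-through mapper; any Mapper with matching output types works.
    public static class PassThroughMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws java.io.IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "output-format-example");
        job.setJarByClass(OutputFormatExample.class);
        job.setMapperClass(PassThroughMapper.class);
        job.setNumReduceTasks(0); // map-only for this sketch
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(TextInputFormat.class);
        // The call this page documents: choose how and where job output is written.
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // existing input directory
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // must not already exist
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The examples below follow the same shape: configure the input format, mapper/reducer, and output format on the Job before submitting it.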
From source file:com.nearinfinity.blur.mapreduce.example.BlurExampleIndexerRebuild.java
License:Apache License
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration configuration = new Configuration();
    String[] otherArgs = new GenericOptionsParser(configuration, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: blurindexer <in> <out>");
        System.exit(2);
    }
    AnalyzerDefinition ad = new AnalyzerDefinition();
    ad.defaultDefinition = new ColumnDefinition(StandardAnalyzer.class.getName(), true, null);
    TableDescriptor descriptor = new TableDescriptor();
    descriptor.analyzerDefinition = ad;
    descriptor.compressionBlockSize = 32768;
    descriptor.compressionClass = DefaultCodec.class.getName();
    descriptor.isEnabled = true;
    descriptor.name = "test-table";
    descriptor.shardCount = 1;
    descriptor.cluster = "default";
    descriptor.tableUri = "./blur-testing";
    BlurTask blurTask = new BlurTask();
    blurTask.setTableDescriptor(descriptor);
    blurTask.setIndexingType(INDEXING_TYPE.REBUILD);
    blurTask.setOptimize(false);
    Job job = blurTask.configureJob(configuration);
    job.setJarByClass(BlurExampleIndexerRebuild.class);
    job.setMapperClass(BlurExampleMapper.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1], "job-" + System.currentTimeMillis()));
    long s = System.currentTimeMillis();
    boolean waitForCompletion = job.waitForCompletion(true);
    long e = System.currentTimeMillis();
    System.out.println("Completed in [" + (e - s) + " ms]");
    System.exit(waitForCompletion ? 0 : 1);
}
From source file:com.nearinfinity.blur.mapreduce.example.BlurExampleIndexerUpdate.java
License:Apache License
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration configuration = new Configuration();
    String[] otherArgs = new GenericOptionsParser(configuration, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: blurindexer <in> <out>");
        System.exit(2);
    }
    AnalyzerDefinition ad = new AnalyzerDefinition();
    ad.defaultDefinition = new ColumnDefinition(StandardAnalyzer.class.getName(), true, null);
    ZookeeperClusterStatus status = new ZookeeperClusterStatus("localhost");
    TableDescriptor descriptor = status.getTableDescriptor(false, "default", "test-table");
    BlurTask blurTask = new BlurTask();
    blurTask.setTableDescriptor(descriptor);
    blurTask.setIndexingType(INDEXING_TYPE.UPDATE);
    Job job = blurTask.configureJob(configuration);
    job.setJarByClass(BlurExampleIndexerUpdate.class);
    job.setMapperClass(BlurExampleMapper.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1], "job-" + System.currentTimeMillis()));
    boolean waitForCompletion = job.waitForCompletion(true);
    System.exit(waitForCompletion ? 0 : 1);
}
From source file:com.netflix.Aegisthus.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(Aegisthus.class);
    CommandLine cl = getOptions(args);
    if (cl == null) {
        return 1;
    }
    job.setInputFormatClass(AegisthusInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(CassReducer.class);
    List<Path> paths = Lists.newArrayList();
    if (cl.hasOption(OPT_INPUT)) {
        for (String input : cl.getOptionValues(OPT_INPUT)) {
            paths.add(new Path(input));
        }
    }
    if (cl.hasOption(OPT_INPUTDIR)) {
        paths.addAll(getDataFiles(job.getConfiguration(), cl.getOptionValue(OPT_INPUTDIR)));
    }
    TextInputFormat.setInputPaths(job, paths.toArray(new Path[0]));
    TextOutputFormat.setOutputPath(job, new Path(cl.getOptionValue(OPT_OUTPUT)));
    job.submit();
    System.out.println(job.getJobID());
    System.out.println(job.getTrackingURL());
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
From source file:com.netflix.bdp.s3.TestMRJob.java
License:Apache License
@Test
public void testMRJob() throws Exception {
    FileSystem mockS3 = mock(FileSystem.class);
    FileSystem s3 = S3_OUTPUT_PATH.getFileSystem(getConfiguration());
    if (s3 instanceof MockS3FileSystem) {
        ((MockS3FileSystem) s3).setMock(mockS3);
    } else {
        throw new RuntimeException("Cannot continue: S3 not mocked");
    }

    String commitUUID = UUID.randomUUID().toString();

    int numFiles = 3;
    Set<String> expectedFiles = Sets.newHashSet();
    for (int i = 0; i < numFiles; i += 1) {
        File file = temp.newFile(String.valueOf(i) + ".text");
        try (FileOutputStream out = new FileOutputStream(file)) {
            out.write(("file " + i).getBytes(StandardCharsets.UTF_8));
        }
        expectedFiles.add(new Path(S3_OUTPUT_PATH, "part-m-0000" + i + "-" + commitUUID).toString());
    }

    Job mrJob = Job.getInstance(MR_CLUSTER.getConfig(), "test-committer-job");
    Configuration conf = mrJob.getConfiguration();

    mrJob.setOutputFormatClass(S3TextOutputFormat.class);
    S3TextOutputFormat.setOutputPath(mrJob, S3_OUTPUT_PATH);

    File mockResultsFile = temp.newFile("committer.bin");
    mockResultsFile.delete();
    String committerPath = "file:" + mockResultsFile;
    conf.set("mock-results-file", committerPath);
    conf.set(UPLOAD_UUID, commitUUID);

    mrJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(mrJob, new Path("file:" + temp.getRoot().toString()));

    mrJob.setMapperClass(M.class);
    mrJob.setNumReduceTasks(0);

    mrJob.submit();
    Assert.assertTrue("MR job should succeed", mrJob.waitForCompletion(true));

    TestUtil.ClientResults results;
    try (ObjectInputStream in = new ObjectInputStream(
            FileSystem.getLocal(conf).open(new Path(committerPath)))) {
        results = (TestUtil.ClientResults) in.readObject();
    }

    Assert.assertEquals("Should not delete files", 0, results.deletes.size());
    Assert.assertEquals("Should not abort commits", 0, results.aborts.size());
    Assert.assertEquals("Should commit task output files", numFiles, results.commits.size());

    Set<String> actualFiles = Sets.newHashSet();
    for (CompleteMultipartUploadRequest commit : results.commits) {
        actualFiles.add("s3://" + commit.getBucketName() + "/" + commit.getKey());
    }

    Assert.assertEquals("Should commit the correct file paths", expectedFiles, actualFiles);
}
From source file:com.neusoft.hbase.test.hadoop.dataload.HFileOutputFormat2.java
License:Apache License
static void configureIncrementalLoad(Job job, HTable table, Class<? extends OutputFormat<?, ?>> cls)
        throws IOException {
    Configuration conf = job.getConfiguration();

    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(HFileOutputFormat2.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(PutSortReducer.class);
    } else if (Text.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(TextSortReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    conf.setStrings("io.serializations", conf.get("io.serializations"),
            MutationSerialization.class.getName(), ResultSerialization.class.getName(),
            KeyValueSerialization.class.getName());

    // Use table's region boundaries for TOP split points.
    LOG.info("Looking up current regions for table " + Bytes.toString(table.getTableName()));
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions "
            + "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    configurePartitioner(job, startKeys);
    // Set compression algorithms based on column families
    configureCompression(table, conf);
    configureBloomType(table, conf);
    configureBlockSize(table, conf);

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.initCredentials(job);
    LOG.info("Incremental table " + Bytes.toString(table.getTableName()) + " output configured.");
}
From source file:com.neusoft.hbase.test.hadoop.dataload.HFileOutputFormatBase.java
License:Apache License
public static void configureIncrementalLoad(Job job, HTable table, Class<? extends OutputFormat<?, ?>> cls)
        throws IOException {
    Configuration conf = job.getConfiguration();

    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(HFileOutputFormatBase.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(PutSortReducer.class);
    } else if (Text.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(TextSortReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    conf.setStrings("io.serializations", conf.get("io.serializations"),
            MutationSerialization.class.getName(), ResultSerialization.class.getName(),
            KeyValueSerialization.class.getName());

    // Use table's region boundaries for TOP split points.
    LOG.info("Looking up current regions for table " + Bytes.toString(table.getTableName()));
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions "
            + "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    configurePartitioner(job, startKeys);
    // Set compression algorithms based on column families
    configureCompression(table, conf);
    configureBloomType(table, conf);
    configureBlockSize(table, conf);

    // TableMapReduceUtil.addDependencyJars(job);
    // TableMapReduceUtil.initCredentials(job);
    LOG.info("Incremental table " + Bytes.toString(table.getTableName()) + " output configured.");
}
From source file:com.ngdata.hbaseindexer.mr.HBaseMapReduceIndexerTool.java
License:Apache License
public int run(HBaseIndexingOptions hbaseIndexingOpts, JobProcessCallback callback) throws Exception {
    if (hbaseIndexingOpts.isDryRun) {
        return new IndexerDryRun(hbaseIndexingOpts, getConf(), System.out).run();
    }

    long programStartTime = System.currentTimeMillis();
    Configuration conf = getConf();

    IndexingSpecification indexingSpec = hbaseIndexingOpts.getIndexingSpecification();

    conf.set(HBaseIndexerMapper.INDEX_COMPONENT_FACTORY_KEY, indexingSpec.getIndexerComponentFactory());
    conf.set(HBaseIndexerMapper.INDEX_CONFIGURATION_CONF_KEY,
            new String(indexingSpec.getConfiguration(), Charsets.UTF_8));
    conf.set(HBaseIndexerMapper.INDEX_NAME_CONF_KEY, indexingSpec.getIndexerName());
    conf.set(HBaseIndexerMapper.TABLE_NAME_CONF_KEY, indexingSpec.getTableName());
    HBaseIndexerMapper.configureIndexConnectionParams(conf, indexingSpec.getIndexConnectionParams());

    IndexerComponentFactory factory = IndexerComponentFactoryUtil.getComponentFactory(
            indexingSpec.getIndexerComponentFactory(),
            new ByteArrayInputStream(indexingSpec.getConfiguration()),
            indexingSpec.getIndexConnectionParams());
    IndexerConf indexerConf = factory.createIndexerConf();

    Map<String, String> params = indexerConf.getGlobalParams();
    String morphlineFile = params.get(MorphlineResultToSolrMapper.MORPHLINE_FILE_PARAM);
    if (hbaseIndexingOpts.morphlineFile != null) {
        morphlineFile = hbaseIndexingOpts.morphlineFile.getPath();
    }
    if (morphlineFile != null) {
        conf.set(MorphlineResultToSolrMapper.MORPHLINE_FILE_PARAM, new File(morphlineFile).getName());
        ForkedMapReduceIndexerTool.addDistributedCacheFile(new File(morphlineFile), conf);
    }

    String morphlineId = params.get(MorphlineResultToSolrMapper.MORPHLINE_ID_PARAM);
    if (hbaseIndexingOpts.morphlineId != null) {
        morphlineId = hbaseIndexingOpts.morphlineId;
    }
    if (morphlineId != null) {
        conf.set(MorphlineResultToSolrMapper.MORPHLINE_ID_PARAM, morphlineId);
    }

    conf.setBoolean(HBaseIndexerMapper.INDEX_DIRECT_WRITE_CONF_KEY, hbaseIndexingOpts.isDirectWrite());

    if (hbaseIndexingOpts.fairSchedulerPool != null) {
        conf.set("mapred.fairscheduler.pool", hbaseIndexingOpts.fairSchedulerPool);
    }

    // switch off a false warning about allegedly not implementing Tool
    // also see http://hadoop.6.n7.nabble.com/GenericOptionsParser-warning-td8103.html
    // also see https://issues.apache.org/jira/browse/HADOOP-8183
    getConf().setBoolean("mapred.used.genericoptionsparser", true);

    if (hbaseIndexingOpts.log4jConfigFile != null) {
        Utils.setLogConfigFile(hbaseIndexingOpts.log4jConfigFile, getConf());
        ForkedMapReduceIndexerTool.addDistributedCacheFile(hbaseIndexingOpts.log4jConfigFile, conf);
    }

    Job job = Job.getInstance(getConf());
    job.setJobName(getClass().getSimpleName() + "/" + HBaseIndexerMapper.class.getSimpleName());
    job.setJarByClass(HBaseIndexerMapper.class);
    // job.setUserClassesTakesPrecedence(true);

    TableMapReduceUtil.initTableMapperJob(hbaseIndexingOpts.getScans(), HBaseIndexerMapper.class,
            Text.class, SolrInputDocumentWritable.class, job);

    // explicitly set hbase configuration on the job because TableMapReduceUtil overwrites it with the hbase defaults
    // (see HBASE-4297 which is not really fixed in hbase 0.94.6 on all code paths)
    HBaseConfiguration.merge(job.getConfiguration(), getConf());

    int mappers = new JobClient(job.getConfiguration()).getClusterStatus().getMaxMapTasks(); // MR1
    //mappers = job.getCluster().getClusterStatus().getMapSlotCapacity(); // Yarn only
    LOG.info("Cluster reports {} mapper slots", mappers);

    LOG.info("Using these parameters: " + "reducers: {}, shards: {}, fanout: {}, maxSegments: {}",
            new Object[] { hbaseIndexingOpts.reducers, hbaseIndexingOpts.shards, hbaseIndexingOpts.fanout,
                    hbaseIndexingOpts.maxSegments });

    if (hbaseIndexingOpts.isDirectWrite()) {
        CloudSolrServer solrServer = new CloudSolrServer(hbaseIndexingOpts.zkHost);
        solrServer.setDefaultCollection(hbaseIndexingOpts.collection);

        if (hbaseIndexingOpts.clearIndex) {
            clearSolr(indexingSpec.getIndexConnectionParams());
        }

        // Run a mapper-only MR job that sends index documents directly to a live Solr instance.
        job.setOutputFormatClass(NullOutputFormat.class);
        job.setNumReduceTasks(0);
        job.submit();
        callback.jobStarted(job.getJobID().toString(), job.getTrackingURL());
        if (!ForkedMapReduceIndexerTool.waitForCompletion(job, hbaseIndexingOpts.isVerbose)) {
            return -1; // job failed
        }
        commitSolr(indexingSpec.getIndexConnectionParams());
        ForkedMapReduceIndexerTool.goodbye(job, programStartTime);
        return 0;
    } else {
        FileSystem fileSystem = FileSystem.get(getConf());

        if (fileSystem.exists(hbaseIndexingOpts.outputDir)) {
            if (hbaseIndexingOpts.overwriteOutputDir) {
                LOG.info("Removing existing output directory {}", hbaseIndexingOpts.outputDir);
                if (!fileSystem.delete(hbaseIndexingOpts.outputDir, true)) {
                    LOG.error("Deleting output directory '{}' failed", hbaseIndexingOpts.outputDir);
                    return -1;
                }
            } else {
                LOG.error("Output directory '{}' already exists. Run with --overwrite-output-dir to "
                        + "overwrite it, or remove it manually", hbaseIndexingOpts.outputDir);
                return -1;
            }
        }

        int exitCode = ForkedMapReduceIndexerTool.runIndexingPipeline(job, callback, getConf(),
                hbaseIndexingOpts.asOptions(), programStartTime, fileSystem, null, -1, // File-based parameters
                -1, // num mappers, only of importance for file-based indexing
                hbaseIndexingOpts.reducers);

        if (hbaseIndexingOpts.isGeneratedOutputDir()) {
            LOG.info("Deleting generated output directory " + hbaseIndexingOpts.outputDir);
            fileSystem.delete(hbaseIndexingOpts.outputDir, true);
        }
        return exitCode;
    }
}
From source file:com.nistfortunetellers.cleaning.NISTClean.java
License:Apache License
/** Runs a Job that is Text in and out, and TextInput in and out, too! */
@SuppressWarnings({ "deprecation", "rawtypes" })
static void runTextJob(String jobName, Configuration jobConfig, String inputPath, String outputPath,
        Class<? extends Mapper> mapper, Class<? extends Reducer> reducer) {
    try {
        Job genericJob = new Job(jobConfig, jobName);
        // DEBUG
        //genericJob.setNumReduceTasks(0);
        // END DEBUG
        genericJob.setJarByClass(NISTClean.class);
        genericJob.setOutputKeyClass(Text.class);
        genericJob.setOutputValueClass(Text.class);
        genericJob.setMapperClass(mapper);
        genericJob.setReducerClass(reducer);
        genericJob.setInputFormatClass(TextInputFormat.class);
        genericJob.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(genericJob, new Path(inputPath));
        FileOutputFormat.setOutputPath(genericJob, new Path(outputPath));
        genericJob.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
From source file:com.nnapz.hbaseexplorer.mr.TableStats.java
License:Apache License
/**
 * M/R Job setup. No reduce.
 *
 * @param conf      a suitable hadoop+hbase configuration
 * @param tableName the table we want to get stats from
 * @return the Job object, to be started
 * @throws java.io.IOException any hadoop IO problem
 */
public static Job createSubmittableJob(Configuration conf, String tableName) throws IOException {
    Job job = new Job(conf, NAME + "_" + tableName);
    if (job.getJar() == null) {
        job.setJarByClass(TableStats.class); // otherwise set in conf already
    }

    Scan scan = new Scan();
    scan.setMaxVersions(10000); // todo fixme

    TableMapReduceUtil.initTableMapperJob(tableName, scan, RowCountMapper.class, Text.class, Result.class, job);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);
    return job;
}
From source file:com.panguso.lc.analysis.format.Logcenter.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    context = new ClassPathXmlApplicationContext("applicationContext.xml");
    Properties prop = context.getBean("configProperties", Properties.class);
    // String time = new DateTime().toString("yyyyMMddHH");
    // Expected properties, for example:
    // hadoop.lib=/application/format/lib/
    // hadoop.conf=/application/format/conf/
    // hadoop.src=/log/src/
    // hadoop.dest=/log/dest/
    // hadoop.archive=/log/archive/
    libPath = prop.getProperty("hadoop.lib");
    confPath = prop.getProperty("hadoop.conf");
    srcPath = prop.getProperty("hadoop.src");
    destPath = prop.getProperty("hadoop.dest");
    archivePath = prop.getProperty("hadoop.archive");
    Configuration conf = getConf();
    logger.info("libPath=" + libPath);
    logger.info("confPath=" + confPath);
    logger.info("srcPath=" + srcPath);
    logger.info("destPath=" + destPath);
    logger.info("archivePath=" + archivePath);
    FileSystem fs = FileSystem.get(conf);

    // Add every jar under libPath to the job classpath via the distributed cache.
    FileStatus[] fJars = fs.listStatus(new Path(libPath));
    for (FileStatus fileStatus : fJars) {
        String jar = libPath + fileStatus.getPath().getName();
        DistributedCache.addFileToClassPath(new Path(jar), conf, FileSystem.get(conf));
    }
    // Add every configuration file under confPath to the classpath as well.
    FileStatus[] fProp = fs.listStatus(new Path(confPath));
    for (FileStatus fileStatus : fProp) {
        DistributedCache.addArchiveToClassPath(new Path(confPath + fileStatus.getPath().getName()), conf,
                FileSystem.get(conf));
    }

    FileStatus[] fDirs = fs.listStatus(new Path(srcPath));
    if (fDirs != null && fDirs.length > 0) {
        for (FileStatus file : fDirs) {
            // Each source sub-directory name is used as the processing timestamp.
            String currentTime = file.getPath().getName();
            String srcPathWithTime = srcPath + currentTime + "/";
            String destPathWithTime = destPath + currentTime + "/";
            String archPathWithTime = archivePath + currentTime + "/";
            // Skip directories that have already been processed successfully.
            if (analysisService.isSuccessful(currentTime)) {
                continue;
            }
            // fs.delete(new Path(destPathWithTime), true);
            // if (!fs.exists(new Path(srcPathWithTime))) {
            //     logger.warn("outPath does not exist,inputPath=" + srcPathWithTime);
            //     analysisService.saveFailureJob(job.getJobName(), currentTime);
            //     return -1;
            // }
            // mapred.job.classpath.files uses ':' as its separator, so replace any ';'.
            Job job = new Job(conf);
            String jars = job.getConfiguration().get("mapred.job.classpath.files");
            job.getConfiguration().set("mapred.job.classpath.files", jars.replace(";", ":"));
            logger.info("current dir=" + currentTime);
            job.setJobName("format_" + currentTime);
            job.setJarByClass(Logcenter.class);
            job.setMapperClass(FormatAnalysisMapper.class);
            job.setReducerClass(FormatAnalysisReducer.class);
            job.setCombinerClass(FormatAnalysisReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            // job.setNumReduceTasks(0);
            FileInputFormat.addInputPath(job, new Path(srcPathWithTime));
            FileOutputFormat.setOutputPath(job, new Path(destPathWithTime));
            boolean result = false;
            try {
                result = job.waitForCompletion(true);
            } catch (FileAlreadyExistsException e) {
                logger.warn(e.getMessage(), e);
            }
            if (!result) {
                logger.warn("job execute failure!");
                analysisService.saveFailureJob(job.getJobName(), currentTime);
                continue;
                // return -1;
            }
            // Archive the processed input: remove any stale archive, then move the source directory.
            fs.delete(new Path(archPathWithTime), true);
            fs.rename(new Path(srcPathWithTime), new Path(archPathWithTime));
            analysisService.saveSuccessJob(job.getJobName(), currentTime);
        }
    }
    FileSystem.closeAll();
    return 0;
}