List of usage examples for org.apache.hadoop.mapreduce.Job#setInputFormatClass
public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException
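Before the project-specific examples, here is a minimal, self-contained sketch of the typical call site. The class name, paths, and the identity map-only setup are illustrative assumptions, not code taken from any of the sources below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setInputFormatClass-example");
        job.setJarByClass(SetInputFormatExample.class);

        // Tell the framework how to turn input files into (key, value) records.
        // This must be called before submission; once the job has been submitted,
        // setInputFormatClass throws IllegalStateException.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Identity, map-only job: TextInputFormat emits (LongWritable offset, Text line).
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}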
From source file:com.soteradefense.dga.LouvainRunner.java
License:Apache License
private int runMapreduceJob(String inputPath, String outputPath, DGAConfiguration conf) throws Exception {
    Configuration mrConf = new Configuration();
    for (Map.Entry<String, String> entry : conf.getSystemProperties().entrySet()) {
        mrConf.set(entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(mrConf);
    job.setJarByClass(LouvainRunner.class);

    Path in = new Path(inputPath);
    Path out = new Path(outputPath);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("CommunityCompression");
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LouvainVertexWritable.class);
    job.setMapperClass(CommunityCompression.Map.class);
    job.setReducerClass(CommunityCompression.Reduce.class);

    logger.debug("Running Mapreduce step with job configuration: {}", job);
    return job.waitForCompletion(false) ? 0 : 1;
}
From source file:com.splicemachine.mrio.api.SpliceTableMapReduceUtil.java
License:Apache License
/**
 * Use this before submitting a TableMap job. It will appropriately set up the job.
 *
 * @param table             The Splice table name to read from.
 * @param scan              The scan instance with the columns, time range etc.
 * @param mapper            The mapper class to use.
 * @param outputKeyClass    The class of the output key.
 * @param outputValueClass  The class of the output value.
 * @param job               The current job to adjust. Make sure the passed job is
 *                          carrying all necessary HBase configuration.
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *                          job classes via the distributed cache (tmpjars).
 * @throws IOException When setting up the details fails.
 */
public static void initTableMapperJob(String table, Scan scan, Class<? extends Mapper> mapper,
        Class<? extends WritableComparable> outputKeyClass, Class<? extends Object> outputValueClass, Job job,
        boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass) throws IOException {
    job.setInputFormatClass(inputFormatClass);
    if (outputValueClass != null)
        job.setMapOutputValueClass(outputValueClass);
    if (outputKeyClass != null)
        job.setMapOutputKeyClass(outputKeyClass);
    if (mapper != null)
        job.setMapperClass(mapper);
    job.getConfiguration().set(MRConstants.SPLICE_INPUT_TABLE_NAME, table);
    job.getConfiguration().set(TableInputFormat.SCAN, convertScanToString(scan));
    if (addDependencyJars) {
        addDependencyJars(job);
    }
}
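A hedged sketch of how this helper might be invoked. The table name "MY_TABLE", MyRowMapper, and SpliceInputFormat are illustrative placeholders, not part of the Splice Machine source above.

// Hypothetical call site for the helper above; all names besides the utility class are assumptions.
public static Job createSpliceScanJob(Configuration conf) throws IOException {
    Scan scan = new Scan();
    scan.setCaching(500);          // larger scanner caching for batch MapReduce reads
    scan.setCacheBlocks(false);    // avoid polluting the block cache from a full scan

    Job job = Job.getInstance(conf, "splice-table-map");
    SpliceTableMapReduceUtil.initTableMapperJob(
            "MY_TABLE",               // Splice table to read
            scan,                     // columns / time range to scan
            MyRowMapper.class,        // hypothetical Mapper implementation
            Text.class,               // map output key class
            LongWritable.class,       // map output value class
            job,
            true,                     // ship dependency jars via the distributed cache
            SpliceInputFormat.class); // becomes the job's InputFormat via setInputFormatClass
    return job;
}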
From source file:com.splout.db.benchmark.IdentityJob.java
License:Apache License
@Override
public int run(String[] params) throws Exception {
    // Validate params etc
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Identity Job");
    try {
        jComm.parse(params);
    } catch (ParameterException e) {
        System.err.println(e.getMessage());
        jComm.usage();
        System.exit(-1);
    }

    Path outP = new Path(outputPath);
    HadoopUtils.deleteIfExists(FileSystem.get(conf), outP);

    if (pangoolSchema == null) {
        // Use plain Hadoop API
        Job job = new Job(conf);
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outP);
        job.waitForCompletion(true);
    } else {
        if (groupBy == null) {
            System.err.println("If pangoolSchema is used, groupBy must also be used.");
            jComm.usage();
            System.exit(-1);
        }
        Schema schema = new Schema("sch", Fields.parse(pangoolSchema));
        Path inputP = new Path(inputPath);

        // Use Pangool API - parse CSV, etc
        TupleMRBuilder builder = new TupleMRBuilder(conf);
        TupleTextInputFormat parsingInputFormat = new TupleTextInputFormat(schema, skipHeading, false,
                separator.charAt(0), quotes.charAt(0), escape.charAt(0), FieldSelector.NONE, null);
        TupleTextOutputFormat outputFormat = new TupleTextOutputFormat(schema, false, separator.charAt(0),
                quotes.charAt(0), escape.charAt(0));

        builder.addIntermediateSchema(schema);
        builder.addInput(inputP, parsingInputFormat, new IdentityTupleMapper());
        builder.setGroupByFields(groupBy);
        builder.setOutput(outP, outputFormat, ITuple.class, NullWritable.class);
        builder.setTupleReducer(new IdentityTupleReducer());
        builder.setJarByClass(this.getClass());
        builder.createJob().waitForCompletion(true);
    }
    return 1;
}
From source file:com.splout.db.hadoop.TupleSampler.java
License:Apache License
public long sample(TablespaceSpec tablespace, Configuration hadoopConf, long sampleSize, Path outFile)
        throws TupleSamplerException {
    // 1 - Determine Input Splits
    // 2 - Launch sampling with the selected method
    // 3 - Recovering results
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat = new HashMap<InputSplit, InputFormat<ITuple, NullWritable>>();
    Map<InputSplit, RecordProcessor> recordProcessorPerSplit = new HashMap<InputSplit, RecordProcessor>();
    Map<InputSplit, Map<String, String>> specificHadoopConfMap = new HashMap<InputSplit, Map<String, String>>();
    Map<InputSplit, TableSpec> splitToTableSpec = new HashMap<InputSplit, TableSpec>();
    Map<InputSplit, JavascriptEngine> splitToJsEngine = new HashMap<InputSplit, JavascriptEngine>();
    try {
        for (Table table : tablespace.getPartitionedTables()) {
            // Initialize JavaScript engine if needed
            JavascriptEngine jsEngine = null;
            TableSpec tableSpec = table.getTableSpec();
            if (tableSpec.getPartitionByJavaScript() != null) {
                try {
                    jsEngine = new JavascriptEngine(tableSpec.getPartitionByJavaScript());
                } catch (Throwable e) {
                    throw new RuntimeException(e);
                }
            }
            for (TableInput tableFile : table.getFiles()) {
                @SuppressWarnings("deprecation")
                Job job = new Job(hadoopConf);
                FileInputFormat.setInputPaths(job, tableFile.getPaths());
                if (options.getMaxInputSplitSize() != null) {
                    logger.info("Using max input split size: " + options.getMaxInputSplitSize());
                    FileInputFormat.setMaxInputSplitSize(job, options.getMaxInputSplitSize());
                }
                job.setInputFormatClass(FileInputFormat.class);
                if (tableFile.getSpecificHadoopInputFormatContext() != null) {
                    for (Map.Entry<String, String> specificHadoopConf : tableFile
                            .getSpecificHadoopInputFormatContext().entrySet()) {
                        job.getConfiguration().set(specificHadoopConf.getKey(), specificHadoopConf.getValue());
                    }
                }
                for (InputSplit split : tableFile.getFormat().getSplits(job)) {
                    if (tableFile.getSpecificHadoopInputFormatContext() != null) {
                        specificHadoopConfMap.put(split, tableFile.getSpecificHadoopInputFormatContext());
                    }
                    splitToFormat.put(split, tableFile.getFormat());
                    recordProcessorPerSplit.put(split, tableFile.getRecordProcessor());
                    splitToTableSpec.put(split, tableSpec);
                    splitToJsEngine.put(split, jsEngine);
                    splits.add(split);
                }
            }
        }
        long retrievedSamples;
        if (samplingType.equals(SamplingType.RANDOM)) {
            try {
                RandomSamplingOptions defOptions = (RandomSamplingOptions) options;
                // Default sampling method
                retrievedSamples = randomSampling(sampleSize, hadoopConf, outFile, splits, splitToTableSpec,
                        splitToFormat, specificHadoopConfMap, recordProcessorPerSplit, splitToJsEngine,
                        defOptions.getMaxSplitsToVisit());
            } catch (ClassCastException ef) {
                throw new RuntimeException("Invalid options class: " + options.getClass() + " Expected:"
                        + RandomSamplingOptions.class);
            }
        } else {
            // Reservoir sampling over full data
            retrievedSamples = fullScanSampling(tablespace, sampleSize, hadoopConf, outFile, splits.size());
        }
        return retrievedSamples;
    } catch (IOException e) {
        throw new TupleSamplerException(e);
    } catch (InterruptedException e) {
        throw new TupleSamplerException(e);
    }
}
From source file:com.splunk.shuttl.integration.hadoop.hbase.CSVJobFactory.java
License:Apache License
/**
 * @return the configured Hadoop Job
 * @throws IOException
 */
public static Job getConfiguredJob(String[] arguments) throws IOException {
    Configuration jobConfiguration = new Configuration(true);
    // Load hbase-site.xml
    HBaseConfiguration.addHbaseResources(jobConfiguration);

    jobConfiguration.set("fs.default.name", arguments[0]);
    jobConfiguration.set("mapred.job.tracker", arguments[1]);
    jobConfiguration.set(JobConfigurationConstants.FILENAME, arguments[2]);
    jobConfiguration.set(JobConfigurationConstants.OUTPUT_PATH, arguments[3]);
    jobConfiguration.set(JobConfigurationConstants.TABLE_NAME, arguments[4]);
    jobConfiguration.set(JobConfigurationConstants.COLUMN_FAMILY, "d");

    Job job = new Job(jobConfiguration, "BucketToHbase");
    job.setJarByClass(CSVMapper.class);

    job.setMapperClass(CSVMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(TextInputFormat.class);

    return job;
}
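A short usage sketch, assuming the positional argument order implied by the set(...) calls above (default filesystem, job tracker, file name, output path, table name); all literal values below are placeholders.

// Hypothetical invocation of the factory above; every literal is a placeholder.
String[] arguments = new String[] {
        "hdfs://namenode:8020",  // fs.default.name
        "jobtracker:8021",       // mapred.job.tracker
        "bucket_data.csv",       // JobConfigurationConstants.FILENAME
        "/splunk/hbase/output",  // JobConfigurationConstants.OUTPUT_PATH
        "shuttl_buckets"         // JobConfigurationConstants.TABLE_NAME
};
Job job = CSVJobFactory.getConfiguredJob(arguments);
// The caller would still add input/output paths (and an output format) before submitting.
FileInputFormat.addInputPath(job, new Path(arguments[2]));
FileOutputFormat.setOutputPath(job, new Path(arguments[3]));
job.waitForCompletion(true);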
From source file:com.springsource.insight.plugin.hadoop.WordCount.java
License:Open Source License
public int run(String[] args) throws Exception {
    String INPUT = "src/test/resources";
    String OUTPUT = "target/out";

    Configuration conf = new Configuration();
    File targetFolder = FileUtil.detectTargetFolder(getClass());
    if (targetFolder == null) {
        throw new IllegalStateException("Cannot detect target folder");
    }
    File tempFolder = new File(targetFolder, "temp");
    conf.set("hadoop.tmp.dir", tempFolder.getAbsolutePath());

    Job job = new Job(conf, "wordcount");
    job.setJarByClass(WordCount.class);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountReducer.class);
    job.setReducerClass(WordCountReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileUtils.deleteDirectory(new File(OUTPUT)); // delete old output data
    FileInputFormat.addInputPath(job, new Path(INPUT));
    FileOutputFormat.setOutputPath(job, new Path(OUTPUT));

    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroconvert.AvroConversionBaseCreator.java
License:Apache License
@Override
public Job call() throws Exception {
    // We're explicitly disabling speculative execution
    conf.set("mapreduce.map.speculative", "false");
    conf.set("mapreduce.map.maxattempts", "1");

    conf.set("mapreduce.job.user.classpath.first", "true");
    conf.set("mapreduce.task.classpath.user.precedence", "true");
    conf.set("mapreduce.task.classpath.first", "true");

    addNecessaryJarsToJob(conf);

    Job job = Job.getInstance(conf);

    // IO formats
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(NullOutputFormat.class);

    // Mapper & job output
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // It's a map-only job
    job.setNumReduceTasks(0);

    // General configuration
    job.setJarByClass(getClass());

    return job;
}
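getInputFormatClass() and getMapperClass() are hooks supplied by subclasses; a minimal hypothetical subclass could look roughly like the sketch below. The method modifiers and the concrete class names are assumptions, not the actual StreamSets API.

// Hypothetical subclass sketch; the real hook signatures in
// AvroConversionBaseCreator may differ slightly.
public class TextToAvroCreator extends AvroConversionBaseCreator {

    @Override
    protected Class<? extends InputFormat> getInputFormatClass() {
        // Assumption: feed the mapper with line-oriented input listing files to convert.
        return TextInputFormat.class;
    }

    @Override
    protected Class<? extends Mapper> getMapperClass() {
        // Hypothetical mapper that performs the actual Avro conversion work.
        return TextToAvroMapper.class;
    }
}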
From source file:com.stride.cartrek.core.hbase.RowKeyDistributorTestBase.java
License:Apache License
private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
        int seekIntervalMinValue, int seekIntervalMaxValue)
        throws IOException, InterruptedException, ClassNotFoundException {
    int valuesCountInSeekInterval = writeTestData(origKeyPrefix, numValues, startWithValue,
            seekIntervalMinValue, seekIntervalMaxValue);

    // Reading data
    Configuration conf = testingUtility.getConfiguration();
    Job job = new Job(conf, "testMapReduceInternal()-Job");
    job.setJarByClass(this.getClass());
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // Substituting standard TableInputFormat which was set in
    // TableMapReduceUtil.initTableMapperJob(...)
    job.setInputFormatClass(WdTableInputFormat.class);
    keyDistributor.addInfo(job.getConfiguration());

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue(succeeded);

    long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
    Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);
}
From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterJob.java
License:Apache License
/**
 * Configures and submits the Map Reduce Job to Hadoop
 */
public int run(String[] args) throws Exception {

    String inputPath = null;
    String outputPath = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments. We're not using GenericOptionsParser
    // to prevent having to include commons.cli as a dependency.
    for (int index = 0; index < args.length; index++) {
        try {
            if (ARGNAME_INPATH.equals(args[index])) {
                inputPath = args[++index];
            } else if (ARGNAME_OUTPATH.equals(args[index])) {
                outputPath = args[++index];
            } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) {
                s3AccessKey = args[++index];
            } else if (ARGNAME_S3SECRETKEY.equals(args[index])) {
                s3SecretKey = args[++index];
            } else if (ARGNAME_MAXFILES.equals(args[index])) {
                // FIXME - No use of static methods
                WarcFileFilter.setMax(Long.parseLong(args[++index]));
            } else if (ARGNAME_OVERWRITE.equals(args[index])) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[index]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }

    if (inputPath == null || outputPath == null) {
        usage();
        throw new IllegalArgumentException();
    }

    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();
    }

    // Create the Hadoop job.
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(GoogleAdsCounterJob.class);

    if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }

    // Scan the provided input path for WARC files.
    LOG.info("setting input path to '" + inputPath + "'");
    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // FIXME - I see the problem that you want to give a dynamic number to a
    // static class. My question is, is this really required? If we just
    // point to a file in S3, that should solve our problem.
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    // Delete the output path directory if it already exists and user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    /*
     * // Defines additional single text based output 'GoogleAdClient' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdClient", TextOutputFormat.class,
     *         Text.class, LongWritable.class);
     *
     * // Defines additional text based output 'GoogleAdType' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdType", TextOutputFormat.class,
     *         Text.class, LongWritable.class);
     */

    // Set which InputFormat class to use.
    job.setInputFormatClass(WARCInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormatClass(TextOutputFormat.class);

    /*
     * Using MultipleOutputs creates zero-sized default output e.g.: part-r-00000.
     * To prevent this, use LazyOutputFormat instead of
     * job.setOutputFormatClass(TextOutputFormat.class) in the Hadoop job configuration.
     */
    // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // job.setPartitionerClass(GoogleAdsCounterPartitioner.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    // job.setNumReduceTasks(4);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(GoogleAdsCounterMapper.class);
    // job.setMapperClass(CrawlMapper_AdStatsDetails.class);
    job.setReducerClass(GoogleAdsCounterReducer.class);

    // set combiner
    // job.setCombinerClass(GoogleAdsCounterReducer.class);

    // set job name
    job.setJobName("CommonCrawl Data Processing : Counting Google Ads");

    long startTime = System.currentTimeMillis();
    if (job.waitForCompletion(true)) {
        LOG.info("Job completion status : " + job.waitForCompletion(true));
        long endTime = System.currentTimeMillis();
        long difference = endTime - startTime;
        LOG.info("Elapsed milliseconds: " + difference);

        Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES);
        LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue());

        Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES);
        LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue());

        return 0;
    } else {
        return 1;
    }
}
From source file:com.talis.hadoop.rdf.collation.QuadsCollater.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    FileOutputFormat.setCompressOutput(job, true);

    job.setInputFormatClass(NQuadsInputFormat.class);
    job.setMapperClass(CollationMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(QuadWritable.class);

    job.setReducerClass(CollationReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : 1;
}