List of usage examples for org.apache.hadoop.mapreduce Job setInputFormatClass
public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException
From source file:com.cloudera.recordservice.pig.HCatRSLoader.java
License:Apache License
@Override public void setLocation(String location, Job job) throws IOException { HCatContext.INSTANCE.setConf(job.getConfiguration()).getConf().get() .setBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION, true); UDFContext udfContext = UDFContext.getUDFContext(); Properties udfProps = udfContext.getUDFProperties(this.getClass(), new String[] { signature }); job.getConfiguration().set(INNER_SIGNATURE, INNER_SIGNATURE_PREFIX + "_" + signature); RequiredFieldList requiredFieldsInfo = (RequiredFieldList) udfProps.get(PRUNE_PROJECTION_INFO); // get partitionFilterString stored in the UDFContext - it would have // been stored there by an earlier call to setPartitionFilter // call setInput on HCatInputFormat only in the frontend because internally // it makes calls to the hcat server - we don't want these to happen in // the backend // in the hadoop front end mapred.task.id property will not be set in // the Configuration if (udfProps.containsKey(HCatConstants.HCAT_PIG_LOADER_LOCATION_SET)) { for (Enumeration<Object> emr = udfProps.keys(); emr.hasMoreElements();) { PigHCatUtil.getConfigFromUDFProperties(udfProps, job.getConfiguration(), emr.nextElement().toString()); }//from w w w .j a v a 2 s . c o m if (!HCatUtil.checkJobContextIfRunningFromBackend(job)) { //Combine credentials and credentials from job takes precedence for freshness Credentials crd = jobCredentials.get(INNER_SIGNATURE_PREFIX + "_" + signature); job.getCredentials().addAll(crd); } } else { Job clone = new Job(job.getConfiguration()); HCatRSInputFormat.setInput(job, location, getPartitionFilterString()); InputJobInfo inputJobInfo = (InputJobInfo) HCatRSUtil .deserialize(job.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO)); // TODO: Add back special cases call when I find out where the code has moved. addSpecialCasesParametersForHCatLoader(job.getConfiguration(), inputJobInfo.getTableInfo()); // We will store all the new /changed properties in the job in the // udf context, so the the HCatInputFormat.setInput method need not //be called many times. for (Entry<String, String> keyValue : job.getConfiguration()) { String oldValue = clone.getConfiguration().getRaw(keyValue.getKey()); if ((oldValue == null) || (keyValue.getValue().equals(oldValue) == false)) { udfProps.put(keyValue.getKey(), keyValue.getValue()); } } udfProps.put(HCatConstants.HCAT_PIG_LOADER_LOCATION_SET, true); //Store credentials in a private hash map and not the udf context to // make sure they are not public. Credentials crd = new Credentials(); crd.addAll(job.getCredentials()); jobCredentials.put(INNER_SIGNATURE_PREFIX + "_" + signature, crd); clone.setInputFormatClass(HCatRSInputFormat.class); } // Need to also push projections by calling setOutputSchema on // HCatInputFormat - we have to get the RequiredFields information // from the UdfContext, translate it to an Schema and then pass it // The reason we do this here is because setLocation() is called by // Pig runtime at InputFormat.getSplits() and // InputFormat.createRecordReader() time - we are not sure when // HCatInputFormat needs to know about pruned projections - so doing it // here will ensure we communicate to HCatInputFormat about pruned // projections at getSplits() and createRecordReader() time if (requiredFieldsInfo != null) { // convert to hcatschema and pass to HCatInputFormat try { outputSchema = phutil.getHCatSchema(requiredFieldsInfo.getFields(), signature, this.getClass()); HCatRSInputFormat.setOutputSchema(job, outputSchema); } catch (Exception e) { throw new IOException(e); } } else { // else - this means pig's optimizer never invoked the pushProjection // method - so we need all fields and hence we should not call the // setOutputSchema on HCatInputFormat if (HCatUtil.checkJobContextIfRunningFromBackend(job)) { try { HCatSchema hcatTableSchema = (HCatSchema) udfProps.get(HCatConstants.HCAT_TABLE_SCHEMA); outputSchema = hcatTableSchema; HCatRSInputFormat.setOutputSchema(job, outputSchema); } catch (Exception e) { throw new IOException(e); } } } if (LOG.isDebugEnabled()) { LOG.debug("outputSchema=" + outputSchema); } job.setInputFormatClass(HCatRSInputFormat.class); }
From source file:com.cloudera.sa.hbasebulkload.HBASEBulkLoadDriver.java
@Override public int run(String[] args) throws Exception { Configuration config = getConf(); args = new GenericOptionsParser(config, args).getRemainingArgs(); if (args.length < 6) { /*System.out.println("hadoop jar HBASEBulkLoad.jar " + "com.cloudera.sa.hbasebulkload.HBASEBulkLoadDriver" + " <inputpath> <outputpath> <hbaseTable> <hbaseColumnFamily" + " \"<hbaseColumns (delimiter seperated)>\" <column delimiter>");*/ ToolRunner.printGenericCommandUsage(System.out); return 2; }/* ww w . j ava 2s . c om*/ String hbaseTab = args[2]; String hbaseColumnFamily = args[3]; String hbaseColumns = args[4]; String hbaseColumnSeperator = args[5]; config.set(HBASEBulkLoadConstants.HBASE_TABLE_KEY, hbaseTab.trim().toLowerCase(Locale.ENGLISH)); config.set(HBASEBulkLoadConstants.HBASE_COLUMN_FAMILY_KEY, hbaseColumnFamily); config.set(HBASEBulkLoadConstants.HBASE_COLUMNS_KEY, hbaseColumns.trim().toLowerCase(Locale.ENGLISH)); config.set(HBASEBulkLoadConstants.HBASE_COLUMN_SEPERATOR_KEY, hbaseColumnSeperator); System.out.println(2); Job job = Job.getInstance(config, this.getClass().getName() + "-" + hbaseTab); HBaseConfiguration.addHbaseResources(config); job.setInputFormatClass(TextInputFormat.class); job.setJarByClass(HBASEBulkLoadDriver.class); job.setMapperClass(HBASEBulkLoadKeyValueMapper.class); job.setMapOutputKeyClass(ImmutableBytesWritable.class); job.setMapOutputValueClass(Put.class); job.setCombinerClass(PutCombiner.class); job.setReducerClass(PutSortReducer.class); Connection connection = ConnectionFactory.createConnection(config); Table hTab = connection.getTable(TableName.valueOf(hbaseTab)); FileSystem.get(getConf()).delete(new Path(args[1]), true); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); //job.setOutputFormatClass(HFileOutputFormat2.class); TableMapReduceUtil.initTableReducerJob(hTab.getName().getNameAsString(), null, job); //job.setNumReduceTasks(0); TableMapReduceUtil.addDependencyJars(job); HFileOutputFormat2.configureIncrementalLoadMap(job, hTab); int exitCode = job.waitForCompletion(true) ? HBASEBulkLoadConstants.SUCCESS : HBASEBulkLoadConstants.FAILURE; System.out.println(8); if (HBASEBulkLoadConstants.SUCCESS == exitCode) { LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config); loader.doBulkLoad(new Path(args[1]), (HTable) hTab); connection.close(); } return exitCode; }
From source file:com.cloudera.sa.securewordcount.SecureWordCountDriver.java
@Override public int run(String[] args) throws Exception { Configuration config = getConf(); args = new GenericOptionsParser(config, args).getRemainingArgs(); if (args.length < 2) { ToolRunner.printGenericCommandUsage(System.out); return 2; }/*from w w w. j av a 2 s . c o m*/ Job job = Job.getInstance(config, this.getClass().getName() + "-wordcount"); job.setJarByClass(SecureWordCountDriver.class); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.cloudera.science.ml.hcatalog.HCatalogSource.java
License:Open Source License
@Override public void configureSource(Job job, int inputId) throws IOException { if (inputId == -1) { HCatInputFormat.setInput(job, info); job.setInputFormatClass(HCatInputFormat.class); } else {//from ww w. j a v a 2 s . c o m FormatBundle<HCatInputFormat> bundle = FormatBundle.forInput(HCatInputFormat.class); bundle.set(HCatConstants.HCAT_KEY_JOB_INFO, HCatUtil.serialize(info)); CrunchInputs.addInputPath(job, new Path(info.getTableName()), bundle, inputId); } }
From source file:com.cloudera.sqoop.mapreduce.MergeJob.java
License:Apache License
public boolean runMergeJob() throws IOException { Configuration conf = options.getConf(); Job job = new Job(conf); String userClassName = options.getClassName(); if (null == userClassName) { // Shouldn't get here. throw new IOException("Record class name not specified with " + "--class-name."); }/*w w w . j a v a2 s. c om*/ // Set the external jar to use for the job. String existingJar = options.getExistingJarName(); if (existingJar != null) { // User explicitly identified a jar path. LOG.debug("Setting job jar to user-specified jar: " + existingJar); job.getConfiguration().set("mapred.jar", existingJar); } else { // Infer it from the location of the specified class, if it's on the // classpath. try { Class<? extends Object> userClass = conf.getClassByName(userClassName); if (null != userClass) { String userJar = Jars.getJarPathForClass(userClass); LOG.debug("Setting job jar based on user class " + userClassName + ": " + userJar); job.getConfiguration().set("mapred.jar", userJar); } else { LOG.warn("Specified class " + userClassName + " is not in a jar. " + "MapReduce may not find the class"); } } catch (ClassNotFoundException cnfe) { throw new IOException(cnfe); } } try { Path oldPath = new Path(options.getMergeOldPath()); Path newPath = new Path(options.getMergeNewPath()); Configuration jobConf = job.getConfiguration(); FileSystem fs = FileSystem.get(jobConf); oldPath = oldPath.makeQualified(fs); newPath = newPath.makeQualified(fs); FileInputFormat.addInputPath(job, oldPath); FileInputFormat.addInputPath(job, newPath); jobConf.set(MERGE_OLD_PATH_KEY, oldPath.toString()); jobConf.set(MERGE_NEW_PATH_KEY, newPath.toString()); jobConf.set(MERGE_KEY_COL_KEY, options.getMergeKeyCol()); jobConf.set(MERGE_SQOOP_RECORD_KEY, userClassName); FileOutputFormat.setOutputPath(job, new Path(options.getTargetDir())); if (ExportJobBase.isSequenceFiles(jobConf, newPath)) { job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapperClass(MergeRecordMapper.class); } else { job.setMapperClass(MergeTextMapper.class); job.setOutputFormatClass(RawKeyTextOutputFormat.class); } jobConf.set("mapred.output.key.class", userClassName); job.setOutputValueClass(NullWritable.class); job.setReducerClass(MergeReducer.class); // Set the intermediate data types. job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(MergeRecord.class); // Make sure Sqoop and anything else we need is on the classpath. cacheJars(job, null); return this.runJob(job); } catch (InterruptedException ie) { throw new IOException(ie); } catch (ClassNotFoundException cnfe) { throw new IOException(cnfe); } }
From source file:com.cloudera.sqoop.mapreduce.MySQLDumpImportJob.java
License:Apache License
/** * Configure the inputformat to use for the job. */// www. j a v a2s .c om protected void configureInputFormat(Job job, String tableName, String tableClassName, String splitByCol) throws ClassNotFoundException, IOException { if (null == tableName) { LOG.error("mysqldump-based import cannot support free-form query imports."); LOG.error("Do not use --direct and --query together for MySQL."); throw new IOException("null tableName for MySQLDumpImportJob."); } ConnManager mgr = getContext().getConnManager(); String username = options.getUsername(); if (null == username || username.length() == 0) { DBConfiguration.configureDB(job.getConfiguration(), mgr.getDriverClass(), options.getConnectString()); } else { DBConfiguration.configureDB(job.getConfiguration(), mgr.getDriverClass(), options.getConnectString(), username, options.getPassword()); } String[] colNames = options.getColumns(); if (null == colNames) { colNames = mgr.getColumnNames(tableName); } String[] sqlColNames = null; if (null != colNames) { sqlColNames = new String[colNames.length]; for (int i = 0; i < colNames.length; i++) { sqlColNames[i] = mgr.escapeColName(colNames[i]); } } // It's ok if the where clause is null in DBInputFormat.setInput. String whereClause = options.getWhereClause(); // We can't set the class properly in here, because we may not have the // jar loaded in this JVM. So we start by calling setInput() with // DBWritable and then overriding the string manually. // Note that mysqldump also does *not* want a quoted table name. DataDrivenDBInputFormat.setInput(job, DBWritable.class, tableName, whereClause, mgr.escapeColName(splitByCol), sqlColNames); Configuration conf = job.getConfiguration(); conf.setInt(MySQLUtils.OUTPUT_FIELD_DELIM_KEY, options.getOutputFieldDelim()); conf.setInt(MySQLUtils.OUTPUT_RECORD_DELIM_KEY, options.getOutputRecordDelim()); conf.setInt(MySQLUtils.OUTPUT_ENCLOSED_BY_KEY, options.getOutputEnclosedBy()); conf.setInt(MySQLUtils.OUTPUT_ESCAPED_BY_KEY, options.getOutputEscapedBy()); conf.setBoolean(MySQLUtils.OUTPUT_ENCLOSE_REQUIRED_KEY, options.isOutputEncloseRequired()); String[] extraArgs = options.getExtraArgs(); if (null != extraArgs) { conf.setStrings(MySQLUtils.EXTRA_ARGS_KEY, extraArgs); } LOG.debug("Using InputFormat: " + inputFormatClass); job.setInputFormatClass(getInputFormatClass()); }
From source file:com.cloudera.test.UseHCat.java
License:Apache License
public int run(String[] args) throws Exception { Configuration conf = getConf(); args = new GenericOptionsParser(conf, args).getRemainingArgs(); // Get the input and output table names as arguments String inputTableName = args[0]; String outputTableName = args[1]; // Assume the default database String dbName = null;/*w w w .j ava 2 s.co m*/ Job job = new Job(conf, "UseHCat"); HCatInputFormat.setInput(job, dbName, inputTableName); job.setJarByClass(UseHCat.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); // An HCatalog record as input job.setInputFormatClass(HCatInputFormat.class); // Mapper emits a string as key and an integer as value job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); // Ignore the key for the reducer output; emitting an HCatalog record as value job.setOutputKeyClass(WritableComparable.class); job.setOutputValueClass(DefaultHCatRecord.class); job.setOutputFormatClass(HCatOutputFormat.class); HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, null)); HCatSchema s = HCatOutputFormat.getTableSchema(job); System.err.println("INFO: output schema explicitly set for writing:" + s); HCatOutputFormat.setSchema(job, s); return (job.waitForCompletion(true) ? 0 : 1); }
From source file:com.cloudera.traffic.AveragerRunner.java
License:Apache License
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = new Job(conf); job.setJarByClass(AveragerRunner.class); job.setMapperClass(AveragerMapper.class); job.setReducerClass(AveragerReducer.class); job.setCombinerClass(AveragerReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(AverageWritable.class); job.setInputFormatClass(TextInputFormat.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); job.waitForCompletion(true);/* w ww .jav a2 s . com*/ }
From source file:com.cloudy.mapred.base.JobUtil.java
License:Apache License
public static Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat, Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey, Class<? extends Writable> mapperValue, Class<? extends OutputFormat> outputFormat, Configuration conf) throws IOException { Job job = new Job(new Configuration(conf)); Configuration jobConf = job.getConfiguration(); if (mapper.equals(Mapper.class)) { throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer"); }/*w ww .j av a 2 s. c o m*/ job.setJarByClass(mapper); job.setInputFormatClass(inputFormat); jobConf.set("mapred.input.dir", inputPath.toString()); job.setMapperClass(mapper); job.setMapOutputKeyClass(mapperKey); job.setMapOutputValueClass(mapperValue); job.setOutputKeyClass(mapperKey); job.setOutputValueClass(mapperValue); jobConf.setBoolean("mapred.compress.map.output", true); job.setNumReduceTasks(0); job.setOutputFormatClass(outputFormat); jobConf.set("mapred.output.dir", outputPath.toString()); return job; }
From source file:com.cloudy.mapred.base.JobUtil.java
License:Apache License
/** * Create a map and reduce Hadoop job. Does not set the name on the job. * @param inputPath The input {@link org.apache.hadoop.fs.Path} * @param outputPath The output {@link org.apache.hadoop.fs.Path} * @param inputFormat The {@link org.apache.hadoop.mapreduce.InputFormat} * @param mapper The {@link org.apache.hadoop.mapreduce.Mapper} class to use * @param mapperKey The {@link org.apache.hadoop.io.Writable} key class. If the Mapper is a no-op, * this value may be null * @param mapperValue The {@link org.apache.hadoop.io.Writable} value class. If the Mapper is a no-op, * this value may be null * @param reducer The {@link org.apache.hadoop.mapreduce.Reducer} to use * @param reducerKey The reducer key class. * @param reducerValue The reducer value class. * @param outputFormat The {@link org.apache.hadoop.mapreduce.OutputFormat}. * @param conf The {@link org.apache.hadoop.conf.Configuration} to use. * @return The {@link org.apache.hadoop.mapreduce.Job}. * @throws IOException if there is a problem with the IO. * * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class) * @see #prepareJob(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path, Class, Class, Class, Class, Class, * org.apache.hadoop.conf.Configuration) */// w w w . j a v a 2 s .c om public static Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat, Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey, Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer, Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue, Class<? extends OutputFormat> outputFormat, Configuration conf) throws IOException { Job job = new Job(conf); Configuration jobConf = job.getConfiguration(); if (reducer.equals(Reducer.class)) { if (mapper.equals(Mapper.class)) { throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer"); } job.setJarByClass(mapper); } else { job.setJarByClass(reducer); } job.setInputFormatClass(inputFormat); jobConf.set("mapred.input.dir", inputPath.toString()); job.setMapperClass(mapper); if (mapperKey != null) { job.setMapOutputKeyClass(mapperKey); } if (mapperValue != null) { job.setMapOutputValueClass(mapperValue); } // jobConf.setBoolean("mapred.compress.map.output", true); job.setReducerClass(reducer); job.setOutputKeyClass(reducerKey); job.setOutputValueClass(reducerValue); job.setOutputFormatClass(outputFormat); jobConf.set("mapred.output.dir", outputPath.toString()); return job; }