Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

List of usage examples for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job#setInputFormatClass.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
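
Before the full examples below, here is a minimal, self-contained sketch of the call (not taken from any of the source files on this page). It assumes plain text input and uses the identity Mapper purely so the example compiles and runs on its own.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "set-input-format-example");
        job.setJarByClass(SetInputFormatExample.class);

        // TextInputFormat reads each line as a <LongWritable offset, Text line> pair.
        // setInputFormatClass must be called before the job is submitted;
        // afterwards it throws IllegalStateException, as the prototype states.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Identity mapper: passes the <offset, line> pairs straight through.
        job.setMapperClass(Mapper.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(0);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}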

Usage

From source file:com.cloudera.recordservice.pig.HCatRSLoader.java

License:Apache License

@Override
public void setLocation(String location, Job job) throws IOException {
    HCatContext.INSTANCE.setConf(job.getConfiguration()).getConf().get()
            .setBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION, true);
    UDFContext udfContext = UDFContext.getUDFContext();
    Properties udfProps = udfContext.getUDFProperties(this.getClass(), new String[] { signature });
    job.getConfiguration().set(INNER_SIGNATURE, INNER_SIGNATURE_PREFIX + "_" + signature);

    RequiredFieldList requiredFieldsInfo = (RequiredFieldList) udfProps.get(PRUNE_PROJECTION_INFO);
    // get partitionFilterString stored in the UDFContext - it would have
    // been stored there by an earlier call to setPartitionFilter
    // call setInput on HCatInputFormat only in the frontend because internally
    // it makes calls to the hcat server - we don't want these to happen in
    // the backend
    // in the hadoop front end mapred.task.id property will not be set in
    // the Configuration
    if (udfProps.containsKey(HCatConstants.HCAT_PIG_LOADER_LOCATION_SET)) {
        for (Enumeration<Object> emr = udfProps.keys(); emr.hasMoreElements();) {
            PigHCatUtil.getConfigFromUDFProperties(udfProps, job.getConfiguration(),
                    emr.nextElement().toString());
        }
        if (!HCatUtil.checkJobContextIfRunningFromBackend(job)) {
            // Combine credentials; credentials from the job take precedence for freshness
            Credentials crd = jobCredentials.get(INNER_SIGNATURE_PREFIX + "_" + signature);
            job.getCredentials().addAll(crd);
        }
    } else {
        Job clone = new Job(job.getConfiguration());
        HCatRSInputFormat.setInput(job, location, getPartitionFilterString());
        InputJobInfo inputJobInfo = (InputJobInfo) HCatRSUtil
                .deserialize(job.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO));

        // TODO: Add back special cases call when I find out where the code has moved.
        addSpecialCasesParametersForHCatLoader(job.getConfiguration(), inputJobInfo.getTableInfo());

        // We will store all the new/changed properties of the job in the
        // udf context, so that the HCatInputFormat.setInput method need not
        // be called many times.
        for (Entry<String, String> keyValue : job.getConfiguration()) {
            String oldValue = clone.getConfiguration().getRaw(keyValue.getKey());
            if ((oldValue == null) || !keyValue.getValue().equals(oldValue)) {
                udfProps.put(keyValue.getKey(), keyValue.getValue());
            }
        }
        udfProps.put(HCatConstants.HCAT_PIG_LOADER_LOCATION_SET, true);
        //Store credentials in a private hash map and not the udf context to
        // make sure they are not public.
        Credentials crd = new Credentials();
        crd.addAll(job.getCredentials());
        jobCredentials.put(INNER_SIGNATURE_PREFIX + "_" + signature, crd);
        clone.setInputFormatClass(HCatRSInputFormat.class);
    }

    // Need to also push projections by calling setOutputSchema on
    // HCatInputFormat - we have to get the RequiredFields information
    // from the UdfContext, translate it to a Schema and then pass it on.
    // The reason we do this here is because setLocation() is called by
    // Pig runtime at InputFormat.getSplits() and
    // InputFormat.createRecordReader() time - we are not sure when
    // HCatInputFormat needs to know about pruned projections - so doing it
    // here will ensure we communicate to HCatInputFormat about pruned
    // projections at getSplits() and createRecordReader() time

    if (requiredFieldsInfo != null) {
        // convert to hcatschema and pass to HCatInputFormat
        try {
            outputSchema = phutil.getHCatSchema(requiredFieldsInfo.getFields(), signature, this.getClass());
            HCatRSInputFormat.setOutputSchema(job, outputSchema);
        } catch (Exception e) {
            throw new IOException(e);
        }
    } else {
        // else - this means pig's optimizer never invoked the pushProjection
        // method - so we need all fields and hence we should not call the
        // setOutputSchema on HCatInputFormat
        if (HCatUtil.checkJobContextIfRunningFromBackend(job)) {
            try {
                HCatSchema hcatTableSchema = (HCatSchema) udfProps.get(HCatConstants.HCAT_TABLE_SCHEMA);
                outputSchema = hcatTableSchema;
                HCatRSInputFormat.setOutputSchema(job, outputSchema);
            } catch (Exception e) {
                throw new IOException(e);
            }
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("outputSchema=" + outputSchema);
    }
    job.setInputFormatClass(HCatRSInputFormat.class);
}

From source file:com.cloudera.sa.hbasebulkload.HBASEBulkLoadDriver.java

@Override
public int run(String[] args) throws Exception {
    Configuration config = getConf();
    args = new GenericOptionsParser(config, args).getRemainingArgs();

    if (args.length < 6) {
        /*System.out.println("hadoop jar HBASEBulkLoad.jar "
         + "com.cloudera.sa.hbasebulkload.HBASEBulkLoadDriver"
         + " <inputpath> <outputpath> <hbaseTable> <hbaseColumnFamily"
         + " \"<hbaseColumns (delimiter seperated)>\" <column delimiter>");*/
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    String hbaseTab = args[2];
    String hbaseColumnFamily = args[3];
    String hbaseColumns = args[4];
    String hbaseColumnSeperator = args[5];
    config.set(HBASEBulkLoadConstants.HBASE_TABLE_KEY, hbaseTab.trim().toLowerCase(Locale.ENGLISH));
    config.set(HBASEBulkLoadConstants.HBASE_COLUMN_FAMILY_KEY, hbaseColumnFamily);
    config.set(HBASEBulkLoadConstants.HBASE_COLUMNS_KEY, hbaseColumns.trim().toLowerCase(Locale.ENGLISH));
    config.set(HBASEBulkLoadConstants.HBASE_COLUMN_SEPERATOR_KEY, hbaseColumnSeperator);
    System.out.println(2);
    Job job = Job.getInstance(config, this.getClass().getName() + "-" + hbaseTab);
    HBaseConfiguration.addHbaseResources(config);

    job.setInputFormatClass(TextInputFormat.class);
    job.setJarByClass(HBASEBulkLoadDriver.class);
    job.setMapperClass(HBASEBulkLoadKeyValueMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(Put.class);
    job.setCombinerClass(PutCombiner.class);
    job.setReducerClass(PutSortReducer.class);

    Connection connection = ConnectionFactory.createConnection(config);
    Table hTab = connection.getTable(TableName.valueOf(hbaseTab));

    FileSystem.get(getConf()).delete(new Path(args[1]), true);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    //job.setOutputFormatClass(HFileOutputFormat2.class);
    TableMapReduceUtil.initTableReducerJob(hTab.getName().getNameAsString(), null, job);
    //job.setNumReduceTasks(0);
    TableMapReduceUtil.addDependencyJars(job);
    HFileOutputFormat2.configureIncrementalLoadMap(job, hTab);

    int exitCode = job.waitForCompletion(true) ? HBASEBulkLoadConstants.SUCCESS
            : HBASEBulkLoadConstants.FAILURE;
    System.out.println(8);
    if (HBASEBulkLoadConstants.SUCCESS == exitCode) {
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config);
        loader.doBulkLoad(new Path(args[1]), (HTable) hTab);
        connection.close();
    }
    return exitCode;
}

From source file:com.cloudera.sa.securewordcount.SecureWordCountDriver.java

@Override
public int run(String[] args) throws Exception {
    Configuration config = getConf();
    args = new GenericOptionsParser(config, args).getRemainingArgs();

    if (args.length < 2) {

        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }
    Job job = Job.getInstance(config, this.getClass().getName() + "-wordcount");
    job.setJarByClass(SecureWordCountDriver.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;

}
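
The run(String[]) override above suggests this driver is a Hadoop Tool. Assuming it extends Configured, implements Tool, and has a no-arg constructor (none of which is shown in the excerpt), a typical entry point would launch it through ToolRunner so that generic Hadoop options are parsed before the remaining arguments reach run(). The class below is an illustrative sketch, not part of the original file.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class SecureWordCountMain {
    public static void main(String[] args) throws Exception {
        // ToolRunner handles generic options (-D, -files, -libjars, ...) and
        // passes the remaining arguments to SecureWordCountDriver.run().
        // Assumes SecureWordCountDriver is on the classpath and implements Tool.
        int exitCode = ToolRunner.run(new Configuration(), new SecureWordCountDriver(), args);
        System.exit(exitCode);
    }
}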

From source file:com.cloudera.science.ml.hcatalog.HCatalogSource.java

License:Open Source License

@Override
public void configureSource(Job job, int inputId) throws IOException {
    if (inputId == -1) {
        HCatInputFormat.setInput(job, info);
        job.setInputFormatClass(HCatInputFormat.class);
    } else {
        FormatBundle<HCatInputFormat> bundle = FormatBundle.forInput(HCatInputFormat.class);
        bundle.set(HCatConstants.HCAT_KEY_JOB_INFO, HCatUtil.serialize(info));
        CrunchInputs.addInputPath(job, new Path(info.getTableName()), bundle, inputId);
    }
}

From source file:com.cloudera.sqoop.mapreduce.MergeJob.java

License:Apache License

public boolean runMergeJob() throws IOException {
    Configuration conf = options.getConf();
    Job job = new Job(conf);

    String userClassName = options.getClassName();
    if (null == userClassName) {
        // Shouldn't get here.
        throw new IOException("Record class name not specified with " + "--class-name.");
    }

    // Set the external jar to use for the job.
    String existingJar = options.getExistingJarName();
    if (existingJar != null) {
        // User explicitly identified a jar path.
        LOG.debug("Setting job jar to user-specified jar: " + existingJar);
        job.getConfiguration().set("mapred.jar", existingJar);
    } else {
        // Infer it from the location of the specified class, if it's on the
        // classpath.
        try {
            Class<? extends Object> userClass = conf.getClassByName(userClassName);
            if (null != userClass) {
                String userJar = Jars.getJarPathForClass(userClass);
                LOG.debug("Setting job jar based on user class " + userClassName + ": " + userJar);
                job.getConfiguration().set("mapred.jar", userJar);
            } else {
                LOG.warn("Specified class " + userClassName + " is not in a jar. "
                        + "MapReduce may not find the class");
            }
        } catch (ClassNotFoundException cnfe) {
            throw new IOException(cnfe);
        }
    }

    try {
        Path oldPath = new Path(options.getMergeOldPath());
        Path newPath = new Path(options.getMergeNewPath());

        Configuration jobConf = job.getConfiguration();
        FileSystem fs = FileSystem.get(jobConf);
        oldPath = oldPath.makeQualified(fs);
        newPath = newPath.makeQualified(fs);

        FileInputFormat.addInputPath(job, oldPath);
        FileInputFormat.addInputPath(job, newPath);

        jobConf.set(MERGE_OLD_PATH_KEY, oldPath.toString());
        jobConf.set(MERGE_NEW_PATH_KEY, newPath.toString());
        jobConf.set(MERGE_KEY_COL_KEY, options.getMergeKeyCol());
        jobConf.set(MERGE_SQOOP_RECORD_KEY, userClassName);

        FileOutputFormat.setOutputPath(job, new Path(options.getTargetDir()));

        if (ExportJobBase.isSequenceFiles(jobConf, newPath)) {
            job.setInputFormatClass(SequenceFileInputFormat.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            job.setMapperClass(MergeRecordMapper.class);
        } else {
            job.setMapperClass(MergeTextMapper.class);
            job.setOutputFormatClass(RawKeyTextOutputFormat.class);
        }

        jobConf.set("mapred.output.key.class", userClassName);
        job.setOutputValueClass(NullWritable.class);

        job.setReducerClass(MergeReducer.class);

        // Set the intermediate data types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MergeRecord.class);

        // Make sure Sqoop and anything else we need is on the classpath.
        cacheJars(job, null);
        return this.runJob(job);
    } catch (InterruptedException ie) {
        throw new IOException(ie);
    } catch (ClassNotFoundException cnfe) {
        throw new IOException(cnfe);
    }
}

From source file:com.cloudera.sqoop.mapreduce.MySQLDumpImportJob.java

License:Apache License

/**
 * Configure the InputFormat to use for the job.
 */
protected void configureInputFormat(Job job, String tableName, String tableClassName, String splitByCol)
        throws ClassNotFoundException, IOException {

    if (null == tableName) {
        LOG.error("mysqldump-based import cannot support free-form query imports.");
        LOG.error("Do not use --direct and --query together for MySQL.");
        throw new IOException("null tableName for MySQLDumpImportJob.");
    }

    ConnManager mgr = getContext().getConnManager();
    String username = options.getUsername();
    if (null == username || username.length() == 0) {
        DBConfiguration.configureDB(job.getConfiguration(), mgr.getDriverClass(), options.getConnectString());
    } else {
        DBConfiguration.configureDB(job.getConfiguration(), mgr.getDriverClass(), options.getConnectString(),
                username, options.getPassword());
    }

    String[] colNames = options.getColumns();
    if (null == colNames) {
        colNames = mgr.getColumnNames(tableName);
    }

    String[] sqlColNames = null;
    if (null != colNames) {
        sqlColNames = new String[colNames.length];
        for (int i = 0; i < colNames.length; i++) {
            sqlColNames[i] = mgr.escapeColName(colNames[i]);
        }
    }

    // It's ok if the where clause is null in DBInputFormat.setInput.
    String whereClause = options.getWhereClause();

    // We can't set the class properly in here, because we may not have the
    // jar loaded in this JVM. So we start by calling setInput() with
    // DBWritable and then overriding the string manually.

    // Note that mysqldump also does *not* want a quoted table name.
    DataDrivenDBInputFormat.setInput(job, DBWritable.class, tableName, whereClause,
            mgr.escapeColName(splitByCol), sqlColNames);

    Configuration conf = job.getConfiguration();
    conf.setInt(MySQLUtils.OUTPUT_FIELD_DELIM_KEY, options.getOutputFieldDelim());
    conf.setInt(MySQLUtils.OUTPUT_RECORD_DELIM_KEY, options.getOutputRecordDelim());
    conf.setInt(MySQLUtils.OUTPUT_ENCLOSED_BY_KEY, options.getOutputEnclosedBy());
    conf.setInt(MySQLUtils.OUTPUT_ESCAPED_BY_KEY, options.getOutputEscapedBy());
    conf.setBoolean(MySQLUtils.OUTPUT_ENCLOSE_REQUIRED_KEY, options.isOutputEncloseRequired());
    String[] extraArgs = options.getExtraArgs();
    if (null != extraArgs) {
        conf.setStrings(MySQLUtils.EXTRA_ARGS_KEY, extraArgs);
    }

    LOG.debug("Using InputFormat: " + inputFormatClass);
    job.setInputFormatClass(getInputFormatClass());
}

From source file:com.cloudera.test.UseHCat.java

License:Apache License

public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    // Get the input and output table names as arguments
    String inputTableName = args[0];
    String outputTableName = args[1];
    // Assume the default database
    String dbName = null;

    Job job = new Job(conf, "UseHCat");
    HCatInputFormat.setInput(job, dbName, inputTableName);
    job.setJarByClass(UseHCat.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    // An HCatalog record as input
    job.setInputFormatClass(HCatInputFormat.class);

    // Mapper emits a string as key and an integer as value
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Ignore the key for the reducer output; emitting an HCatalog record as value
    job.setOutputKeyClass(WritableComparable.class);
    job.setOutputValueClass(DefaultHCatRecord.class);
    job.setOutputFormatClass(HCatOutputFormat.class);

    HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, null));
    HCatSchema s = HCatOutputFormat.getTableSchema(job);
    System.err.println("INFO: output schema explicitly set for writing:" + s);
    HCatOutputFormat.setSchema(job, s);
    return (job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.cloudera.traffic.AveragerRunner.java

License:Apache License

public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    Job job = new Job(conf);
    job.setJarByClass(AveragerRunner.class);
    job.setMapperClass(AveragerMapper.class);
    job.setReducerClass(AveragerReducer.class);
    job.setCombinerClass(AveragerReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(AverageWritable.class);
    job.setInputFormatClass(TextInputFormat.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    job.waitForCompletion(true);
}

From source file:com.cloudy.mapred.base.JobUtil.java

License:Apache License

public static Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends OutputFormat> outputFormat, Configuration conf)
        throws IOException {

    Job job = new Job(new Configuration(conf));
    Configuration jobConf = job.getConfiguration();

    if (mapper.equals(Mapper.class)) {
        throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
    }
    job.setJarByClass(mapper);

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    job.setMapOutputKeyClass(mapperKey);
    job.setMapOutputValueClass(mapperValue);
    job.setOutputKeyClass(mapperKey);
    job.setOutputValueClass(mapperValue);
    jobConf.setBoolean("mapred.compress.map.output", true);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
}
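
A hypothetical caller of this map-only helper might look like the following sketch. The pass-through mapper, the SequenceFile formats, and the idea that the input SequenceFiles contain <Text, IntWritable> records are illustrative assumptions, not taken from the original project; JobUtil is assumed to be on the classpath.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class PrepareJobMapOnlyExample {

    // Trivial pass-through mapper, defined here only so the example compiles.
    public static class PassThroughMapper extends Mapper<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void map(Text key, IntWritable value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Map-only job: prepareJob sets zero reduce tasks and reuses the
        // mapper's key/value classes as the job output classes.
        Job job = JobUtil.prepareJob(new Path(args[0]), new Path(args[1]),
                SequenceFileInputFormat.class, PassThroughMapper.class,
                Text.class, IntWritable.class,
                SequenceFileOutputFormat.class, conf);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}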

From source file:com.cloudy.mapred.base.JobUtil.java

License:Apache License

/**
 * Create a map and reduce Hadoop job.  Does not set the name on the job.
 * @param inputPath The input {@link org.apache.hadoop.fs.Path}
 * @param outputPath The output {@link org.apache.hadoop.fs.Path}
 * @param inputFormat The {@link org.apache.hadoop.mapreduce.InputFormat}
 * @param mapper The {@link org.apache.hadoop.mapreduce.Mapper} class to use
 * @param mapperKey The {@link org.apache.hadoop.io.Writable} key class.  If the Mapper is a no-op,
 *                  this value may be null
 * @param mapperValue The {@link org.apache.hadoop.io.Writable} value class.  If the Mapper is a no-op,
 *                    this value may be null
 * @param reducer The {@link org.apache.hadoop.mapreduce.Reducer} to use
 * @param reducerKey The reducer key class.
 * @param reducerValue The reducer value class.
 * @param outputFormat The {@link org.apache.hadoop.mapreduce.OutputFormat}.
 * @param conf The {@link org.apache.hadoop.conf.Configuration} to use.
 * @return The {@link org.apache.hadoop.mapreduce.Job}.
 * @throws IOException if there is a problem with the IO.
 *
 * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class)
 * @see #prepareJob(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path, Class, Class, Class, Class, Class,
 * org.apache.hadoop.conf.Configuration)
 */
public static Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
        Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
        Class<? extends OutputFormat> outputFormat, Configuration conf) throws IOException {

    Job job = new Job(conf);
    Configuration jobConf = job.getConfiguration();

    if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);
    } else {
        job.setJarByClass(reducer);
    }

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    if (mapperKey != null) {
        job.setMapOutputKeyClass(mapperKey);
    }
    if (mapperValue != null) {
        job.setMapOutputValueClass(mapperValue);
    }

    //    jobConf.setBoolean("mapred.compress.map.output", true);

    job.setReducerClass(reducer);
    job.setOutputKeyClass(reducerKey);
    job.setOutputValueClass(reducerValue);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
}
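
As a further illustration (again not from the original source), the map-and-reduce variant can be driven entirely with Hadoop's library classes: TokenCounterMapper emits <word, 1> pairs and IntSumReducer sums them, giving a word count. Because the helper deliberately does not name the job, the caller sets the name afterwards. JobUtil is assumed to be on the classpath.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class PrepareJobMapReduceExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Word count built from Hadoop's bundled mapper and reducer:
        // TokenCounterMapper emits <word, 1>, IntSumReducer sums the counts.
        Job job = JobUtil.prepareJob(new Path(args[0]), new Path(args[1]),
                TextInputFormat.class, TokenCounterMapper.class,
                Text.class, IntWritable.class,
                IntSumReducer.class, Text.class, IntWritable.class,
                TextOutputFormat.class, conf);
        // prepareJob does not set a job name, so do it here.
        job.setJobName("prepare-job-wordcount");
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}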