Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

List of usage examples for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setInputFormatClass.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
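
Before the per-project examples below, here is a minimal, self-contained driver sketch (not taken from the listed sources) showing where setInputFormatClass typically sits in job setup; it assumes plain text input and runs as a map-only identity job, so no custom Mapper or Reducer is required.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "set-input-format-example");
        job.setJarByClass(SetInputFormatExample.class);

        // Set the InputFormat for the job. This must be called while the job is still
        // being defined; once the job has been submitted, setInputFormatClass throws
        // IllegalStateException.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Map-only identity job: the default Mapper passes the (offset, line) records
        // produced by TextInputFormat straight through to the output format.
        job.setNumReduceTasks(0);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}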

Usage

From source file:co.cask.cdap.internal.app.runtime.batch.dataset.DataSetInputFormat.java

License:Apache License

public static void setInput(Job job, String inputDatasetName) {
    job.setInputFormatClass(DataSetInputFormat.class);
    job.getConfiguration().set(DataSetInputFormat.HCONF_ATTR_INPUT_DATASET, inputDatasetName);
}

From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.MultipleInputs.java

License:Apache License

/**
 * Add a {@link Path} with a custom {@link InputFormat} and
 * {@link Mapper} to the list of inputs for the map-reduce job.
 *
 * @param job The {@link Job}
 * @param namedInput name of the input
 * @param inputFormatClass the name of the InputFormat class to be used for this input
 * @param inputConfigs the configurations to be used for this input
 * @param mapperClass {@link Mapper} class to use for this path
 */
@SuppressWarnings("unchecked")
public static void addInput(Job job, String namedInput, String inputFormatClass,
        Map<String, String> inputConfigs, Class<? extends Mapper> mapperClass) {
    Configuration conf = job.getConfiguration();

    Map<String, MapperInput> map = getInputMap(conf);
    // this shouldn't happen, because it is already protected against in BasicMapReduceContext#addInput
    if (map.containsKey(namedInput)) {
        throw new IllegalArgumentException("Input already configured: " + namedInput);
    }
    map.put(namedInput, new MapperInput(inputFormatClass, inputConfigs, mapperClass));
    conf.set(INPUT_CONFIGS, GSON.toJson(map));

    job.setInputFormatClass(DelegatingInputFormat.class);
}
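
A hedged caller-side sketch for the helper above, assuming a Job named job from a driver such as the ones on this page; the "logs" input name, the config map, and TokenizerMapper are illustrative placeholders, not part of the CDAP API (imports omitted).

// Hypothetical usage; the named input, config values, and TokenizerMapper are
// placeholders chosen for illustration only.
Map<String, String> inputConfigs = Collections.singletonMap("input.path", "/data/logs");
MultipleInputs.addInput(job, "logs",
        TextInputFormat.class.getName(),   // the InputFormat is passed by class name
        inputConfigs,
        TokenizerMapper.class);            // any Mapper subclass for this input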

From source file:co.cask.cdap.template.etl.batch.source.DBSource.java

License:Apache License

@Override
public void prepareRun(BatchSourceContext context) {
    LOG.debug("pluginType = {}; pluginName = {}; connectionString = {}; importQuery = {}; " + "countQuery = {}",
            dbSourceConfig.jdbcPluginType, dbSourceConfig.jdbcPluginName, dbSourceConfig.connectionString,
            dbSourceConfig.importQuery, dbSourceConfig.countQuery);

    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();
    // Load the plugin class to make sure it is available.
    Class<? extends Driver> driverClass = context.loadPluginClass(getJDBCPluginId());
    if (dbSourceConfig.user == null && dbSourceConfig.password == null) {
        DBConfiguration.configureDB(hConf, driverClass.getName(), dbSourceConfig.connectionString);
    } else {
        DBConfiguration.configureDB(hConf, driverClass.getName(), dbSourceConfig.connectionString,
                dbSourceConfig.user, dbSourceConfig.password);
    }
    ETLDBInputFormat.setInput(job, DBRecord.class, dbSourceConfig.importQuery, dbSourceConfig.countQuery);
    job.setInputFormatClass(ETLDBInputFormat.class);
}

From source file:co.cask.cdap.template.etl.batch.source.FileBatchSource.java

License:Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    //SimpleDateFormat needs to be local because it is not threadsafe
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-HH");

    //calculate date one hour ago, rounded down to the nearest hour
    prevHour = new Date(context.getLogicalStartTime() - TimeUnit.HOURS.toMillis(1));
    Calendar cal = Calendar.getInstance();
    cal.setTime(prevHour);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    prevHour = cal.getTime();

    Job job = context.getHadoopJob();
    Configuration conf = job.getConfiguration();
    if (config.fileSystemProperties != null) {
        Map<String, String> properties = GSON.fromJson(config.fileSystemProperties, MAP_STRING_STRING_TYPE);
        for (Map.Entry<String, String> entry : properties.entrySet()) {
            conf.set(entry.getKey(), entry.getValue());
        }
    }

    if (config.fileRegex != null) {
        conf.set(INPUT_REGEX_CONFIG, config.fileRegex);
    }
    conf.set(INPUT_NAME_CONFIG, config.path);

    if (config.timeTable != null) {
        table = context.getDataset(config.timeTable);
        datesToRead = Bytes.toString(table.read(LAST_TIME_READ));
        if (datesToRead == null) {
            List<Date> firstRun = Lists.newArrayList(new Date(0));
            datesToRead = GSON.toJson(firstRun, ARRAYLIST_DATE_TYPE);
        }
        List<Date> attempted = Lists.newArrayList(prevHour);
        String updatedDatesToRead = GSON.toJson(attempted, ARRAYLIST_DATE_TYPE);
        if (!updatedDatesToRead.equals(datesToRead)) {
            table.write(LAST_TIME_READ, updatedDatesToRead);
        }
        conf.set(LAST_TIME_READ, datesToRead);
    }

    conf.set(CUTOFF_READ_TIME, dateFormat.format(prevHour));
    if (!Strings.isNullOrEmpty(config.inputFormatClass)) {
        ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
        Class<? extends FileInputFormat> classType = (Class<? extends FileInputFormat>) classLoader
                .loadClass(config.inputFormatClass);
        job.setInputFormatClass(classType);
    } else {
        job.setInputFormatClass(CombineTextInputFormat.class);
    }
    FileInputFormat.setInputPathFilter(job, BatchFileFilter.class);
    FileInputFormat.addInputPath(job, new Path(config.path));
    long maxSplitSize;
    try {
        maxSplitSize = Long.parseLong(config.maxSplitSize);
    } catch (NumberFormatException e) {
        maxSplitSize = DEFAULT_SPLIT_SIZE;
    }
    CombineTextInputFormat.setMaxInputSplitSize(job, maxSplitSize);
}

From source file:co.nubetech.apache.hadoop.DataDrivenDBInputFormat.java

License:Apache License

/**
 * Note that the "orderBy" column is called the "splitBy" in this version.
 * We reuse the same field, but it's not strictly ordering it -- just
 * partitioning the results.
 */
public static void setInput(Job job, Class<? extends DBWritable> inputClass, String tableName,
        String conditions, String splitBy, String... fieldNames) {
    DBInputFormat.setInput(job, inputClass, tableName, conditions, splitBy, fieldNames);
    job.setInputFormatClass(DataDrivenDBInputFormat.class);
}

From source file:co.nubetech.apache.hadoop.DataDrivenDBInputFormat.java

License:Apache License

/**
 * setInput() takes a custom query and a separate "bounding query" to use
 * instead of the custom "count query" used by DBInputFormat.
 */
public static void setInput(Job job, Class<? extends DBWritable> inputClass, String inputQuery,
        String inputBoundingQuery) {
    DBInputFormat.setInput(job, inputClass, inputQuery, "");
    job.getConfiguration().set(DBConfiguration.INPUT_BOUNDING_QUERY, inputBoundingQuery);
    job.setInputFormatClass(DataDrivenDBInputFormat.class);
}

From source file:co.nubetech.apache.hadoop.DBInputFormat.java

License:Apache License

/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job
 *            The map-reduce job
 * @param inputClass
 *            the class object implementing DBWritable, which is the Java
 *            object holding tuple fields.
 * @param tableName
 *            The table to read data from
 * @param conditions
 *            The condition which to select data with, eg. '(updated >
 *            20070101 AND length > 0)'
 * @param orderBy
 *            the fieldNames in the orderBy clause.
 * @param fieldNames
 *            The field names in the table
 * @see #setInput(Job, Class, String, String)
 */
public static void setInput(Job job, Class<? extends DBWritable> inputClass, String tableName,
        String conditions, String orderBy, String... fieldNames) {
    job.setInputFormatClass(DBInputFormat.class);
    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
    dbConf.setInputClass(inputClass);
    dbConf.setInputTableName(tableName);
    dbConf.setInputFieldNames(fieldNames);
    dbConf.setInputConditions(conditions);
    dbConf.setInputOrderBy(orderBy);
}
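
A hedged caller-side sketch for the table-based overload above, reusing the example condition from the javadoc; the job variable, MyRecord (a DBWritable implementation), and the table and field names are hypothetical.

// MyRecord is a hypothetical class implementing DBWritable; table and field
// names are illustrative only.
DBInputFormat.setInput(job, MyRecord.class,
        "Mytable",                               // tableName
        "(updated > 20070101 AND length > 0)",   // conditions, as in the javadoc example
        "f1",                                    // orderBy field
        "f1", "f2", "f3");                       // fieldNames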

From source file:co.nubetech.apache.hadoop.DBInputFormat.java

License:Apache License

/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job
 *            The map-reduce job
 * @param inputClass
 *            the class object implementing DBWritable, which is the Java
 *            object holding tuple fields.
 * @param inputQuery
 *            the input query to select fields. Example :
 *            "SELECT f1, f2, f3 FROM Mytable ORDER BY f1"
 * @param inputCountQuery
 *            the input query that returns the number of records in the
 *            table. Example : "SELECT COUNT(f1) FROM Mytable"
 * @see #setInput(Job, Class, String, String, String, String...)
 */
public static void setInput(Job job, Class<? extends DBWritable> inputClass, String inputQuery,
        String inputCountQuery) {
    job.setInputFormatClass(DBInputFormat.class);
    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
    dbConf.setInputClass(inputClass);
    dbConf.setInputQuery(inputQuery);
    dbConf.setInputCountQuery(inputCountQuery);
}
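
And a matching sketch for the query-based overload, using the example queries from the javadoc; job and MyRecord are again hypothetical.

// Both queries are taken from the javadoc example above.
DBInputFormat.setInput(job, MyRecord.class,
        "SELECT f1, f2, f3 FROM Mytable ORDER BY f1",   // inputQuery
        "SELECT COUNT(f1) FROM Mytable");               // inputCountQuery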

From source file:co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.java

License:Apache License

/**
 * Note that the "orderBy" column is called the "splitBy" in this version.
 * We reuse the same field, but it's not strictly ordering it -- just
 * partitioning the results.
 */
public static void setInput(Job job, String tableName, String conditions, String splitBy, ArrayList params,
        String... fieldNames) throws IOException {
    DBInputFormat.setInput(job, GenericDBWritable.class, tableName, conditions, splitBy, fieldNames);
    if (params != null) {
        DefaultStringifier<ArrayList> stringifier = new DefaultStringifier<ArrayList>(job.getConfiguration(),
                ArrayList.class);
        job.getConfiguration().set(HIHOConf.QUERY_PARAMS, stringifier.toString(params));
        logger.debug("Converted params and saved them into config");
    }
    job.setInputFormatClass(DBQueryInputFormat.class);
}

From source file:co.nubetech.hiho.dedup.DedupJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    populateConfiguration(args);
    try {
        checkMandatoryConfs();
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }
    Job job = new Job(conf);
    job.setJobName("Dedup job");
    job.setJarByClass(DedupJob.class);

    Class inputFormatClass = Class.forName(inputFormat);
    Class outputFormatClass = Class.forName(outputFormat);
    Class inputKeyClass = Class.forName(inputKeyClassName);
    Class inputValueClass = Class.forName(inputValueClassName);

    if (dedupBy.equals("key")) {
        job.setMapperClass(DedupKeyMapper.class);
        job.setReducerClass(DedupKeyReducer.class);
        job.setMapOutputValueClass(inputValueClass);
    } else if (dedupBy.equals("value")) {
        job.setMapperClass(DedupValueMapper.class);
        job.setReducerClass(DedupValueReducer.class);
        job.setMapOutputValueClass(inputKeyClass);
    }

    job.setInputFormatClass(inputFormatClass);
    if (inputFormat.equals("co.nubetech.hiho.dedup.DelimitedTextInputFormat")) {
        DelimitedTextInputFormat.setProperties(job, delimiter, column);
    }

    job.setMapOutputKeyClass(HihoTuple.class);

    job.setOutputKeyClass(inputKeyClass);
    job.setOutputValueClass(inputValueClass);
    job.setPartitionerClass(HihoHashPartitioner.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setOutputFormatClass(outputFormatClass);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    try {
        logger.debug("Output format class is " + job.getOutputFormatClass());
        logger.debug("Class is " + ReflectionUtils
                .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
        job.waitForCompletion(false);
        if (job.isComplete()) {
            Counters counters = job.getCounters();
            totalRecordsRead = counters.findCounter(DedupRecordCounter.TOTAL_RECORDS_READ).getValue();
            badRecords = counters.findCounter(DedupRecordCounter.BAD_RECORD).getValue();
            output = counters.findCounter(DedupRecordCounter.OUTPUT).getValue();
            duplicateRecords = totalRecordsRead - output;
            logger.info("Total records read are: " + totalRecordsRead);
            logger.info("Bad Records are: " + badRecords);
            logger.info("Output records are: " + output);
            logger.info("Duplicate records are: " + duplicateRecords);
        }

    } catch (Exception e) {
        e.printStackTrace();
    }
    return 0;
}