List of usage examples for org.apache.hadoop.mapreduce.Job.setInputFormatClass
public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException
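Before the per-project examples below, here is a minimal sketch of the call in an ordinary driver, using the built-in TextInputFormat; the class name SetInputFormatExample and the input path argument are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "set-input-format-example");
        // Must be called while the job is still being defined; once the job
        // has been submitted, the call throws IllegalStateException.
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // ... set mapper, reducer, output format, etc., then submit.
    }
}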
From source file: co.cask.cdap.internal.app.runtime.batch.dataset.DataSetInputFormat.java
License: Apache License

public static void setInput(Job job, String inputDatasetName) {
    job.setInputFormatClass(DataSetInputFormat.class);
    job.getConfiguration().set(DataSetInputFormat.HCONF_ATTR_INPUT_DATASET, inputDatasetName);
}
From source file: co.cask.cdap.internal.app.runtime.batch.dataset.input.MultipleInputs.java
License: Apache License

/**
 * Add a {@link Path} with a custom {@link InputFormat} and {@link Mapper}
 * to the list of inputs for the map-reduce job.
 *
 * @param job The {@link Job}
 * @param namedInput name of the input
 * @param inputFormatClass the name of the InputFormat class to be used for this input
 * @param inputConfigs the configurations to be used for this input
 * @param mapperClass {@link Mapper} class to use for this path
 */
@SuppressWarnings("unchecked")
public static void addInput(Job job, String namedInput, String inputFormatClass,
        Map<String, String> inputConfigs, Class<? extends Mapper> mapperClass) {
    Configuration conf = job.getConfiguration();
    Map<String, MapperInput> map = getInputMap(conf);
    // this shouldn't happen, because it is already protected against in BasicMapReduceContext#addInput
    if (map.containsKey(namedInput)) {
        throw new IllegalArgumentException("Input already configured: " + namedInput);
    }
    map.put(namedInput, new MapperInput(inputFormatClass, inputConfigs, mapperClass));
    conf.set(INPUT_CONFIGS, GSON.toJson(map));
    job.setInputFormatClass(DelegatingInputFormat.class);
}
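A hypothetical call site for the helper above, assuming a configured Job named job, a mapper class EventMapper, and the input name "events" (all illustrative, not taken from the CDAP codebase):

// Illustrative per-input configuration; the key shown is the standard
// FileInputFormat input-directory property.
Map<String, String> inputConfigs = new HashMap<>();
inputConfigs.put("mapreduce.input.fileinputformat.inputdir", "/data/events");
MultipleInputs.addInput(job, "events",
        org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class.getName(),
        inputConfigs, EventMapper.class);
// After this call the job's effective input format is DelegatingInputFormat,
// which dispatches to the per-input format and mapper at runtime.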
From source file: co.cask.cdap.template.etl.batch.source.DBSource.java
License: Apache License

@Override
public void prepareRun(BatchSourceContext context) {
    LOG.debug("pluginType = {}; pluginName = {}; connectionString = {}; importQuery = {}; countQuery = {}",
            dbSourceConfig.jdbcPluginType, dbSourceConfig.jdbcPluginName,
            dbSourceConfig.connectionString, dbSourceConfig.importQuery, dbSourceConfig.countQuery);
    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();
    // Load the plugin class to make sure it is available.
    Class<? extends Driver> driverClass = context.loadPluginClass(getJDBCPluginId());
    if (dbSourceConfig.user == null && dbSourceConfig.password == null) {
        DBConfiguration.configureDB(hConf, driverClass.getName(), dbSourceConfig.connectionString);
    } else {
        DBConfiguration.configureDB(hConf, driverClass.getName(), dbSourceConfig.connectionString,
                dbSourceConfig.user, dbSourceConfig.password);
    }
    ETLDBInputFormat.setInput(job, DBRecord.class, dbSourceConfig.importQuery, dbSourceConfig.countQuery);
    job.setInputFormatClass(ETLDBInputFormat.class);
}
From source file: co.cask.cdap.template.etl.batch.source.FileBatchSource.java
License: Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    // SimpleDateFormat needs to be local because it is not threadsafe
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-HH");
    // calculate date one hour ago, rounded down to the nearest hour
    prevHour = new Date(context.getLogicalStartTime() - TimeUnit.HOURS.toMillis(1));
    Calendar cal = Calendar.getInstance();
    cal.setTime(prevHour);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    prevHour = cal.getTime();

    Job job = context.getHadoopJob();
    Configuration conf = job.getConfiguration();
    if (config.fileSystemProperties != null) {
        Map<String, String> properties = GSON.fromJson(config.fileSystemProperties, MAP_STRING_STRING_TYPE);
        for (Map.Entry<String, String> entry : properties.entrySet()) {
            conf.set(entry.getKey(), entry.getValue());
        }
    }
    if (config.fileRegex != null) {
        conf.set(INPUT_REGEX_CONFIG, config.fileRegex);
    }
    conf.set(INPUT_NAME_CONFIG, config.path);

    if (config.timeTable != null) {
        table = context.getDataset(config.timeTable);
        datesToRead = Bytes.toString(table.read(LAST_TIME_READ));
        if (datesToRead == null) {
            List<Date> firstRun = Lists.newArrayList(new Date(0));
            datesToRead = GSON.toJson(firstRun, ARRAYLIST_DATE_TYPE);
        }
        List<Date> attempted = Lists.newArrayList(prevHour);
        String updatedDatesToRead = GSON.toJson(attempted, ARRAYLIST_DATE_TYPE);
        if (!updatedDatesToRead.equals(datesToRead)) {
            table.write(LAST_TIME_READ, updatedDatesToRead);
        }
        conf.set(LAST_TIME_READ, datesToRead);
    }
    conf.set(CUTOFF_READ_TIME, dateFormat.format(prevHour));

    if (!Strings.isNullOrEmpty(config.inputFormatClass)) {
        ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
        Class<? extends FileInputFormat> classType =
                (Class<? extends FileInputFormat>) classLoader.loadClass(config.inputFormatClass);
        job.setInputFormatClass(classType);
    } else {
        job.setInputFormatClass(CombineTextInputFormat.class);
    }
    FileInputFormat.setInputPathFilter(job, BatchFileFilter.class);
    FileInputFormat.addInputPath(job, new Path(config.path));
    long maxSplitSize;
    try {
        maxSplitSize = Long.parseLong(config.maxSplitSize);
    } catch (NumberFormatException e) {
        maxSplitSize = DEFAULT_SPLIT_SIZE;
    }
    CombineTextInputFormat.setMaxInputSplitSize(job, maxSplitSize);
}
From source file: co.nubetech.apache.hadoop.DataDrivenDBInputFormat.java
License: Apache License

/**
 * Note that the "orderBy" column is called the "splitBy" in this version.
 * We reuse the same field, but it's not strictly ordering it -- just
 * partitioning the results.
 */
public static void setInput(Job job, Class<? extends DBWritable> inputClass, String tableName,
        String conditions, String splitBy, String... fieldNames) {
    DBInputFormat.setInput(job, inputClass, tableName, conditions, splitBy, fieldNames);
    job.setInputFormatClass(DataDrivenDBInputFormat.class);
}
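A hypothetical call to the helper above, assuming a configured Job named job and a DBWritable implementation EmployeeRecord; the table and column names are illustrative:

DataDrivenDBInputFormat.setInput(job, EmployeeRecord.class,
        "employees",              // table to read from
        "active = 1",             // WHERE conditions
        "id",                     // splitBy column used to partition the results
        "id", "name", "salary");  // fields to select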
From source file: co.nubetech.apache.hadoop.DataDrivenDBInputFormat.java
License: Apache License

/**
 * setInput() takes a custom query and a separate "bounding query" to use
 * instead of the custom "count query" used by DBInputFormat.
 */
public static void setInput(Job job, Class<? extends DBWritable> inputClass, String inputQuery,
        String inputBoundingQuery) {
    DBInputFormat.setInput(job, inputClass, inputQuery, "");
    job.getConfiguration().set(DBConfiguration.INPUT_BOUNDING_QUERY, inputBoundingQuery);
    job.setInputFormatClass(DataDrivenDBInputFormat.class);
}
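A hypothetical call, reusing the illustrative job and EmployeeRecord names from the sketch above; the bounding query is expected to return the low and high values of the split column so the key space can be partitioned into ranges (both queries are illustrative):

DataDrivenDBInputFormat.setInput(job, EmployeeRecord.class,
        "SELECT id, name, salary FROM employees WHERE active = 1",
        "SELECT MIN(id), MAX(id) FROM employees WHERE active = 1");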
From source file: co.nubetech.apache.hadoop.DBInputFormat.java
License: Apache License

/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job the map-reduce job
 * @param inputClass the class object implementing DBWritable, which is the
 *        Java object holding tuple fields
 * @param tableName the table to read data from
 * @param conditions the condition with which to select data, e.g.
 *        '(updated > 20070101 AND length > 0)'
 * @param orderBy the field names in the orderBy clause
 * @param fieldNames the field names in the table
 * @see #setInput(Job, Class, String, String)
 */
public static void setInput(Job job, Class<? extends DBWritable> inputClass, String tableName,
        String conditions, String orderBy, String... fieldNames) {
    job.setInputFormatClass(DBInputFormat.class);
    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
    dbConf.setInputClass(inputClass);
    dbConf.setInputTableName(tableName);
    dbConf.setInputFieldNames(fieldNames);
    dbConf.setInputConditions(conditions);
    dbConf.setInputOrderBy(orderBy);
}
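A hypothetical call to this table-based variant, again with the illustrative job and EmployeeRecord names:

DBInputFormat.setInput(job, EmployeeRecord.class,
        "employees",               // table name
        "(updated > 20070101)",    // conditions
        "id",                      // orderBy
        "id", "name", "updated");  // field names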
From source file: co.nubetech.apache.hadoop.DBInputFormat.java
License: Apache License

/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job the map-reduce job
 * @param inputClass the class object implementing DBWritable, which is the
 *        Java object holding tuple fields
 * @param inputQuery the input query to select fields. Example:
 *        "SELECT f1, f2, f3 FROM Mytable ORDER BY f1"
 * @param inputCountQuery the input query that returns the number of records
 *        in the table. Example: "SELECT COUNT(f1) FROM Mytable"
 * @see #setInput(Job, Class, String, String, String, String...)
 */
public static void setInput(Job job, Class<? extends DBWritable> inputClass, String inputQuery,
        String inputCountQuery) {
    job.setInputFormatClass(DBInputFormat.class);
    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
    dbConf.setInputClass(inputClass);
    dbConf.setInputQuery(inputQuery);
    dbConf.setInputCountQuery(inputCountQuery);
}
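This query-based variant pairs the row-selecting query with a count query used to size the input; a hypothetical call with illustrative queries:

DBInputFormat.setInput(job, EmployeeRecord.class,
        "SELECT id, name, salary FROM employees ORDER BY id",
        "SELECT COUNT(id) FROM employees");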
From source file: co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.java
License: Apache License

/**
 * Note that the "orderBy" column is called the "splitBy" in this version.
 * We reuse the same field, but it's not strictly ordering it -- just
 * partitioning the results.
 */
public static void setInput(Job job, String tableName, String conditions, String splitBy,
        ArrayList params, String... fieldNames) throws IOException {
    DBInputFormat.setInput(job, GenericDBWritable.class, tableName, conditions, splitBy, fieldNames);
    if (params != null) {
        DefaultStringifier<ArrayList> stringifier =
                new DefaultStringifier<ArrayList>(job.getConfiguration(), ArrayList.class);
        job.getConfiguration().set(HIHOConf.QUERY_PARAMS, stringifier.toString(params));
        logger.debug("Converted params and saved them into config");
    }
    job.setInputFormatClass(DBQueryInputFormat.class);
}
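A hypothetical call; the table, columns, and parameter value are illustrative, and the idea that params are bound to '?' placeholders in the conditions is an assumption about hiho's behavior, not something the excerpt above confirms:

ArrayList params = new ArrayList();
params.add(20070101);  // assumed to be bound to the '?' placeholder below
DBQueryInputFormat.setInput(job, "employees", "updated > ?", "id",
        params, "id", "name", "updated");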
From source file: co.nubetech.hiho.dedup.DedupJob.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    populateConfiguration(args);
    try {
        checkMandatoryConfs();
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }

    Job job = new Job(conf);
    job.setJobName("Dedup job");
    job.setJarByClass(DedupJob.class);

    Class inputFormatClass = Class.forName(inputFormat);
    Class outputFormatClass = Class.forName(outputFormat);
    Class inputKeyClass = Class.forName(inputKeyClassName);
    Class inputValueClass = Class.forName(inputValueClassName);

    if (dedupBy.equals("key")) {
        job.setMapperClass(DedupKeyMapper.class);
        job.setReducerClass(DedupKeyReducer.class);
        job.setMapOutputValueClass(inputValueClass);
    } else if (dedupBy.equals("value")) {
        job.setMapperClass(DedupValueMapper.class);
        job.setReducerClass(DedupValueReducer.class);
        job.setMapOutputValueClass(inputKeyClass);
    }

    job.setInputFormatClass(inputFormatClass);
    if (inputFormat.equals("co.nubetech.hiho.dedup.DelimitedTextInputFormat")) {
        DelimitedTextInputFormat.setProperties(job, delimiter, column);
    }
    job.setMapOutputKeyClass(HihoTuple.class);
    job.setOutputKeyClass(inputKeyClass);
    job.setOutputValueClass(inputValueClass);
    job.setPartitionerClass(HihoHashPartitioner.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setOutputFormatClass(outputFormatClass);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    try {
        logger.debug("Output format class is " + job.getOutputFormatClass());
        logger.debug("Class is " + ReflectionUtils
                .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
        job.waitForCompletion(false);
        if (job.isComplete()) {
            Counters counters = job.getCounters();
            totalRecordsRead = counters.findCounter(DedupRecordCounter.TOTAL_RECORDS_READ).getValue();
            badRecords = counters.findCounter(DedupRecordCounter.BAD_RECORD).getValue();
            output = counters.findCounter(DedupRecordCounter.OUTPUT).getValue();
            duplicateRecords = totalRecordsRead - output;
            logger.info("Total records read are: " + totalRecordsRead);
            logger.info("Bad Records are: " + badRecords);
            logger.info("Output records are: " + output);
            logger.info("Duplicate records are: " + duplicateRecords);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return 0;
}