List of usage examples for org.apache.hadoop.mapreduce.Job.setInputFormatClass
public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException
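Before the per-project examples below, here is a minimal sketch of the call in an ordinary driver, using the built-in TextInputFormat; the class name SetInputFormatExample and the input path argument are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "set-input-format-example");
        // Must be called while the job is still being defined; once the job
        // has been submitted, the call throws IllegalStateException.
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // ... set mapper, reducer, output format, etc., then submit.
    }
}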
From source file: co.cask.cdap.internal.app.runtime.batch.dataset.DataSetInputFormat.java
License: Apache License

public static void setInput(Job job, String inputDatasetName) {
    job.setInputFormatClass(DataSetInputFormat.class);
    job.getConfiguration().set(DataSetInputFormat.HCONF_ATTR_INPUT_DATASET, inputDatasetName);
}
From source file: co.cask.cdap.internal.app.runtime.batch.dataset.input.MultipleInputs.java
License: Apache License

/**
 * Add a {@link Path} with a custom {@link InputFormat} and {@link Mapper}
 * to the list of inputs for the map-reduce job.
 *
 * @param job The {@link Job}
 * @param namedInput name of the input
 * @param inputFormatClass the name of the InputFormat class to be used for this input
 * @param inputConfigs the configurations to be used for this input
 * @param mapperClass {@link Mapper} class to use for this path
 */
@SuppressWarnings("unchecked")
public static void addInput(Job job, String namedInput, String inputFormatClass,
        Map<String, String> inputConfigs, Class<? extends Mapper> mapperClass) {
    Configuration conf = job.getConfiguration();
    Map<String, MapperInput> map = getInputMap(conf);
    // this shouldn't happen, because it is already protected against in BasicMapReduceContext#addInput
    if (map.containsKey(namedInput)) {
        throw new IllegalArgumentException("Input already configured: " + namedInput);
    }
    map.put(namedInput, new MapperInput(inputFormatClass, inputConfigs, mapperClass));
    conf.set(INPUT_CONFIGS, GSON.toJson(map));
    job.setInputFormatClass(DelegatingInputFormat.class);
}
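A hypothetical call site for the helper above, assuming a configured Job named job, a mapper class EventMapper, and the input name "events" (all illustrative, not taken from the CDAP codebase):

// Illustrative per-input configuration; the key shown is the standard
// FileInputFormat input-directory property.
Map<String, String> inputConfigs = new HashMap<>();
inputConfigs.put("mapreduce.input.fileinputformat.inputdir", "/data/events");
MultipleInputs.addInput(job, "events",
        org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class.getName(),
        inputConfigs, EventMapper.class);
// After this call the job's effective input format is DelegatingInputFormat,
// which dispatches to the per-input format and mapper at runtime.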
From source file: co.cask.cdap.template.etl.batch.source.DBSource.java
License: Apache License

@Override
public void prepareRun(BatchSourceContext context) {
    LOG.debug("pluginType = {}; pluginName = {}; connectionString = {}; importQuery = {}; countQuery = {}",
            dbSourceConfig.jdbcPluginType, dbSourceConfig.jdbcPluginName,
            dbSourceConfig.connectionString, dbSourceConfig.importQuery, dbSourceConfig.countQuery);
    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();
    // Load the plugin class to make sure it is available.
    Class<? extends Driver> driverClass = context.loadPluginClass(getJDBCPluginId());
    if (dbSourceConfig.user == null && dbSourceConfig.password == null) {
        DBConfiguration.configureDB(hConf, driverClass.getName(), dbSourceConfig.connectionString);
    } else {
        DBConfiguration.configureDB(hConf, driverClass.getName(), dbSourceConfig.connectionString,
                dbSourceConfig.user, dbSourceConfig.password);
    }
    ETLDBInputFormat.setInput(job, DBRecord.class, dbSourceConfig.importQuery, dbSourceConfig.countQuery);
    job.setInputFormatClass(ETLDBInputFormat.class);
}
From source file: co.cask.cdap.template.etl.batch.source.FileBatchSource.java
License: Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    // SimpleDateFormat needs to be local because it is not threadsafe
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-HH");
    // calculate date one hour ago, rounded down to the nearest hour
    prevHour = new Date(context.getLogicalStartTime() - TimeUnit.HOURS.toMillis(1));
    Calendar cal = Calendar.getInstance();
    cal.setTime(prevHour);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    prevHour = cal.getTime();

    Job job = context.getHadoopJob();
    Configuration conf = job.getConfiguration();
    if (config.fileSystemProperties != null) {
        Map<String, String> properties = GSON.fromJson(config.fileSystemProperties, MAP_STRING_STRING_TYPE);
        for (Map.Entry<String, String> entry : properties.entrySet()) {
            conf.set(entry.getKey(), entry.getValue());
        }
    }
    if (config.fileRegex != null) {
        conf.set(INPUT_REGEX_CONFIG, config.fileRegex);
    }
    conf.set(INPUT_NAME_CONFIG, config.path);

    if (config.timeTable != null) {
        table = context.getDataset(config.timeTable);
        datesToRead = Bytes.toString(table.read(LAST_TIME_READ));
        if (datesToRead == null) {
            List<Date> firstRun = Lists.newArrayList(new Date(0));
            datesToRead = GSON.toJson(firstRun, ARRAYLIST_DATE_TYPE);
        }
        List<Date> attempted = Lists.newArrayList(prevHour);
        String updatedDatesToRead = GSON.toJson(attempted, ARRAYLIST_DATE_TYPE);
        if (!updatedDatesToRead.equals(datesToRead)) {
            table.write(LAST_TIME_READ, updatedDatesToRead);
        }
        conf.set(LAST_TIME_READ, datesToRead);
    }
    conf.set(CUTOFF_READ_TIME, dateFormat.format(prevHour));

    if (!Strings.isNullOrEmpty(config.inputFormatClass)) {
        ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
        Class<? extends FileInputFormat> classType =
                (Class<? extends FileInputFormat>) classLoader.loadClass(config.inputFormatClass);
        job.setInputFormatClass(classType);
    } else {
        job.setInputFormatClass(CombineTextInputFormat.class);
    }
    FileInputFormat.setInputPathFilter(job, BatchFileFilter.class);
    FileInputFormat.addInputPath(job, new Path(config.path));
    long maxSplitSize;
    try {
        maxSplitSize = Long.parseLong(config.maxSplitSize);
    } catch (NumberFormatException e) {
        maxSplitSize = DEFAULT_SPLIT_SIZE;
    }
    CombineTextInputFormat.setMaxInputSplitSize(job, maxSplitSize);
}
From source file: co.nubetech.apache.hadoop.DataDrivenDBInputFormat.java
License: Apache License

/**
 * Note that the "orderBy" column is called the "splitBy" in this version.
 * We reuse the same field, but it's not strictly ordering it -- just
 * partitioning the results.
 */
public static void setInput(Job job, Class<? extends DBWritable> inputClass, String tableName,
        String conditions, String splitBy, String... fieldNames) {
    DBInputFormat.setInput(job, inputClass, tableName, conditions, splitBy, fieldNames);
    job.setInputFormatClass(DataDrivenDBInputFormat.class);
}
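A hypothetical call to the helper above, assuming a configured Job named job and a DBWritable implementation EmployeeRecord; the table and column names are illustrative:

DataDrivenDBInputFormat.setInput(job, EmployeeRecord.class,
        "employees",              // table to read from
        "active = 1",             // WHERE conditions
        "id",                     // splitBy column used to partition the results
        "id", "name", "salary");  // fields to select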
From source file: co.nubetech.apache.hadoop.DataDrivenDBInputFormat.java
License: Apache License

/**
 * setInput() takes a custom query and a separate "bounding query" to use
 * instead of the custom "count query" used by DBInputFormat.
 */
public static void setInput(Job job, Class<? extends DBWritable> inputClass, String inputQuery,
        String inputBoundingQuery) {
    DBInputFormat.setInput(job, inputClass, inputQuery, "");
    job.getConfiguration().set(DBConfiguration.INPUT_BOUNDING_QUERY, inputBoundingQuery);
    job.setInputFormatClass(DataDrivenDBInputFormat.class);
}
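A hypothetical call, reusing the illustrative job and EmployeeRecord names from the sketch above; the bounding query is expected to return the low and high values of the split column so the key space can be partitioned into ranges (both queries are illustrative):

DataDrivenDBInputFormat.setInput(job, EmployeeRecord.class,
        "SELECT id, name, salary FROM employees WHERE active = 1",
        "SELECT MIN(id), MAX(id) FROM employees WHERE active = 1");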
From source file: co.nubetech.apache.hadoop.DBInputFormat.java
License: Apache License

/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job the map-reduce job
 * @param inputClass the class object implementing DBWritable, which is the
 *        Java object holding tuple fields
 * @param tableName the table to read data from
 * @param conditions the condition with which to select data, e.g.
 *        '(updated > 20070101 AND length > 0)'
 * @param orderBy the field names in the orderBy clause
 * @param fieldNames the field names in the table
 * @see #setInput(Job, Class, String, String)
 */
public static void setInput(Job job, Class<? extends DBWritable> inputClass, String tableName,
        String conditions, String orderBy, String... fieldNames) {
    job.setInputFormatClass(DBInputFormat.class);
    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
    dbConf.setInputClass(inputClass);
    dbConf.setInputTableName(tableName);
    dbConf.setInputFieldNames(fieldNames);
    dbConf.setInputConditions(conditions);
    dbConf.setInputOrderBy(orderBy);
}
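A hypothetical call to this table-based variant, again with the illustrative job and EmployeeRecord names:

DBInputFormat.setInput(job, EmployeeRecord.class,
        "employees",               // table name
        "(updated > 20070101)",    // conditions
        "id",                      // orderBy
        "id", "name", "updated");  // field names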
From source file: co.nubetech.apache.hadoop.DBInputFormat.java
License: Apache License

/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job the map-reduce job
 * @param inputClass the class object implementing DBWritable, which is the
 *        Java object holding tuple fields
 * @param inputQuery the input query to select fields. Example:
 *        "SELECT f1, f2, f3 FROM Mytable ORDER BY f1"
 * @param inputCountQuery the input query that returns the number of records
 *        in the table. Example: "SELECT COUNT(f1) FROM Mytable"
 * @see #setInput(Job, Class, String, String, String, String...)
 */
public static void setInput(Job job, Class<? extends DBWritable> inputClass, String inputQuery,
        String inputCountQuery) {
    job.setInputFormatClass(DBInputFormat.class);
    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
    dbConf.setInputClass(inputClass);
    dbConf.setInputQuery(inputQuery);
    dbConf.setInputCountQuery(inputCountQuery);
}
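This query-based variant pairs the row-selecting query with a count query used to size the input; a hypothetical call with illustrative queries:

DBInputFormat.setInput(job, EmployeeRecord.class,
        "SELECT id, name, salary FROM employees ORDER BY id",
        "SELECT COUNT(id) FROM employees");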
From source file: co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.java
License: Apache License

/**
 * Note that the "orderBy" column is called the "splitBy" in this version.
 * We reuse the same field, but it's not strictly ordering it -- just
 * partitioning the results.
 */
public static void setInput(Job job, String tableName, String conditions, String splitBy,
        ArrayList params, String... fieldNames) throws IOException {
    DBInputFormat.setInput(job, GenericDBWritable.class, tableName, conditions, splitBy, fieldNames);
    if (params != null) {
        DefaultStringifier<ArrayList> stringifier =
                new DefaultStringifier<ArrayList>(job.getConfiguration(), ArrayList.class);
        job.getConfiguration().set(HIHOConf.QUERY_PARAMS, stringifier.toString(params));
        logger.debug("Converted params and saved them into config");
    }
    job.setInputFormatClass(DBQueryInputFormat.class);
}
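A hypothetical call; the table, columns, and parameter value are illustrative, and the idea that params are bound to '?' placeholders in the conditions is an assumption about hiho's behavior, not something the excerpt above confirms:

ArrayList params = new ArrayList();
params.add(20070101);  // assumed to be bound to the '?' placeholder below
DBQueryInputFormat.setInput(job, "employees", "updated > ?", "id",
        params, "id", "name", "updated");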
From source file: co.nubetech.hiho.dedup.DedupJob.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    populateConfiguration(args);
    try {
        checkMandatoryConfs();
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }

    Job job = new Job(conf);
    job.setJobName("Dedup job");
    job.setJarByClass(DedupJob.class);

    Class inputFormatClass = Class.forName(inputFormat);
    Class outputFormatClass = Class.forName(outputFormat);
    Class inputKeyClass = Class.forName(inputKeyClassName);
    Class inputValueClass = Class.forName(inputValueClassName);

    if (dedupBy.equals("key")) {
        job.setMapperClass(DedupKeyMapper.class);
        job.setReducerClass(DedupKeyReducer.class);
        job.setMapOutputValueClass(inputValueClass);
    } else if (dedupBy.equals("value")) {
        job.setMapperClass(DedupValueMapper.class);
        job.setReducerClass(DedupValueReducer.class);
        job.setMapOutputValueClass(inputKeyClass);
    }

    job.setInputFormatClass(inputFormatClass);
    if (inputFormat.equals("co.nubetech.hiho.dedup.DelimitedTextInputFormat")) {
        DelimitedTextInputFormat.setProperties(job, delimiter, column);
    }
    job.setMapOutputKeyClass(HihoTuple.class);
    job.setOutputKeyClass(inputKeyClass);
    job.setOutputValueClass(inputValueClass);
    job.setPartitionerClass(HihoHashPartitioner.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setOutputFormatClass(outputFormatClass);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    try {
        logger.debug("Output format class is " + job.getOutputFormatClass());
        logger.debug("Class is " + ReflectionUtils
                .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
        job.waitForCompletion(false);
        if (job.isComplete()) {
            Counters counters = job.getCounters();
            totalRecordsRead = counters.findCounter(DedupRecordCounter.TOTAL_RECORDS_READ).getValue();
            badRecords = counters.findCounter(DedupRecordCounter.BAD_RECORD).getValue();
            output = counters.findCounter(DedupRecordCounter.OUTPUT).getValue();
            duplicateRecords = totalRecordsRead - output;
            logger.info("Total records read are: " + totalRecordsRead);
            logger.info("Bad Records are: " + badRecords);
            logger.info("Output records are: " + output);
            logger.info("Duplicate records are: " + duplicateRecords);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return 0;
}