Usage examples for org.apache.hadoop.mapreduce.Job#getConfiguration()
public Configuration getConfiguration()
From source file:co.cask.hydrator.plugin.batch.CopybookInputFormat.java
License:Apache License
/**
 * Stores the given binary file path in the job's configuration under
 * {@code COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH}.
 *
 * @param job job whose configuration receives the property
 * @param binaryFile value written to the configuration
 */
public static void setBinaryFilePath(Job job, String binaryFile) {
  job.getConfiguration()
    .set(COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH, binaryFile);
}
From source file:co.cask.hydrator.plugin.batch.CopybookSource.java
License:Apache License
@Override public void prepareRun(BatchSourceContext context) throws IOException { Job job = JobUtils.createInstance(); CopybookInputFormat.setCopybookInputformatCblContents(job, config.copybookContents); CopybookInputFormat.setBinaryFilePath(job, config.binaryFilePath); // Set the input file path for the job CopybookInputFormat.setInputPaths(job, config.binaryFilePath); CopybookInputFormat.setMaxInputSplitSize(job, config.maxSplitSize); context.setInput(Input.of(config.referenceName, new SourceInputFormatProvider(CopybookInputFormat.class, job.getConfiguration()))); }
From source file:co.cask.hydrator.plugin.batch.sink.HiveBatchSink.java
License:Apache License
/**
 * Creates the Hive output format provider for this sink, stores the Hive schema it reports
 * via {@code HiveSchemaStore}, and registers the provider as an output aliased by table name.
 *
 * @param context batch sink context that receives the output
 * @throws Exception declared by the original signature; propagated from the calls below
 */
@Override
public void prepareRun(BatchSinkContext context) throws Exception {
  Job job = JobUtils.createInstance();
  // The job is handed straight to the provider; no separate Configuration handle is needed here.
  HiveSinkOutputFormatProvider sinkOutputFormatProvider = new HiveSinkOutputFormatProvider(job, config);
  HCatSchema hiveSchema = sinkOutputFormatProvider.getHiveSchema();
  HiveSchemaStore.storeHiveSchema(context, config.dbName, config.tableName, hiveSchema);
  context.addOutput(Output.of(config.referenceName, sinkOutputFormatProvider).alias(config.tableName));
}
From source file:co.cask.hydrator.plugin.batch.source.ExcelInputFormat.java
License:Apache License
/**
 * Copies the Excel reader settings into the job's configuration.
 *
 * <p>{@code rowLimit} is only written when it is neither null nor empty; every other value is
 * written unconditionally.
 *
 * @param job job whose configuration is populated
 * @param filePattern value stored under {@code FILE_PATTERN}
 * @param sheetName value stored under {@code SHEET_NAME}
 * @param reprocess value stored under {@code RE_PROCESS}
 * @param sheetNo value stored under {@code SHEET_NO}
 * @param columnList value stored under {@code COLUMN_LIST}
 * @param skipFirstRow value stored under {@code SKIP_FIRST_ROW}
 * @param terminateIfEmptyRow value stored under {@code TERMINATE_IF_EMPTY_ROW}
 * @param rowLimit value stored under {@code ROWS_LIMIT} when non-empty
 * @param ifErrorRecord value stored under {@code IF_ERROR_RECORD}
 * @param processedFiles value stored under {@code PROCESSED_FILES}
 */
public static void setConfigurations(Job job, String filePattern, String sheetName, boolean reprocess,
                                     int sheetNo, String columnList, boolean skipFirstRow,
                                     String terminateIfEmptyRow, String rowLimit, String ifErrorRecord,
                                     String processedFiles) {
  Configuration conf = job.getConfiguration();

  conf.set(FILE_PATTERN, filePattern);
  conf.set(SHEET_NAME, sheetName);
  conf.setBoolean(RE_PROCESS, reprocess);
  conf.setInt(SHEET_NO, sheetNo);
  conf.set(COLUMN_LIST, columnList);
  conf.setBoolean(SKIP_FIRST_ROW, skipFirstRow);
  conf.set(TERMINATE_IF_EMPTY_ROW, terminateIfEmptyRow);

  boolean hasRowLimit = !Strings.isNullOrEmpty(rowLimit);
  if (hasRowLimit) {
    conf.set(ROWS_LIMIT, rowLimit);
  }

  conf.set(IF_ERROR_RECORD, ifErrorRecord);
  conf.set(PROCESSED_FILES, processedFiles);
}
From source file:co.cask.hydrator.plugin.batch.source.ExcelInputReader.java
License:Apache License
@Override public void prepareRun(BatchSourceContext batchSourceContext) throws Exception { Job job = JobUtils.createInstance(); String processFiles = GSON.toJson(getAllProcessedFiles(batchSourceContext), ARRAYLIST_PREPROCESSED_FILES); ExcelInputFormat.setConfigurations(job, excelInputreaderConfig.filePattern, excelInputreaderConfig.sheetName, excelInputreaderConfig.reprocess, excelInputreaderConfig.sheetNo, excelInputreaderConfig.columnList, excelInputreaderConfig.skipFirstRow, excelInputreaderConfig.terminateIfEmptyRow, excelInputreaderConfig.rowsLimit, excelInputreaderConfig.ifErrorRecord, processFiles); // Sets the input path(s). ExcelInputFormat.addInputPaths(job, excelInputreaderConfig.filePath); // Sets the filter based on extended class implementation. ExcelInputFormat.setInputPathFilter(job, ExcelReaderRegexFilter.class); SourceInputFormatProvider inputFormatProvider = new SourceInputFormatProvider(ExcelInputFormat.class, job.getConfiguration()); batchSourceContext.setInput(Input.of(excelInputreaderConfig.referenceName, inputFormatProvider)); }
From source file:co.cask.hydrator.plugin.batch.source.FileBatchSource.java
License:Apache License
@Override public void prepareRun(BatchSourceContext context) throws Exception { //SimpleDateFormat needs to be local because it is not threadsafe SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-HH"); //calculate date one hour ago, rounded down to the nearest hour prevHour = new Date(context.getLogicalStartTime() - TimeUnit.HOURS.toMillis(1)); Calendar cal = Calendar.getInstance(); cal.setTime(prevHour);// w ww . j av a2s .c o m cal.set(Calendar.MINUTE, 0); cal.set(Calendar.SECOND, 0); cal.set(Calendar.MILLISECOND, 0); prevHour = cal.getTime(); Job job = JobUtils.createInstance(); Configuration conf = job.getConfiguration(); Map<String, String> properties = GSON.fromJson(config.fileSystemProperties, MAP_STRING_STRING_TYPE); //noinspection ConstantConditions for (Map.Entry<String, String> entry : properties.entrySet()) { conf.set(entry.getKey(), entry.getValue()); } conf.set(INPUT_REGEX_CONFIG, config.fileRegex); conf.set(INPUT_NAME_CONFIG, config.path); if (config.timeTable != null) { table = context.getDataset(config.timeTable); datesToRead = Bytes.toString(table.read(LAST_TIME_READ)); if (datesToRead == null) { List<Date> firstRun = Lists.newArrayList(new Date(0)); datesToRead = GSON.toJson(firstRun, ARRAYLIST_DATE_TYPE); } List<Date> attempted = Lists.newArrayList(prevHour); String updatedDatesToRead = GSON.toJson(attempted, ARRAYLIST_DATE_TYPE); if (!updatedDatesToRead.equals(datesToRead)) { table.write(LAST_TIME_READ, updatedDatesToRead); } conf.set(LAST_TIME_READ, datesToRead); } conf.set(CUTOFF_READ_TIME, dateFormat.format(prevHour)); FileInputFormat.setInputPathFilter(job, BatchFileFilter.class); FileInputFormat.addInputPath(job, new Path(config.path)); if (config.maxSplitSize != null) { FileInputFormat.setMaxInputSplitSize(job, config.maxSplitSize); } context.setInput( Input.of(config.referenceName, new SourceInputFormatProvider(config.inputFormatClass, conf))); }
From source file:co.cask.hydrator.plugin.batch.source.HiveBatchSource.java
License:Apache License
@Override public void prepareRun(BatchSourceContext context) throws Exception { // This line is to load VersionInfo class here to make it available in the HCatInputFormat.setInput call. This is // needed to support CDAP 3.2 where we were just exposing the classes of the plugin jar and not the resources. LOG.trace("Hadoop version: {}", VersionInfo.getVersion()); Job job = JobUtils.createInstance(); Configuration conf = job.getConfiguration(); conf.set(HiveConf.ConfVars.METASTOREURIS.varname, config.metaStoreURI); if (UserGroupInformation.isSecurityEnabled()) { conf.set(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL.varname, "true"); conf.set("hive.metastore.token.signature", HiveAuthFactory.HS2_CLIENT_TOKEN); }/*from w w w .jav a 2s .com*/ // Use the current thread's classloader to ensure that when setInput is called it can access VersionInfo class // loaded above. This is needed to support CDAP 3.2 where we were just exposing classes to plugin jars and not // resources. ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); try { Thread.currentThread().setContextClassLoader(getClass().getClassLoader()); HCatInputFormat.setInput(conf, config.dbName, config.tableName, config.partitions); } finally { Thread.currentThread().setContextClassLoader(classLoader); } HCatSchema hCatSchema = HCatInputFormat.getTableSchema(conf); if (config.schema != null) { // if the user provided a schema then we should use that schema to read the table. This will allow user to // drop non-primitive types and read the table. hCatSchema = HiveSchemaConverter.toHiveSchema(Schema.parseJson(config.schema), hCatSchema); HCatInputFormat.setOutputSchema(job, hCatSchema); } HiveSchemaStore.storeHiveSchema(context, config.dbName, config.tableName, hCatSchema); context.setInput( Input.of(config.referenceName, new SourceInputFormatProvider(HCatInputFormat.class, conf))); }
From source file:co.cask.hydrator.plugin.batch.source.TimePartitionedFileSetDatasetAvroSource.java
License:Apache License
@Override protected void addInputFormatConfiguration(Map<String, String> config) { try {//from w ww .j a v a 2 s. c o m Job job = JobUtils.createInstance(); Configuration hConf = job.getConfiguration(); Schema avroSchema = new Schema.Parser().parse(tpfsAvroConfig.schema); AvroJob.setInputKeySchema(job, avroSchema); for (Map.Entry<String, String> entry : hConf) { config.put(entry.getKey(), entry.getValue()); } } catch (IOException e) { // Shouldn't happen throw Throwables.propagate(e); } }
From source file:co.cask.hydrator.plugin.batch.source.TimePartitionedFileSetDatasetParquetSource.java
License:Apache License
@Override protected void addInputFormatConfiguration(Map<String, String> config) { try {/* w w w. j a v a2 s . c om*/ Job job = JobUtils.createInstance(); Configuration hConf = job.getConfiguration(); Schema avroSchema = new Schema.Parser().parse(tpfsParquetConfig.schema.toLowerCase()); AvroParquetInputFormat.setAvroReadSchema(job, avroSchema); for (Map.Entry<String, String> entry : hConf) { config.put(entry.getKey(), entry.getValue()); } } catch (IOException e) { // Shouldn't happen throw Throwables.propagate(e); } }
From source file:co.cask.hydrator.plugin.batch.source.XMLReaderBatchSource.java
License:Apache License
@Override public void prepareRun(BatchSourceContext context) throws Exception { Job job = JobUtils.createInstance(); Configuration conf = job.getConfiguration(); conf.set(XMLInputFormat.XML_INPUTFORMAT_PATH_NAME, config.path); conf.set(XMLInputFormat.XML_INPUTFORMAT_NODE_PATH, config.nodePath); if (StringUtils.isNotEmpty(config.pattern)) { conf.set(XMLInputFormat.XML_INPUTFORMAT_PATTERN, config.pattern); }/*from ww w . j a v a2 s . c o m*/ conf.set(XMLInputFormat.XML_INPUTFORMAT_FILE_ACTION, config.actionAfterProcess); if (StringUtils.isNotEmpty(config.targetFolder)) { conf.set(XMLInputFormat.XML_INPUTFORMAT_TARGET_FOLDER, config.targetFolder); } setFileTrackingInfo(context, conf); //Create a temporary directory, in which XMLRecordReader will add file tracking information. fileSystem = FileSystem.get(conf); long startTime = context.getLogicalStartTime(); //Create temp file name using start time to make it unique. String tempDirectory = config.tableName + startTime; tempDirectoryPath = new Path(config.temporaryFolder, tempDirectory); fileSystem.mkdirs(tempDirectoryPath); fileSystem.deleteOnExit(tempDirectoryPath); conf.set(XMLInputFormat.XML_INPUTFORMAT_PROCESSED_DATA_TEMP_FOLDER, tempDirectoryPath.toUri().toString()); XMLInputFormat.setInputPathFilter(job, BatchXMLFileFilter.class); XMLInputFormat.addInputPath(job, new Path(config.path)); context.setInput(Input.of(config.referenceName, new SourceInputFormatProvider(XMLInputFormat.class, conf))); }