Example usage for org.apache.hadoop.mapreduce Job getConfiguration

Introduction

This page lists example usages of org.apache.hadoop.mapreduce.Job.getConfiguration(), collected from open-source projects.

Prototype

public Configuration getConfiguration() 

Document

Return the configuration for the job.
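
Before the project examples below, here is a minimal, self-contained sketch of the call in isolation. The property key "my.example.key" is illustrative only and does not come from any of the projects listed here.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetConfigurationExample {
    public static void main(String[] args) throws IOException {
        // Create a Job and obtain its underlying Configuration.
        Job job = Job.getInstance();
        Configuration conf = job.getConfiguration();
        // Set and read back an illustrative property.
        conf.set("my.example.key", "some-value");
        System.out.println(conf.get("my.example.key"));
    }
}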

Usage

From source file: co.cask.hydrator.plugin.batch.CopybookInputFormat.java

License: Apache License

public static void setBinaryFilePath(Job job, String binaryFile) {
    job.getConfiguration().set(COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH, binaryFile);
}
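
A setter like this is normally paired with a read side inside the InputFormat itself. The getter below is a hypothetical sketch of that counterpart (only the constant name appears in the source above); it reads the value back from the task's Configuration via JobContext.getConfiguration():

public static String getBinaryFilePath(JobContext context) {
    // Hypothetical counterpart, not shown in the source.
    return context.getConfiguration().get(COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH);
}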

From source file: co.cask.hydrator.plugin.batch.CopybookSource.java

License: Apache License

@Override
public void prepareRun(BatchSourceContext context) throws IOException {
    Job job = JobUtils.createInstance();
    CopybookInputFormat.setCopybookInputformatCblContents(job, config.copybookContents);
    CopybookInputFormat.setBinaryFilePath(job, config.binaryFilePath);
    // Set the input file path for the job
    CopybookInputFormat.setInputPaths(job, config.binaryFilePath);
    CopybookInputFormat.setMaxInputSplitSize(job, config.maxSplitSize);
    context.setInput(Input.of(config.referenceName,
            new SourceInputFormatProvider(CopybookInputFormat.class, job.getConfiguration())));
}

From source file: co.cask.hydrator.plugin.batch.sink.HiveBatchSink.java

License: Apache License

@Override
public void prepareRun(BatchSinkContext context) throws Exception {
    Job job = JobUtils.createInstance();
    Configuration conf = job.getConfiguration();

    HiveSinkOutputFormatProvider sinkOutputFormatProvider = new HiveSinkOutputFormatProvider(job, config);
    HCatSchema hiveSchema = sinkOutputFormatProvider.getHiveSchema();
    HiveSchemaStore.storeHiveSchema(context, config.dbName, config.tableName, hiveSchema);
    context.addOutput(Output.of(config.referenceName, sinkOutputFormatProvider).alias(config.tableName));
}

From source file: co.cask.hydrator.plugin.batch.source.ExcelInputFormat.java

License: Apache License

public static void setConfigurations(Job job, String filePattern, String sheetName, boolean reprocess,
        int sheetNo, String columnList, boolean skipFirstRow, String terminateIfEmptyRow, String rowLimit,
        String ifErrorRecord, String processedFiles) {

    Configuration configuration = job.getConfiguration();
    configuration.set(FILE_PATTERN, filePattern);
    configuration.set(SHEET_NAME, sheetName);
    configuration.setBoolean(RE_PROCESS, reprocess);
    configuration.setInt(SHEET_NO, sheetNo);
    configuration.set(COLUMN_LIST, columnList);
    configuration.setBoolean(SKIP_FIRST_ROW, skipFirstRow);
    configuration.set(TERMINATE_IF_EMPTY_ROW, terminateIfEmptyRow);

    if (!Strings.isNullOrEmpty(rowLimit)) {
        configuration.set(ROWS_LIMIT, rowLimit);
    }

    configuration.set(IF_ERROR_RECORD, ifErrorRecord);
    configuration.set(PROCESSED_FILES, processedFiles);
}
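
The ExcelInputReader example that follows shows the real call site for this setter. As a quicker orientation, here is the same call sketched with concrete placeholder values; every argument below is made up:

// Illustrative invocation; all argument values are placeholders.
Job job = Job.getInstance();
ExcelInputFormat.setConfigurations(job,
        ".*\\.xlsx",                  // filePattern
        "Sheet1",                     // sheetName
        false,                        // reprocess
        0,                            // sheetNo
        "A,B,C",                      // columnList
        true,                         // skipFirstRow
        "true",                       // terminateIfEmptyRow
        "1000",                       // rowLimit
        "Ignore error and continue",  // ifErrorRecord
        "[]");                        // processedFiles (JSON list)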

From source file: co.cask.hydrator.plugin.batch.source.ExcelInputReader.java

License: Apache License

@Override
public void prepareRun(BatchSourceContext batchSourceContext) throws Exception {

    Job job = JobUtils.createInstance();

    String processFiles = GSON.toJson(getAllProcessedFiles(batchSourceContext), ARRAYLIST_PREPROCESSED_FILES);

    ExcelInputFormat.setConfigurations(job, excelInputreaderConfig.filePattern,
            excelInputreaderConfig.sheetName, excelInputreaderConfig.reprocess, excelInputreaderConfig.sheetNo,
            excelInputreaderConfig.columnList, excelInputreaderConfig.skipFirstRow,
            excelInputreaderConfig.terminateIfEmptyRow, excelInputreaderConfig.rowsLimit,
            excelInputreaderConfig.ifErrorRecord, processFiles);

    // Sets the input path(s).
    ExcelInputFormat.addInputPaths(job, excelInputreaderConfig.filePath);

    // Sets the filter based on extended class implementation.
    ExcelInputFormat.setInputPathFilter(job, ExcelReaderRegexFilter.class);
    SourceInputFormatProvider inputFormatProvider = new SourceInputFormatProvider(ExcelInputFormat.class,
            job.getConfiguration());
    batchSourceContext.setInput(Input.of(excelInputreaderConfig.referenceName, inputFormatProvider));

}

From source file: co.cask.hydrator.plugin.batch.source.FileBatchSource.java

License: Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    // SimpleDateFormat must be local because it is not thread-safe
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-HH");

    //calculate date one hour ago, rounded down to the nearest hour
    prevHour = new Date(context.getLogicalStartTime() - TimeUnit.HOURS.toMillis(1));
    Calendar cal = Calendar.getInstance();
    cal.setTime(prevHour);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    prevHour = cal.getTime();

    Job job = JobUtils.createInstance();
    Configuration conf = job.getConfiguration();

    Map<String, String> properties = GSON.fromJson(config.fileSystemProperties, MAP_STRING_STRING_TYPE);
    //noinspection ConstantConditions
    for (Map.Entry<String, String> entry : properties.entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }

    conf.set(INPUT_REGEX_CONFIG, config.fileRegex);
    conf.set(INPUT_NAME_CONFIG, config.path);

    if (config.timeTable != null) {
        table = context.getDataset(config.timeTable);
        datesToRead = Bytes.toString(table.read(LAST_TIME_READ));
        if (datesToRead == null) {
            List<Date> firstRun = Lists.newArrayList(new Date(0));
            datesToRead = GSON.toJson(firstRun, ARRAYLIST_DATE_TYPE);
        }
        List<Date> attempted = Lists.newArrayList(prevHour);
        String updatedDatesToRead = GSON.toJson(attempted, ARRAYLIST_DATE_TYPE);
        if (!updatedDatesToRead.equals(datesToRead)) {
            table.write(LAST_TIME_READ, updatedDatesToRead);
        }
        conf.set(LAST_TIME_READ, datesToRead);
    }

    conf.set(CUTOFF_READ_TIME, dateFormat.format(prevHour));
    FileInputFormat.setInputPathFilter(job, BatchFileFilter.class);
    FileInputFormat.addInputPath(job, new Path(config.path));
    if (config.maxSplitSize != null) {
        FileInputFormat.setMaxInputSplitSize(job, config.maxSplitSize);
    }
    context.setInput(
            Input.of(config.referenceName, new SourceInputFormatProvider(config.inputFormatClass, conf)));
}

From source file: co.cask.hydrator.plugin.batch.source.HiveBatchSource.java

License: Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    // This line is to load VersionInfo class here to make it available in the HCatInputFormat.setInput call. This is
    // needed to support CDAP 3.2 where we were just exposing the classes of the plugin jar and not the resources.
    LOG.trace("Hadoop version: {}", VersionInfo.getVersion());
    Job job = JobUtils.createInstance();
    Configuration conf = job.getConfiguration();

    conf.set(HiveConf.ConfVars.METASTOREURIS.varname, config.metaStoreURI);

    if (UserGroupInformation.isSecurityEnabled()) {
        conf.set(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL.varname, "true");
        conf.set("hive.metastore.token.signature", HiveAuthFactory.HS2_CLIENT_TOKEN);
    }
    // Use the current thread's classloader to ensure that when setInput is called it can access VersionInfo class
    // loaded above. This is needed to support CDAP 3.2 where we were just exposing classes to plugin jars and not
    // resources.
    ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
    try {
        Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
        HCatInputFormat.setInput(conf, config.dbName, config.tableName, config.partitions);
    } finally {
        Thread.currentThread().setContextClassLoader(classLoader);
    }

    HCatSchema hCatSchema = HCatInputFormat.getTableSchema(conf);
    if (config.schema != null) {
        // if the user provided a schema then we should use that schema to read the table. This will allow user to
        // drop non-primitive types and read the table.
        hCatSchema = HiveSchemaConverter.toHiveSchema(Schema.parseJson(config.schema), hCatSchema);
        HCatInputFormat.setOutputSchema(job, hCatSchema);
    }
    HiveSchemaStore.storeHiveSchema(context, config.dbName, config.tableName, hCatSchema);
    context.setInput(
            Input.of(config.referenceName, new SourceInputFormatProvider(HCatInputFormat.class, conf)));
}

From source file: co.cask.hydrator.plugin.batch.source.TimePartitionedFileSetDatasetAvroSource.java

License: Apache License

@Override
protected void addInputFormatConfiguration(Map<String, String> config) {
    try {
        Job job = JobUtils.createInstance();
        Configuration hConf = job.getConfiguration();

        Schema avroSchema = new Schema.Parser().parse(tpfsAvroConfig.schema);
        AvroJob.setInputKeySchema(job, avroSchema);
        for (Map.Entry<String, String> entry : hConf) {
            config.put(entry.getKey(), entry.getValue());
        }
    } catch (IOException e) {
        // Shouldn't happen
        throw Throwables.propagate(e);
    }
}
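
Note the copy idiom in the loop above: Hadoop's Configuration implements Iterable<Map.Entry<String, String>>, so iterating the Job's Configuration and putting each entry into the map is enough to hand the complete Hadoop configuration back to the caller as plain strings. The Parquet variant below uses the same pattern.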

From source file: co.cask.hydrator.plugin.batch.source.TimePartitionedFileSetDatasetParquetSource.java

License: Apache License

@Override
protected void addInputFormatConfiguration(Map<String, String> config) {
    try {
        Job job = JobUtils.createInstance();
        Configuration hConf = job.getConfiguration();

        Schema avroSchema = new Schema.Parser().parse(tpfsParquetConfig.schema.toLowerCase());
        AvroParquetInputFormat.setAvroReadSchema(job, avroSchema);
        for (Map.Entry<String, String> entry : hConf) {
            config.put(entry.getKey(), entry.getValue());
        }
    } catch (IOException e) {
        // Shouldn't happen
        throw Throwables.propagate(e);
    }
}

From source file: co.cask.hydrator.plugin.batch.source.XMLReaderBatchSource.java

License: Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    Job job = JobUtils.createInstance();
    Configuration conf = job.getConfiguration();
    conf.set(XMLInputFormat.XML_INPUTFORMAT_PATH_NAME, config.path);
    conf.set(XMLInputFormat.XML_INPUTFORMAT_NODE_PATH, config.nodePath);
    if (StringUtils.isNotEmpty(config.pattern)) {
        conf.set(XMLInputFormat.XML_INPUTFORMAT_PATTERN, config.pattern);
    }
    conf.set(XMLInputFormat.XML_INPUTFORMAT_FILE_ACTION, config.actionAfterProcess);
    if (StringUtils.isNotEmpty(config.targetFolder)) {
        conf.set(XMLInputFormat.XML_INPUTFORMAT_TARGET_FOLDER, config.targetFolder);
    }

    setFileTrackingInfo(context, conf);

    //Create a temporary directory, in which XMLRecordReader will add file tracking information.
    fileSystem = FileSystem.get(conf);
    long startTime = context.getLogicalStartTime();
    //Create temp file name using start time to make it unique.
    String tempDirectory = config.tableName + startTime;
    tempDirectoryPath = new Path(config.temporaryFolder, tempDirectory);
    fileSystem.mkdirs(tempDirectoryPath);
    fileSystem.deleteOnExit(tempDirectoryPath);
    conf.set(XMLInputFormat.XML_INPUTFORMAT_PROCESSED_DATA_TEMP_FOLDER, tempDirectoryPath.toUri().toString());

    XMLInputFormat.setInputPathFilter(job, BatchXMLFileFilter.class);
    XMLInputFormat.addInputPath(job, new Path(config.path));
    context.setInput(Input.of(config.referenceName, new SourceInputFormatProvider(XMLInputFormat.class, conf)));
}