List of usage examples for org.apache.hadoop.mapred JobConf set
public void set(String name, String value)
Sets the value of the name property.
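Before the collected examples, here is a minimal, self-contained sketch of a typical call; the property name "example.property" and its value are illustrative placeholders and are not taken from any of the source files listed below.

import org.apache.hadoop.mapred.JobConf;

public class JobConfSetExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // Hypothetical key/value pair, chosen only for illustration.
        conf.set("example.property", "example-value");
        // The stored value can be read back through the inherited get() method.
        System.out.println(conf.get("example.property")); // prints "example-value"
    }
}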
From source file:com.tripadvisor.hadoop.DumpDDL.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration config = getConf();
    JobConf conf = new JobConf(config, DumpDDL.class);
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    for (String arg : args) {
        if (arg.contains("=")) {
            String vname = arg.substring(0, arg.indexOf('='));
            String vval = arg.substring(arg.indexOf('=') + 1);
            conf.set(vname, vval.replace("\"", ""));
        }
    }
    HiveUtil hu = new HiveUtil();
    System.out.println(hu.dumpDDL(conf.get(DB_NAME, "default"), conf.get("ignore.tables.filename")));
    return 0;
}
From source file:com.twitter.maple.hbase.mapred.TableInputFormat.java
License:Apache License
public static void setTableName(JobConf job, String tableName) {
    // Make sure that table has not been set before
    String oldTableName = getTableName(job);
    if (oldTableName != null) {
        throw new RuntimeException("table name already set to: '" + oldTableName + "'");
    }
    job.set(INPUT_TABLE, tableName);
}
From source file:com.twitter.pig.backend.hadoop.executionengine.tez.TezExecutionEngine.java
License:Apache License
/**
 * Method to apply pig properties to JobConf
 * (replaces properties with resulting jobConf values)
 * @param conf JobConf with appropriate hadoop resource files
 * @param properties Pig properties that will override hadoop properties; properties might be modified
 */
@SuppressWarnings("deprecation")
private void recomputeProperties(JobConf jobConf, Properties properties) {
    // We need to load the properties from the hadoop configuration
    // We want to override these with any existing properties we have.
    if (jobConf != null && properties != null) {
        // set user properties on the jobConf to ensure that defaults
        // and deprecation is applied correctly
        Enumeration<Object> propertiesIter = properties.keys();
        while (propertiesIter.hasMoreElements()) {
            String key = (String) propertiesIter.nextElement();
            String val = properties.getProperty(key);
            // We do not put user.name, See PIG-1419
            if (!key.equals("user.name"))
                jobConf.set(key, val);
        }
        // clear user defined properties and re-populate
        properties.clear();
        Iterator<Map.Entry<String, String>> iter = jobConf.iterator();
        while (iter.hasNext()) {
            Map.Entry<String, String> entry = iter.next();
            properties.put(entry.getKey(), entry.getValue());
        }
    }
}
From source file:com.uber.hoodie.common.HoodieMergeOnReadTestUtils.java
License:Apache License
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, Schema schema,
        String basePath) {
    List<Schema.Field> fields = schema.getFields();
    String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
    String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    Configuration conf = HoodieTestUtils.getDefaultHadoopConf();
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
    jobConf.set("partition_columns", "datestr");
    conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
    conf.set("partition_columns", "datestr");
    inputFormat.setConf(conf);
    jobConf.addResource(conf);
}
From source file:com.uber.hoodie.common.HoodieMergeOnReadTestUtils.java
License:Apache License
private static void setInputPath(JobConf jobConf, String inputPath) {
    jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath);
    jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath);
    jobConf.set("map.input.dir", inputPath);
}
From source file:com.uber.hoodie.hadoop.InputFormatTestUtil.java
License:Apache License
public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) {
    String modePropertyName = String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN,
            HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE);
    String startCommitTimestampName = String.format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN,
            HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.set(startCommitTimestampName, startCommit);
    String maxCommitPulls = String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN,
            HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.setInt(maxCommitPulls, numberOfCommitsToPull);
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.java
License:Apache License
@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf job,
        final Reporter reporter) throws IOException {
    LOG.info("Before adding Hoodie columns, Projections :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
    // Hive (across all versions) fails for queries like select count(`_hoodie_commit_time`) from table;
    // In this case, the projection fields gets removed. Looking at HiveInputFormat implementation, in some cases
    // hoodie additional projection columns are reset after calling setConf and only natural projections
    // (one found in select queries) are set. things would break because of this.
    // For e:g _hoodie_record_key would be missing and merge step would throw exceptions.
    // To fix this, hoodie columns are appended late at the time record-reader gets built instead of construction time.
    this.conf = addRequiredProjectionFields(job);

    LOG.info("Creating record reader with readCols :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
    // sanity check
    Preconditions.checkArgument(split instanceof HoodieRealtimeFileSplit,
            "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + split);

    // Reset the original column ids and names
    job.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, READ_COLUMN_IDS);
    job.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, READ_COLUMN_NAMES);

    return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, job,
            super.getRecordReader(split, job, reporter));
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
public void testReader(boolean partitioned) throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
    String baseInstant = "100";
    File partitionDir = partitioned
            ? InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 100, baseInstant)
            : InputFormatTestUtil.prepareNonPartitionedParquetDataset(basePath, schema, 1, 100, baseInstant);
    InputFormatTestUtil.commit(basePath, baseInstant);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    List<Pair<String, Integer>> logVersionsWithAction = new ArrayList<>();
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 1));
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 2));
    // TODO: HUDI-154 Once Hive 2.x PR (PR-674) is merged, enable this change
    // logVersionsWithAction.add(Pair.of(HoodieTimeline.ROLLBACK_ACTION, 3));
    FileSlice fileSlice = new FileSlice(
            partitioned
                    ? FSUtils.getRelativePartitionPath(new Path(basePath.getRoot().getAbsolutePath()),
                            new Path(partitionDir.getAbsolutePath()))
                    : "default",
            baseInstant, "fileid0");
    logVersionsWithAction.stream().forEach(logVersionWithAction -> {
        try {
            // update files or generate new log file
            int logVersion = logVersionWithAction.getRight();
            String action = logVersionWithAction.getKey();
            int baseInstantTs = Integer.parseInt(baseInstant);
            String instantTime = String.valueOf(baseInstantTs + logVersion);
            String latestInstant = action.equals(HoodieTimeline.ROLLBACK_ACTION)
                    ? String.valueOf(baseInstantTs + logVersion - 2)
                    : instantTime;
            HoodieLogFormat.Writer writer = null;
            if (action.equals(HoodieTimeline.ROLLBACK_ACTION)) {
                writer = writeRollback(partitionDir, schema, "fileid0", baseInstant, instantTime,
                        String.valueOf(baseInstantTs + logVersion - 1), logVersion);
            } else {
                writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", baseInstant, instantTime, 100, 0,
                        logVersion);
            }
            long size = writer.getCurrentSize();
            writer.close();
            assertTrue("block - size should be > 0", size > 0);

            // create a split with baseFile (parquet file written earlier) and new log file(s)
            fileSlice.addLogFile(writer.getLogFile());
            HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
                    new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, jobConf),
                    basePath.getRoot().getPath(),
                    fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
                            .map(h -> h.getPath().toString()).collect(Collectors.toList()),
                    instantTime);

            // create a RecordReader to be used by HoodieRealtimeRecordReader
            RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
                    new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
            JobConf jobConf = new JobConf();
            List<Schema.Field> fields = schema.getFields();
            String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
            String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
            jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
            jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
            if (partitioned) {
                jobConf.set("partition_columns", "datestr");
            }

            // validate record reader compaction
            HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
            // use reader to read base Parquet File and log file, merge in flight and return latest commit
            // here all 100 records should be updated, see above
            NullWritable key = recordReader.createKey();
            ArrayWritable value = recordReader.createValue();
            while (recordReader.next(key, value)) {
                Writable[] values = value.get();
                // check if the record written is with latest commit, here "101"
                Assert.assertEquals(latestInstant, values[0].toString());
                key = recordReader.createKey();
                value = recordReader.createValue();
            }
        } catch (Exception ioe) {
            throw new HoodieException(ioe.getMessage(), ioe);
        }
    });

    // Add Rollback last version to next log-file
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
@Test
public void testUnMergedReader() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    final int numRecords = 1000;
    final int firstBatchLastRecordKey = numRecords - 1;
    final int secondBatchLastRecordKey = 2 * numRecords - 1;
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numRecords, commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // insert new records to log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime,
            newCommitTime, numRecords, numRecords, 0);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue("block - size should be > 0", size > 0);

    // create a split with baseFile (parquet file written earlier) and new log file(s)
    String logFilePath = writer.getLogFile().getPath().toString();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
            new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
            basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

    // create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
            new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();
    String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
    String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
    jobConf.set("partition_columns", "datestr");
    // Enable merge skipping.
    jobConf.set("hoodie.realtime.merge.skip", "true");

    // validate unmerged record reader
    RealtimeUnmergedRecordReader recordReader = new RealtimeUnmergedRecordReader(split, jobConf, reader);
    // use reader to read base Parquet File and log file
    // here all records should be present. Also ensure log records are in order.
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    int numRecordsAtCommit1 = 0;
    int numRecordsAtCommit2 = 0;
    Set<Integer> seenKeys = new HashSet<>();
    Integer lastSeenKeyFromLog = firstBatchLastRecordKey;
    while (recordReader.next(key, value)) {
        Writable[] values = value.get();
        String gotCommit = values[0].toString();
        String keyStr = values[2].toString();
        Integer gotKey = Integer.parseInt(keyStr.substring("key".length()));
        if (gotCommit.equals(newCommitTime)) {
            numRecordsAtCommit2++;
            Assert.assertTrue(gotKey > firstBatchLastRecordKey);
            Assert.assertTrue(gotKey <= secondBatchLastRecordKey);
            Assert.assertEquals(gotKey.intValue(), lastSeenKeyFromLog + 1);
            lastSeenKeyFromLog++;
        } else {
            numRecordsAtCommit1++;
            Assert.assertTrue(gotKey >= 0);
            Assert.assertTrue(gotKey <= firstBatchLastRecordKey);
        }
        // Ensure unique key
        Assert.assertFalse(seenKeys.contains(gotKey));
        seenKeys.add(gotKey);
        key = recordReader.createKey();
        value = recordReader.createValue();
    }
    Assert.assertEquals(numRecords, numRecordsAtCommit1);
    Assert.assertEquals(numRecords, numRecordsAtCommit2);
    Assert.assertEquals(2 * numRecords, seenKeys.size());
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
@Test
public void testReaderWithNestedAndComplexSchema() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getComplexEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    int numberOfRecords = 100;
    int numberOfLogRecords = numberOfRecords / 2;
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numberOfRecords, commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // update files or generate new log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime,
            numberOfLogRecords);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue("block - size should be > 0", size > 0);
    InputFormatTestUtil.deltaCommit(basePath, newCommitTime);

    // create a split with baseFile (parquet file written earlier) and new log file(s)
    String logFilePath = writer.getLogFile().getPath().toString();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
            new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
            basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

    // create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
            new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();
    String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(","));
    String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
    jobConf.set("partition_columns", "datestr");

    // validate record reader compaction
    HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
    // use reader to read base Parquet File and log file, merge in flight and return latest commit
    // here the first 50 records should be updated, see above
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    int numRecordsRead = 0;
    while (recordReader.next(key, value)) {
        int currentRecordNo = numRecordsRead;
        ++numRecordsRead;
        Writable[] values = value.get();
        String recordCommitTime;
        // check if the record written is with latest commit, here "101"
        if (numRecordsRead > numberOfLogRecords) {
            recordCommitTime = commitTime;
        } else {
            recordCommitTime = newCommitTime;
        }
        String recordCommitTimeSuffix = "@" + recordCommitTime;

        Assert.assertEquals(values[0].toString(), recordCommitTime);
        key = recordReader.createKey();
        value = recordReader.createValue();

        // Assert type STRING
        Assert.assertEquals("test value for field: field1", values[5].toString(), "field" + currentRecordNo);
        Assert.assertEquals("test value for field: field2", values[6].toString(),
                "field" + currentRecordNo + recordCommitTimeSuffix);
        Assert.assertEquals("test value for field: name", values[7].toString(), "name" + currentRecordNo);

        // Assert type INT
        IntWritable intWritable = (IntWritable) values[8];
        Assert.assertEquals("test value for field: favoriteIntNumber", intWritable.get(),
                currentRecordNo + recordCommitTime.hashCode());

        // Assert type LONG
        LongWritable longWritable = (LongWritable) values[9];
        Assert.assertEquals("test value for field: favoriteNumber", longWritable.get(),
                currentRecordNo + recordCommitTime.hashCode());

        // Assert type FLOAT
        FloatWritable floatWritable = (FloatWritable) values[10];
        Assert.assertEquals("test value for field: favoriteFloatNumber", floatWritable.get(),
                (float) ((currentRecordNo + recordCommitTime.hashCode()) / 1024.0), 0);

        // Assert type DOUBLE
        DoubleWritable doubleWritable = (DoubleWritable) values[11];
        Assert.assertEquals("test value for field: favoriteDoubleNumber", doubleWritable.get(),
                (currentRecordNo + recordCommitTime.hashCode()) / 1024.0, 0);

        // Assert type MAP
        ArrayWritable mapItem = (ArrayWritable) values[12];
        Writable mapItemValue1 = mapItem.get()[0];
        Writable mapItemValue2 = mapItem.get()[1];
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue1).get()[0].toString(),
                "mapItem1");
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue2).get()[0].toString(),
                "mapItem2");
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue1).get().length, 2);
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue2).get().length, 2);
        Writable mapItemValue1value = ((ArrayWritable) mapItemValue1).get()[1];
        Writable mapItemValue2value = ((ArrayWritable) mapItemValue2).get()[1];
        Assert.assertEquals("test value for field: tags[\"mapItem1\"].item1",
                ((ArrayWritable) mapItemValue1value).get()[0].toString(), "item" + currentRecordNo);
        Assert.assertEquals("test value for field: tags[\"mapItem2\"].item1",
                ((ArrayWritable) mapItemValue2value).get()[0].toString(), "item2" + currentRecordNo);
        Assert.assertEquals("test value for field: tags[\"mapItem1\"].item2",
                ((ArrayWritable) mapItemValue1value).get()[1].toString(),
                "item" + currentRecordNo + recordCommitTimeSuffix);
        Assert.assertEquals("test value for field: tags[\"mapItem2\"].item2",
                ((ArrayWritable) mapItemValue2value).get()[1].toString(),
                "item2" + currentRecordNo + recordCommitTimeSuffix);

        // Assert type RECORD
        ArrayWritable recordItem = (ArrayWritable) values[13];
        Writable[] nestedRecord = recordItem.get();
        Assert.assertEquals("test value for field: testNestedRecord.isAdmin",
                ((BooleanWritable) nestedRecord[0]).get(), false);
        Assert.assertEquals("test value for field: testNestedRecord.userId", nestedRecord[1].toString(),
                "UserId" + currentRecordNo + recordCommitTimeSuffix);

        // Assert type ARRAY
        ArrayWritable arrayValue = (ArrayWritable) values[14];
        Writable[] arrayValues = arrayValue.get();
        for (int i = 0; i < arrayValues.length; i++) {
            Assert.assertEquals("test value for field: stringArray",
                    "stringArray" + i + recordCommitTimeSuffix, arrayValues[i].toString());
        }
    }
}