List of usage examples for org.apache.hadoop.mapred JobConf set
public void set(String name, String value)
Sets the value of the name property.
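Before the collected examples, here is a minimal, self-contained sketch of a typical call; the property name "example.property" and its value are illustrative placeholders and are not taken from any of the source files listed below.

import org.apache.hadoop.mapred.JobConf;

public class JobConfSetExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // Hypothetical key/value pair, chosen only for illustration.
        conf.set("example.property", "example-value");
        // The stored value can be read back through the inherited get() method.
        System.out.println(conf.get("example.property")); // prints "example-value"
    }
}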
From source file:com.tripadvisor.hadoop.DumpDDL.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration config = getConf();
    JobConf conf = new JobConf(config, DumpDDL.class);
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    for (String arg : args) {
        if (arg.contains("=")) {
            String vname = arg.substring(0, arg.indexOf('='));
            String vval = arg.substring(arg.indexOf('=') + 1);
            conf.set(vname, vval.replace("\"", ""));
        }
    }
    HiveUtil hu = new HiveUtil();
    System.out.println(hu.dumpDDL(conf.get(DB_NAME, "default"), conf.get("ignore.tables.filename")));
    return 0;
}
From source file:com.twitter.maple.hbase.mapred.TableInputFormat.java
License:Apache License
public static void setTableName(JobConf job, String tableName) {
    // Make sure that table has not been set before
    String oldTableName = getTableName(job);
    if (oldTableName != null) {
        throw new RuntimeException("table name already set to: '" + oldTableName + "'");
    }
    job.set(INPUT_TABLE, tableName);
}
From source file:com.twitter.pig.backend.hadoop.executionengine.tez.TezExecutionEngine.java
License:Apache License
/**
 * Method to apply pig properties to JobConf
 * (replaces properties with resulting jobConf values)
 * @param conf JobConf with appropriate hadoop resource files
 * @param properties Pig properties that will override hadoop properties; properties might be modified
 */
@SuppressWarnings("deprecation")
private void recomputeProperties(JobConf jobConf, Properties properties) {
    // We need to load the properties from the hadoop configuration
    // We want to override these with any existing properties we have.
    if (jobConf != null && properties != null) {
        // set user properties on the jobConf to ensure that defaults
        // and deprecation is applied correctly
        Enumeration<Object> propertiesIter = properties.keys();
        while (propertiesIter.hasMoreElements()) {
            String key = (String) propertiesIter.nextElement();
            String val = properties.getProperty(key);
            // We do not put user.name, See PIG-1419
            if (!key.equals("user.name"))
                jobConf.set(key, val);
        }
        // clear user defined properties and re-populate
        properties.clear();
        Iterator<Map.Entry<String, String>> iter = jobConf.iterator();
        while (iter.hasNext()) {
            Map.Entry<String, String> entry = iter.next();
            properties.put(entry.getKey(), entry.getValue());
        }
    }
}
From source file:com.uber.hoodie.common.HoodieMergeOnReadTestUtils.java
License:Apache License
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, Schema schema,
        String basePath) {
    List<Schema.Field> fields = schema.getFields();
    String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
    String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    Configuration conf = HoodieTestUtils.getDefaultHadoopConf();
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
    jobConf.set("partition_columns", "datestr");
    conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
    conf.set("partition_columns", "datestr");
    inputFormat.setConf(conf);
    jobConf.addResource(conf);
}
From source file:com.uber.hoodie.common.HoodieMergeOnReadTestUtils.java
License:Apache License
private static void setInputPath(JobConf jobConf, String inputPath) {
    jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath);
    jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath);
    jobConf.set("map.input.dir", inputPath);
}
From source file:com.uber.hoodie.hadoop.InputFormatTestUtil.java
License:Apache License
public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) {
    String modePropertyName = String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN,
            HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE);
    String startCommitTimestampName = String.format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN,
            HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.set(startCommitTimestampName, startCommit);
    String maxCommitPulls = String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN,
            HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.setInt(maxCommitPulls, numberOfCommitsToPull);
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.java
License:Apache License
@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf job,
        final Reporter reporter) throws IOException {
    LOG.info("Before adding Hoodie columns, Projections :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
    // Hive (across all versions) fails for queries like select count(`_hoodie_commit_time`) from table;
    // In this case, the projection fields gets removed. Looking at HiveInputFormat implementation, in some cases
    // hoodie additional projection columns are reset after calling setConf and only natural projections
    // (one found in select queries) are set. things would break because of this.
    // For e:g _hoodie_record_key would be missing and merge step would throw exceptions.
    // To fix this, hoodie columns are appended late at the time record-reader gets built instead of construction time.
    this.conf = addRequiredProjectionFields(job);

    LOG.info("Creating record reader with readCols :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
    // sanity check
    Preconditions.checkArgument(split instanceof HoodieRealtimeFileSplit,
            "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + split);

    // Reset the original column ids and names
    job.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, READ_COLUMN_IDS);
    job.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, READ_COLUMN_NAMES);

    return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, job,
            super.getRecordReader(split, job, reporter));
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
public void testReader(boolean partitioned) throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
    String baseInstant = "100";
    File partitionDir = partitioned
            ? InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 100, baseInstant)
            : InputFormatTestUtil.prepareNonPartitionedParquetDataset(basePath, schema, 1, 100, baseInstant);
    InputFormatTestUtil.commit(basePath, baseInstant);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    List<Pair<String, Integer>> logVersionsWithAction = new ArrayList<>();
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 1));
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 2));
    // TODO: HUDI-154 Once Hive 2.x PR (PR-674) is merged, enable this change
    // logVersionsWithAction.add(Pair.of(HoodieTimeline.ROLLBACK_ACTION, 3));
    FileSlice fileSlice = new FileSlice(
            partitioned
                    ? FSUtils.getRelativePartitionPath(new Path(basePath.getRoot().getAbsolutePath()),
                            new Path(partitionDir.getAbsolutePath()))
                    : "default",
            baseInstant, "fileid0");
    logVersionsWithAction.stream().forEach(logVersionWithAction -> {
        try {
            // update files or generate new log file
            int logVersion = logVersionWithAction.getRight();
            String action = logVersionWithAction.getKey();
            int baseInstantTs = Integer.parseInt(baseInstant);
            String instantTime = String.valueOf(baseInstantTs + logVersion);
            String latestInstant = action.equals(HoodieTimeline.ROLLBACK_ACTION)
                    ? String.valueOf(baseInstantTs + logVersion - 2)
                    : instantTime;
            HoodieLogFormat.Writer writer = null;
            if (action.equals(HoodieTimeline.ROLLBACK_ACTION)) {
                writer = writeRollback(partitionDir, schema, "fileid0", baseInstant, instantTime,
                        String.valueOf(baseInstantTs + logVersion - 1), logVersion);
            } else {
                writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", baseInstant, instantTime, 100, 0,
                        logVersion);
            }
            long size = writer.getCurrentSize();
            writer.close();
            assertTrue("block - size should be > 0", size > 0);

            // create a split with baseFile (parquet file written earlier) and new log file(s)
            fileSlice.addLogFile(writer.getLogFile());
            HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
                    new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, jobConf),
                    basePath.getRoot().getPath(),
                    fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
                            .map(h -> h.getPath().toString()).collect(Collectors.toList()),
                    instantTime);

            // create a RecordReader to be used by HoodieRealtimeRecordReader
            RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
                    new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
            JobConf jobConf = new JobConf();
            List<Schema.Field> fields = schema.getFields();
            String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
            String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
            jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
            jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
            if (partitioned) {
                jobConf.set("partition_columns", "datestr");
            }

            // validate record reader compaction
            HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
            // use reader to read base Parquet File and log file, merge in flight and return latest commit
            // here all 100 records should be updated, see above
            NullWritable key = recordReader.createKey();
            ArrayWritable value = recordReader.createValue();
            while (recordReader.next(key, value)) {
                Writable[] values = value.get();
                // check if the record written is with latest commit, here "101"
                Assert.assertEquals(latestInstant, values[0].toString());
                key = recordReader.createKey();
                value = recordReader.createValue();
            }
        } catch (Exception ioe) {
            throw new HoodieException(ioe.getMessage(), ioe);
        }
    });

    // Add Rollback last version to next log-file
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
@Test
public void testUnMergedReader() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    final int numRecords = 1000;
    final int firstBatchLastRecordKey = numRecords - 1;
    final int secondBatchLastRecordKey = 2 * numRecords - 1;
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numRecords, commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // insert new records to log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime,
            newCommitTime, numRecords, numRecords, 0);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue("block - size should be > 0", size > 0);

    // create a split with baseFile (parquet file written earlier) and new log file(s)
    String logFilePath = writer.getLogFile().getPath().toString();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
            new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
            basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

    // create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
            new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();
    String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
    String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
    jobConf.set("partition_columns", "datestr");
    // Enable merge skipping.
    jobConf.set("hoodie.realtime.merge.skip", "true");

    // validate unmerged record reader
    RealtimeUnmergedRecordReader recordReader = new RealtimeUnmergedRecordReader(split, jobConf, reader);
    // use reader to read base Parquet File and log file
    // here all records should be present. Also ensure log records are in order.
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    int numRecordsAtCommit1 = 0;
    int numRecordsAtCommit2 = 0;
    Set<Integer> seenKeys = new HashSet<>();
    Integer lastSeenKeyFromLog = firstBatchLastRecordKey;
    while (recordReader.next(key, value)) {
        Writable[] values = value.get();
        String gotCommit = values[0].toString();
        String keyStr = values[2].toString();
        Integer gotKey = Integer.parseInt(keyStr.substring("key".length()));
        if (gotCommit.equals(newCommitTime)) {
            numRecordsAtCommit2++;
            Assert.assertTrue(gotKey > firstBatchLastRecordKey);
            Assert.assertTrue(gotKey <= secondBatchLastRecordKey);
            Assert.assertEquals(gotKey.intValue(), lastSeenKeyFromLog + 1);
            lastSeenKeyFromLog++;
        } else {
            numRecordsAtCommit1++;
            Assert.assertTrue(gotKey >= 0);
            Assert.assertTrue(gotKey <= firstBatchLastRecordKey);
        }
        // Ensure unique key
        Assert.assertFalse(seenKeys.contains(gotKey));
        seenKeys.add(gotKey);
        key = recordReader.createKey();
        value = recordReader.createValue();
    }
    Assert.assertEquals(numRecords, numRecordsAtCommit1);
    Assert.assertEquals(numRecords, numRecordsAtCommit2);
    Assert.assertEquals(2 * numRecords, seenKeys.size());
}
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
@Test
public void testReaderWithNestedAndComplexSchema() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getComplexEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    int numberOfRecords = 100;
    int numberOfLogRecords = numberOfRecords / 2;
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numberOfRecords, commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // update files or generate new log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime,
            numberOfLogRecords);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue("block - size should be > 0", size > 0);
    InputFormatTestUtil.deltaCommit(basePath, newCommitTime);

    // create a split with baseFile (parquet file written earlier) and new log file(s)
    String logFilePath = writer.getLogFile().getPath().toString();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
            new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
            basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

    // create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
            new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();
    String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(","));
    String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
    jobConf.set("partition_columns", "datestr");

    // validate record reader compaction
    HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
    // use reader to read base Parquet File and log file, merge in flight and return latest commit
    // here the first 50 records should be updated, see above
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    int numRecordsRead = 0;
    while (recordReader.next(key, value)) {
        int currentRecordNo = numRecordsRead;
        ++numRecordsRead;
        Writable[] values = value.get();
        String recordCommitTime;
        // check if the record written is with latest commit, here "101"
        if (numRecordsRead > numberOfLogRecords) {
            recordCommitTime = commitTime;
        } else {
            recordCommitTime = newCommitTime;
        }
        String recordCommitTimeSuffix = "@" + recordCommitTime;

        Assert.assertEquals(values[0].toString(), recordCommitTime);
        key = recordReader.createKey();
        value = recordReader.createValue();

        // Assert type STRING
        Assert.assertEquals("test value for field: field1", values[5].toString(), "field" + currentRecordNo);
        Assert.assertEquals("test value for field: field2", values[6].toString(),
                "field" + currentRecordNo + recordCommitTimeSuffix);
        Assert.assertEquals("test value for field: name", values[7].toString(), "name" + currentRecordNo);

        // Assert type INT
        IntWritable intWritable = (IntWritable) values[8];
        Assert.assertEquals("test value for field: favoriteIntNumber", intWritable.get(),
                currentRecordNo + recordCommitTime.hashCode());

        // Assert type LONG
        LongWritable longWritable = (LongWritable) values[9];
        Assert.assertEquals("test value for field: favoriteNumber", longWritable.get(),
                currentRecordNo + recordCommitTime.hashCode());

        // Assert type FLOAT
        FloatWritable floatWritable = (FloatWritable) values[10];
        Assert.assertEquals("test value for field: favoriteFloatNumber", floatWritable.get(),
                (float) ((currentRecordNo + recordCommitTime.hashCode()) / 1024.0), 0);

        // Assert type DOUBLE
        DoubleWritable doubleWritable = (DoubleWritable) values[11];
        Assert.assertEquals("test value for field: favoriteDoubleNumber", doubleWritable.get(),
                (currentRecordNo + recordCommitTime.hashCode()) / 1024.0, 0);

        // Assert type MAP
        ArrayWritable mapItem = (ArrayWritable) values[12];
        Writable mapItemValue1 = mapItem.get()[0];
        Writable mapItemValue2 = mapItem.get()[1];
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue1).get()[0].toString(),
                "mapItem1");
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue2).get()[0].toString(),
                "mapItem2");
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue1).get().length, 2);
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue2).get().length, 2);
        Writable mapItemValue1value = ((ArrayWritable) mapItemValue1).get()[1];
        Writable mapItemValue2value = ((ArrayWritable) mapItemValue2).get()[1];
        Assert.assertEquals("test value for field: tags[\"mapItem1\"].item1",
                ((ArrayWritable) mapItemValue1value).get()[0].toString(), "item" + currentRecordNo);
        Assert.assertEquals("test value for field: tags[\"mapItem2\"].item1",
                ((ArrayWritable) mapItemValue2value).get()[0].toString(), "item2" + currentRecordNo);
        Assert.assertEquals("test value for field: tags[\"mapItem1\"].item2",
                ((ArrayWritable) mapItemValue1value).get()[1].toString(),
                "item" + currentRecordNo + recordCommitTimeSuffix);
        Assert.assertEquals("test value for field: tags[\"mapItem2\"].item2",
                ((ArrayWritable) mapItemValue2value).get()[1].toString(),
                "item2" + currentRecordNo + recordCommitTimeSuffix);

        // Assert type RECORD
        ArrayWritable recordItem = (ArrayWritable) values[13];
        Writable[] nestedRecord = recordItem.get();
        Assert.assertEquals("test value for field: testNestedRecord.isAdmin",
                ((BooleanWritable) nestedRecord[0]).get(), false);
        Assert.assertEquals("test value for field: testNestedRecord.userId", nestedRecord[1].toString(),
                "UserId" + currentRecordNo + recordCommitTimeSuffix);

        // Assert type ARRAY
        ArrayWritable arrayValue = (ArrayWritable) values[14];
        Writable[] arrayValues = arrayValue.get();
        for (int i = 0; i < arrayValues.length; i++) {
            Assert.assertEquals("test value for field: stringArray",
                    "stringArray" + i + recordCommitTimeSuffix, arrayValues[i].toString());
        }
    }
}