List of usage examples for org.apache.hadoop.mapred.JobConf#get

public String get(String name)

Get the value of the name property, or null if no such property exists.
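Before the per-project examples below, a minimal standalone sketch of the call itself; the property name example.custom.key is an arbitrary placeholder chosen for illustration, not a real Hadoop setting:

import org.apache.hadoop.mapred.JobConf;

public class JobConfGetSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // get(name) returns null when the property was never set
        String missing = conf.get("example.custom.key");

        // the two-argument overload substitutes a default instead of null
        String withDefault = conf.get("example.custom.key", "fallback");

        // once the property is set, get(name) returns its value
        conf.set("example.custom.key", "configured");
        String present = conf.get("example.custom.key");

        System.out.println(missing + " / " + withDefault + " / " + present);
    }
}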
From source file: com.twitter.maple.hbase.mapred.TableInputFormat.java
License: Apache License

public void validateInput(JobConf job) throws IOException {
    // expecting exactly one path
    String tableName = TableInputFormat.getTableName(job);
    if (tableName == null) {
        throw new IOException("expecting one table name");
    }

    // connected to table?
    if (getHTable() == null) {
        throw new IOException("could not connect to table '" + tableName + "'");
    }

    // expecting at least one column
    String colArg = job.get(COLUMN_LIST);
    if (colArg == null || colArg.length() == 0) {
        throw new IOException("expecting at least one column");
    }
}
From source file: com.twitter.maple.hbase.mapred.TableInputFormat.java
License: Apache License

public static String getTableName(JobConf job) {
    return job.get(INPUT_TABLE);
}
From source file: com.twitter.maple.jdbc.JDBCTap.java
License: Open Source License

@Override
public void sinkConfInit(FlowProcess<JobConf> process, JobConf conf) {
    if (!isSink())
        return;

    // do not delete if initialized from within a task
    try {
        if (isReplace() && conf.get("mapred.task.partition") == null && !deleteResource(conf))
            throw new TapException("unable to drop table: " + tableDesc.getTableName());

        if (!createResource(conf))
            throw new TapException("unable to create table: " + tableDesc.getTableName());
    } catch (IOException e) {
        throw new TapException("error while trying to modify table: " + tableDesc.getTableName());
    }

    if (username == null)
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl);
    else
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl, username, password);

    super.sinkConfInit(process, conf);
}
From source file: com.twitter.meatlocker.jdbc.JDBCTap.java
License: Open Source License

@Override
public void sinkConfInit(HadoopFlowProcess process, JobConf conf) {
    if (!isSink())
        return;

    // do not delete if initialized from within a task
    try {
        if (isReplace() && conf.get("mapred.task.partition") == null && !deleteResource(conf))
            throw new TapException("unable to drop table: " + tableDesc.getTableName());

        if (!createResource(conf))
            throw new TapException("unable to create table: " + tableDesc.getTableName());
    } catch (IOException e) {
        throw new TapException("error while trying to modify table: " + tableDesc.getTableName());
    }

    if (username == null)
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl);
    else
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl, username, password);

    super.sinkConfInit(process, conf);
}
From source file: com.twitter.pig.backend.hadoop.executionengine.tez.TezExecutionEngine.java
License: Apache License

@SuppressWarnings("deprecation")
private void init(Properties properties) throws ExecException {
    // First set the ssh socket factory
    setSSHFactory();

    String cluster = null;
    String nameNode = null;

    // We need to build a configuration object first in the manner described below
    // and then get back a properties object to inspect the JOB_TRACKER_LOCATION
    // and FILE_SYSTEM_LOCATION. The reason to do this is if we looked only at
    // the existing properties object, we may not get the right settings. So we want
    // to read the configurations in the order specified below and only then look
    // for JOB_TRACKER_LOCATION and FILE_SYSTEM_LOCATION.
    // Hadoop by default specifies two resources, loaded in-order from the classpath:
    // 1. hadoop-default.xml : Read-only defaults for hadoop.
    // 2. hadoop-site.xml: Site-specific configuration for a given hadoop installation.
    // Now add the settings from "properties" object to override any existing properties
    // All of the above is accomplished in the method call below
    JobConf jc = null;
    if (this.pigContext.getExecType() == ExecType.TEZ) {
        // Check existence of user provided configs
        String isHadoopConfigsOverriden = properties.getProperty("pig.use.overriden.hadoop.configs");
        if (isHadoopConfigsOverriden != null && isHadoopConfigsOverriden.equals("true")) {
            jc = new JobConf(ConfigurationUtil.toConfiguration(properties));
        } else {
            // Check existence of hadoop-site.xml or core-site.xml in classpath
            // if user provided confs are not being used
            Configuration testConf = new Configuration();
            ClassLoader cl = testConf.getClassLoader();
            URL hadoop_site = cl.getResource(HADOOP_SITE);
            URL core_site = cl.getResource(CORE_SITE);

            if (hadoop_site == null && core_site == null) {
                throw new ExecException(
                        "Cannot find hadoop configurations in classpath (neither hadoop-site.xml nor core-site.xml was found in the classpath)."
                                + " If you plan to use local mode, please put -x local option in command line",
                        4010);
            }
            jc = new JobConf();
        }
        jc.addResource("pig-cluster-hadoop-site.xml");
        jc.addResource(YARN_SITE);

        /*
        // Trick to invoke static initializer of DistributedFileSystem to add hdfs-default.xml
        // into configuration
        new DistributedFileSystem();
        */

        // the method below alters the properties object by overriding the
        // hadoop properties with the values from properties and recomputing
        // the properties
        recomputeProperties(jc, properties);
    } else {
        // If we are running in local mode we dont read the hadoop conf file
        if (properties.getProperty("mapreduce.framework.name") == null) {
            properties.setProperty("mapreduce.framework.name", "local");
        }
        properties.setProperty(JOB_TRACKER_LOCATION, LOCAL);
        properties.setProperty(FILE_SYSTEM_LOCATION, "file:///");
        properties.setProperty(ALTERNATIVE_FILE_SYSTEM_LOCATION, "file:///");

        jc = new JobConf(false);
        jc.addResource("core-default.xml");
        jc.addResource("mapred-default.xml");
        jc.addResource("yarn-default.xml");
        recomputeProperties(jc, properties);
    }

    cluster = jc.get(JOB_TRACKER_LOCATION);
    nameNode = jc.get(FILE_SYSTEM_LOCATION);
    if (nameNode == null)
        nameNode = (String) pigContext.getProperties().get(ALTERNATIVE_FILE_SYSTEM_LOCATION);

    if (cluster != null && cluster.length() > 0) {
        if (!cluster.contains(":") && !cluster.equalsIgnoreCase(LOCAL)) {
            cluster = cluster + ":50020";
        }
        properties.setProperty(JOB_TRACKER_LOCATION, cluster);
    }

    if (nameNode != null && nameNode.length() > 0) {
        if (!nameNode.contains(":") && !nameNode.equalsIgnoreCase(LOCAL)) {
            nameNode = nameNode + ":8020";
        }
        properties.setProperty(FILE_SYSTEM_LOCATION, nameNode);
    }

    log.info("Connecting to hadoop file system at: " + (nameNode == null ? LOCAL : nameNode));

    // constructor sets DEFAULT_REPLICATION_FACTOR_KEY
    ds = new HDataStorage(properties);

    if (cluster != null && !cluster.equalsIgnoreCase(LOCAL)) {
        log.info("Connecting to map-reduce job tracker at: " + jc.get(JOB_TRACKER_LOCATION));
    }

    // Set job-specific configuration knobs
    jobConf = jc;
}
From source file: com.twitter.pycascading.CascadingBaseOperationWrapper.java
License: Apache License

private PythonInterpreter setupInterpreter(JobConf jobConf, FlowProcess flowProcess) {
    String pycascadingDir = null;
    String sourceDir = null;
    String[] modulePaths = null;

    if ("hadoop".equals(jobConf.get("pycascading.running_mode"))) {
        try {
            Path[] archives = DistributedCache.getLocalCacheArchives(jobConf);
            pycascadingDir = archives[0].toString() + "/";
            sourceDir = archives[1].toString() + "/";
            modulePaths = new String[archives.length];
            int i = 0;
            for (Path archive : archives) {
                modulePaths[i++] = archive.toString();
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    } else {
        pycascadingDir = System.getProperty("pycascading.root") + "/";
        sourceDir = "";
        modulePaths = new String[] { pycascadingDir, sourceDir };
    }

    PythonInterpreter interpreter = Main.getInterpreter();
    interpreter.execfile(pycascadingDir + "python/pycascading/init_module.py");
    interpreter.set("module_paths", modulePaths);
    interpreter.eval("setup_paths(module_paths)");

    // We set the Python variable "map_input_file" to the path to the mapper input file
    // But this is unfortunately null with the old Hadoop API, see
    // https://groups.google.com/group/cascading-user/browse_thread/thread/d65960ad738bebd4/f343e91625cf3c07
    // http://lucene.472066.n3.nabble.com/map-input-file-in-20-1-td961619.html
    // https://issues.apache.org/jira/browse/MAPREDUCE-2166
    interpreter.set("map_input_file", jobConf.get("map.input.file"));

    // We set the Python variable "jobconf" to the MR jobconf
    interpreter.set("jobconf", jobConf);

    // The flowProcess passed to the Operation is passed on to the Python
    // function in the variable flow_process
    interpreter.set("flow_process", flowProcess);

    // We need to run the main file first so that imports etc. are defined,
    // and nested functions can also be used
    interpreter.execfile(sourceDir + (String) jobConf.get("pycascading.main_file"));

    return interpreter;
}
From source file: com.uber.hoodie.hadoop.realtime.AbstractRealtimeRecordReader.java
License: Apache License

public AbstractRealtimeRecordReader(HoodieRealtimeFileSplit split, JobConf job) {
    this.split = split;
    this.jobConf = job;
    LOG.info("cfg ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR));
    LOG.info("columnIds ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
    LOG.info("partitioningColumns ==> " + job.get("partition_columns", ""));
    try {
        this.usesCustomPayload = usesCustomPayload();
        LOG.info("usesCustomPayload ==> " + this.usesCustomPayload);
        baseFileSchema = readSchema(jobConf, split.getPath());
        init();
    } catch (IOException e) {
        throw new HoodieIOException(
                "Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e);
    }
}
From source file: com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.java
License: Apache License

@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf job,
        final Reporter reporter) throws IOException {
    LOG.info("Before adding Hoodie columns, Projections :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));

    // Hive (across all versions) fails for queries like select count(`_hoodie_commit_time`) from table;
    // in this case, the projection fields get removed. Looking at the HiveInputFormat implementation, in some cases
    // hoodie additional projection columns are reset after calling setConf and only natural projections
    // (ones found in select queries) are set. Things would break because of this.
    // For example, _hoodie_record_key would be missing and the merge step would throw exceptions.
    // To fix this, hoodie columns are appended late at the time the record reader gets built instead of construction time.
    this.conf = addRequiredProjectionFields(job);

    LOG.info("Creating record reader with readCols :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));

    // sanity check
    Preconditions.checkArgument(split instanceof HoodieRealtimeFileSplit,
            "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + split);

    // Reset the original column ids and names
    job.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, READ_COLUMN_IDS);
    job.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, READ_COLUMN_NAMES);

    return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, job,
            super.getRecordReader(split, job, reporter));
}
From source file: com.yahoo.druid.hadoop.HiveDatasourceInputFormat.java
License: Apache License

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    logger.info("checkPost #5");

    String overlordUrl = jobConf.get(CONF_DRUID_OVERLORD_HOSTPORT);
    Preconditions.checkArgument(overlordUrl != null && !overlordUrl.isEmpty(),
            CONF_DRUID_OVERLORD_HOSTPORT + " not defined");
    logger.info("druid overlord url = " + overlordUrl);

    String schemaStr = jobConf.get(CONF_DRUID_SCHEMA);
    Preconditions.checkArgument(schemaStr != null && !schemaStr.isEmpty(),
            "schema undefined, provide " + CONF_DRUID_SCHEMA);
    logger.info("schema = " + schemaStr);

    DatasourceIngestionSpec ingestionSpec = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(schemaStr,
            DatasourceIngestionSpec.class);
    String segmentsStr = getSegmentsToLoad(ingestionSpec.getDataSource(), ingestionSpec.getIntervals(),
            overlordUrl);
    logger.info("segments list received from overlord = " + segmentsStr);

    List<DataSegment> segmentsList = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(segmentsStr,
            new TypeReference<List<DataSegment>>() {
            });
    VersionedIntervalTimeline<String, DataSegment> timeline = new VersionedIntervalTimeline<>(
            Ordering.natural());
    for (DataSegment segment : segmentsList) {
        timeline.add(segment.getInterval(), segment.getVersion(), segment.getShardSpec().createChunk(segment));
    }
    final List<TimelineObjectHolder<String, DataSegment>> timeLineSegments = timeline
            .lookup(ingestionSpec.getIntervals().get(0));
    final List<WindowedDataSegment> windowedSegments = new ArrayList<>();
    for (TimelineObjectHolder<String, DataSegment> holder : timeLineSegments) {
        for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
            windowedSegments.add(new WindowedDataSegment(chunk.getObject(), holder.getInterval()));
        }
    }
    jobConf.set(CONF_INPUT_SEGMENTS,
            HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(windowedSegments));

    segmentsStr = Preconditions.checkNotNull(jobConf.get(CONF_INPUT_SEGMENTS), "No segments found to read");
    List<WindowedDataSegment> segments = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(segmentsStr,
            new TypeReference<List<WindowedDataSegment>>() {
            });
    if (segments == null || segments.size() == 0) {
        throw new ISE("No segments found to read");
    }
    logger.info("segments to read " + segmentsStr);

    long maxSize = numSplits;

    if (maxSize > 0) {
        // combining is to happen, let us sort the segments list by size so that
        // they are combined appropriately
        Collections.sort(segments, new Comparator<WindowedDataSegment>() {
            @Override
            public int compare(WindowedDataSegment s1, WindowedDataSegment s2) {
                return Long.compare(s1.getSegment().getSize(), s2.getSegment().getSize());
            }
        });
    }

    List<InputSplit> splits = Lists.newArrayList();

    List<WindowedDataSegment> list = new ArrayList<>();
    long size = 0;

    // JobConf dummyConf = new JobConf();
    Job job = new Job(jobConf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] paths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(jobContext);
    logger.info("dummyPath : " + paths);

    jobConf.set("druid.hive.dummyfilename", paths[0].toString());

    InputFormat fio = supplier.get();
    for (WindowedDataSegment segment : segments) {
        if (size + segment.getSegment().getSize() > maxSize && size > 0) {
            splits.add(toDataSourceSplit(list, fio, jobConf, paths[0]));
            list = Lists.newArrayList();
            size = 0;
        }
        list.add(segment);
        size += segment.getSegment().getSize();
    }

    if (list.size() > 0) {
        splits.add(toDataSourceSplit(list, fio, jobConf, paths[0]));
    }

    logger.info("Number of splits: " + splits.size());
    for (InputSplit split : splits) {
        logger.info(split.getClass().getName());
        for (String location : split.getLocations())
            logger.info(location);
    }
    return Iterables.toArray(splits, InputSplit.class);
}
From source file: com.yolodata.tbana.cascading.splunk.SplunkTapTest.java
License: Open Source License

@Test
public void testSetConfKey() {
    String key = "Some.Unique.key";
    String value = "SomeValue";

    Properties propsWithKey = new Properties();
    propsWithKey.put(key, value);

    SplunkTap tap = new SplunkTap(propsWithKey, inputScheme);

    JobConf conf = new JobConf();
    assertEquals(null, conf.get(key));

    tap.setConfKey(conf, key);
    assertEquals(value, conf.get(key));
}