Example usage for org.apache.hadoop.mapred JobConf get

List of usage examples for org.apache.hadoop.mapred JobConf get

Introduction

On this page you can find usage examples for the get method of org.apache.hadoop.mapred.JobConf.

Prototype

public String get(String name) 

Document

Get the value of the name property, or null if no such property exists.
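
As a quick orientation before the project examples below, here is a minimal, hypothetical sketch of the call itself; the property name "my.example.key" and the class name JobConfGetSketch are made up for illustration and are not part of Hadoop.

import org.apache.hadoop.mapred.JobConf;

public class JobConfGetSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // set a property, then read it back
        conf.set("my.example.key", "42");
        String present = conf.get("my.example.key"); // "42"

        // get(String) returns null for a property that was never set
        String missing = conf.get("no.such.key");
        if (missing == null) {
            System.out.println("no.such.key is not configured");
        }

        System.out.println("my.example.key = " + present);
    }
}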

Usage

From source file:com.twitter.maple.hbase.mapred.TableInputFormat.java

License:Apache License

public void validateInput(JobConf job) throws IOException {
    // expecting exactly one path
    String tableName = TableInputFormat.getTableName(job);
    if (tableName == null) {
        throw new IOException("expecting one table name");
    }

    // connected to table?
    if (getHTable() == null) {
        throw new IOException("could not connect to table '" + tableName + "'");
    }

    // expecting at least one column
    String colArg = job.get(COLUMN_LIST);
    if (colArg == null || colArg.length() == 0) {
        throw new IOException("expecting at least one column");
    }
}

From source file:com.twitter.maple.hbase.mapred.TableInputFormat.java

License:Apache License

public static String getTableName(JobConf job) {
    return job.get(INPUT_TABLE);
}

From source file:com.twitter.maple.jdbc.JDBCTap.java

License:Open Source License

@Override
public void sinkConfInit(FlowProcess<JobConf> process, JobConf conf) {
    if (!isSink())
        return;

    // do not delete if initialized from within a task
    try {
        if (isReplace() && conf.get("mapred.task.partition") == null && !deleteResource(conf))
            throw new TapException("unable to drop table: " + tableDesc.getTableName());

        if (!createResource(conf))
            throw new TapException("unable to create table: " + tableDesc.getTableName());
    } catch (IOException e) {
        throw new TapException("error while trying to modify table: " + tableDesc.getTableName());
    }

    if (username == null)
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl);
    else
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl, username, password);

    super.sinkConfInit(process, conf);
}

From source file:com.twitter.meatlocker.jdbc.JDBCTap.java

License:Open Source License

@Override
public void sinkConfInit(HadoopFlowProcess process, JobConf conf) {
    if (!isSink())
        return;

    // do not delete if initialized from within a task
    try {
        if (isReplace() && conf.get("mapred.task.partition") == null && !deleteResource(conf))
            throw new TapException("unable to drop table: " + tableDesc.getTableName());

        if (!createResource(conf))
            throw new TapException("unable to create table: " + tableDesc.getTableName());
    } catch (IOException e) {
        throw new TapException("error while trying to modify table: " + tableDesc.getTableName());
    }

    if (username == null)
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl);
    else
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl, username, password);

    super.sinkConfInit(process, conf);
}

From source file:com.twitter.pig.backend.hadoop.executionengine.tez.TezExecutionEngine.java

License:Apache License

@SuppressWarnings("deprecation")
private void init(Properties properties) throws ExecException {
    //First set the ssh socket factory
    setSSHFactory();

    String cluster = null;
    String nameNode = null;

    // We need to build a configuration object first in the manner described below
    // and then get back a properties object to inspect the JOB_TRACKER_LOCATION
    // and FILE_SYSTEM_LOCATION. The reason to do this is if we looked only at
    // the existing properties object, we may not get the right settings. So we want
    // to read the configurations in the order specified below and only then look
    // for JOB_TRACKER_LOCATION and FILE_SYSTEM_LOCATION.

    // Hadoop by default specifies two resources, loaded in-order from the classpath:
    // 1. hadoop-default.xml : Read-only defaults for hadoop.
    // 2. hadoop-site.xml: Site-specific configuration for a given hadoop installation.
    // Now add the settings from "properties" object to override any existing properties
    // All of the above is accomplished in the method call below

    JobConf jc = null;
    if (this.pigContext.getExecType() == ExecType.TEZ) {
        // Check existence of user provided configs
        String isHadoopConfigsOverriden = properties.getProperty("pig.use.overriden.hadoop.configs");
        if (isHadoopConfigsOverriden != null && isHadoopConfigsOverriden.equals("true")) {
            jc = new JobConf(ConfigurationUtil.toConfiguration(properties));
        } else {
            // Check existence of hadoop-site.xml or core-site.xml in classpath
            // if user provided confs are not being used
            Configuration testConf = new Configuration();
            ClassLoader cl = testConf.getClassLoader();
            URL hadoop_site = cl.getResource(HADOOP_SITE);
            URL core_site = cl.getResource(CORE_SITE);

            if (hadoop_site == null && core_site == null) {
                throw new ExecException(
                        "Cannot find hadoop configurations in classpath (neither hadoop-site.xml nor core-site.xml was found in the classpath)."
                                + " If you plan to use local mode, please put -x local option in command line",
                        4010);
            }
            jc = new JobConf();
        }
        jc.addResource("pig-cluster-hadoop-site.xml");
        jc.addResource(YARN_SITE);

        /*
        // Trick to invoke static initializer of DistributedFileSystem to add hdfs-default.xml 
        // into configuration
        new DistributedFileSystem();
        */
        //the method below alters the properties object by overriding the
        //hadoop properties with the values from properties and recomputing
        //the properties
        recomputeProperties(jc, properties);
    } else {
        // If we are running in local mode we don't read the hadoop conf file
        if (properties.getProperty("mapreduce.framework.name") == null) {
            properties.setProperty("mapreduce.framework.name", "local");
        }
        properties.setProperty(JOB_TRACKER_LOCATION, LOCAL);
        properties.setProperty(FILE_SYSTEM_LOCATION, "file:///");
        properties.setProperty(ALTERNATIVE_FILE_SYSTEM_LOCATION, "file:///");

        jc = new JobConf(false);
        jc.addResource("core-default.xml");
        jc.addResource("mapred-default.xml");
        jc.addResource("yarn-default.xml");
        recomputeProperties(jc, properties);
    }

    cluster = jc.get(JOB_TRACKER_LOCATION);
    nameNode = jc.get(FILE_SYSTEM_LOCATION);
    if (nameNode == null)
        nameNode = (String) pigContext.getProperties().get(ALTERNATIVE_FILE_SYSTEM_LOCATION);

    if (cluster != null && cluster.length() > 0) {
        if (!cluster.contains(":") && !cluster.equalsIgnoreCase(LOCAL)) {
            cluster = cluster + ":50020";
        }
        properties.setProperty(JOB_TRACKER_LOCATION, cluster);
    }

    if (nameNode != null && nameNode.length() > 0) {
        if (!nameNode.contains(":") && !nameNode.equalsIgnoreCase(LOCAL)) {
            nameNode = nameNode + ":8020";
        }
        properties.setProperty(FILE_SYSTEM_LOCATION, nameNode);
    }

    log.info("Connecting to hadoop file system at: " + (nameNode == null ? LOCAL : nameNode));
    // constructor sets DEFAULT_REPLICATION_FACTOR_KEY
    ds = new HDataStorage(properties);

    if (cluster != null && !cluster.equalsIgnoreCase(LOCAL)) {
        log.info("Connecting to map-reduce job tracker at: " + jc.get(JOB_TRACKER_LOCATION));
    }

    // Set job-specific configuration knobs
    jobConf = jc;
}

From source file:com.twitter.pycascading.CascadingBaseOperationWrapper.java

License:Apache License

private PythonInterpreter setupInterpreter(JobConf jobConf, FlowProcess flowProcess) {
    String pycascadingDir = null;
    String sourceDir = null;
    String[] modulePaths = null;
    if ("hadoop".equals(jobConf.get("pycascading.running_mode"))) {
        try {
            Path[] archives = DistributedCache.getLocalCacheArchives(jobConf);
            pycascadingDir = archives[0].toString() + "/";
            sourceDir = archives[1].toString() + "/";
            modulePaths = new String[archives.length];
            int i = 0;
            for (Path archive : archives) {
                modulePaths[i++] = archive.toString();
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    } else {
        pycascadingDir = System.getProperty("pycascading.root") + "/";
        sourceDir = "";
        modulePaths = new String[] { pycascadingDir, sourceDir };
    }
    PythonInterpreter interpreter = Main.getInterpreter();
    interpreter.execfile(pycascadingDir + "python/pycascading/init_module.py");
    interpreter.set("module_paths", modulePaths);
    interpreter.eval("setup_paths(module_paths)");

    // We set the Python variable "map_input_file" to the path of the mapper
    // input file. But this is unfortunately null with the old Hadoop API, see
    // https://groups.google.com/group/cascading-user/browse_thread/thread/d65960ad738bebd4/f343e91625cf3c07
    // http://lucene.472066.n3.nabble.com/map-input-file-in-20-1-td961619.html
    // https://issues.apache.org/jira/browse/MAPREDUCE-2166
    interpreter.set("map_input_file", jobConf.get("map.input.file"));

    // We set the Python variable "jobconf" to the MR jobconf
    interpreter.set("jobconf", jobConf);

    // The flowProcess passed to the Operation is passed on to the Python
    // function in the variable flow_process
    interpreter.set("flow_process", flowProcess);

    // We need to run the main file first so that imports etc. are defined,
    // and nested functions can also be used
    interpreter.execfile(sourceDir + (String) jobConf.get("pycascading.main_file"));
    return interpreter;
}

From source file:com.uber.hoodie.hadoop.realtime.AbstractRealtimeRecordReader.java

License:Apache License

public AbstractRealtimeRecordReader(HoodieRealtimeFileSplit split, JobConf job) {
    this.split = split;
    this.jobConf = job;
    LOG.info("cfg ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR));
    LOG.info("columnIds ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
    LOG.info("partitioningColumns ==> " + job.get("partition_columns", ""));
    try {
        this.usesCustomPayload = usesCustomPayload();
        LOG.info("usesCustomPayload ==> " + this.usesCustomPayload);
        baseFileSchema = readSchema(jobConf, split.getPath());
        init();
    } catch (IOException e) {
        throw new HoodieIOException(
                "Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e);
    }
}

From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.java

License:Apache License

@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf job,
        final Reporter reporter) throws IOException {

    LOG.info("Before adding Hoodie columns, Projections :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + ", Ids :"
            + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));

    // Hive (across all versions) fails for queries like select count(`_hoodie_commit_time`) from table;
    // in this case, the projection fields get removed. Looking at the HiveInputFormat implementation, in some cases
    // the additional hoodie projection columns are reset after calling setConf and only natural projections
    // (the ones found in select queries) are set. Things would break because of this:
    // for example, _hoodie_record_key would be missing and the merge step would throw exceptions.
    // To fix this, hoodie columns are appended late, when the record reader is built, instead of at construction time.
    this.conf = addRequiredProjectionFields(job);

    LOG.info(
            "Creating record reader with readCols :" + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
                    + ", Ids :" + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
    // sanity check
    Preconditions.checkArgument(split instanceof HoodieRealtimeFileSplit,
            "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + split);

    // Reset the original column ids and names
    job.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, READ_COLUMN_IDS);
    job.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, READ_COLUMN_NAMES);

    return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, job,
            super.getRecordReader(split, job, reporter));
}

From source file:com.yahoo.druid.hadoop.HiveDatasourceInputFormat.java

License:Apache License

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    logger.info("checkPost #5");

    String overlordUrl = jobConf.get(CONF_DRUID_OVERLORD_HOSTPORT);
    Preconditions.checkArgument(overlordUrl != null && !overlordUrl.isEmpty(),
            CONF_DRUID_OVERLORD_HOSTPORT + " not defined");

    logger.info("druid overlord url = " + overlordUrl);

    String schemaStr = jobConf.get(CONF_DRUID_SCHEMA);

    Preconditions.checkArgument(schemaStr != null && !schemaStr.isEmpty(),
            "schema undefined,  provide " + CONF_DRUID_SCHEMA);
    logger.info("schema = " + schemaStr);

    DatasourceIngestionSpec ingestionSpec = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(schemaStr,
            DatasourceIngestionSpec.class);
    String segmentsStr = getSegmentsToLoad(ingestionSpec.getDataSource(), ingestionSpec.getIntervals(),
            overlordUrl);
    logger.info("segments list received from overlord = " + segmentsStr);

    List<DataSegment> segmentsList = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(segmentsStr,
            new TypeReference<List<DataSegment>>() {
            });
    VersionedIntervalTimeline<String, DataSegment> timeline = new VersionedIntervalTimeline<>(
            Ordering.natural());
    for (DataSegment segment : segmentsList) {
        timeline.add(segment.getInterval(), segment.getVersion(), segment.getShardSpec().createChunk(segment));
    }
    final List<TimelineObjectHolder<String, DataSegment>> timeLineSegments = timeline
            .lookup(ingestionSpec.getIntervals().get(0));
    final List<WindowedDataSegment> windowedSegments = new ArrayList<>();
    for (TimelineObjectHolder<String, DataSegment> holder : timeLineSegments) {
        for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
            windowedSegments.add(new WindowedDataSegment(chunk.getObject(), holder.getInterval()));
        }
    }

    jobConf.set(CONF_INPUT_SEGMENTS, HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(windowedSegments));

    segmentsStr = Preconditions.checkNotNull(jobConf.get(CONF_INPUT_SEGMENTS), "No segments found to read");
    List<WindowedDataSegment> segments = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(segmentsStr,
            new TypeReference<List<WindowedDataSegment>>() {
            });
    if (segments == null || segments.size() == 0) {
        throw new ISE("No segments found to read");
    }

    logger.info("segments to read " + segmentsStr);

    long maxSize = numSplits;

    if (maxSize > 0) {
        // combining is to happen, let us sort the segments list by size
        // so that they are combined appropriately
        Collections.sort(segments, new Comparator<WindowedDataSegment>() {
            @Override
            public int compare(WindowedDataSegment s1, WindowedDataSegment s2) {
                return Long.compare(s1.getSegment().getSize(), s2.getSegment().getSize());
            }
        });
    }

    List<InputSplit> splits = Lists.newArrayList();

    List<WindowedDataSegment> list = new ArrayList<>();
    long size = 0;

    // JobConf dummyConf = new JobConf();
    Job job = new Job(jobConf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] paths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(jobContext);
    logger.info("dummyPath : " + paths);

    jobConf.set("druid.hive.dummyfilename", paths[0].toString());

    InputFormat fio = supplier.get();
    for (WindowedDataSegment segment : segments) {
        if (size + segment.getSegment().getSize() > maxSize && size > 0) {
            splits.add(toDataSourceSplit(list, fio, jobConf, paths[0]));
            list = Lists.newArrayList();
            size = 0;
        }

        list.add(segment);
        size += segment.getSegment().getSize();
    }

    if (list.size() > 0) {
        splits.add(toDataSourceSplit(list, fio, jobConf, paths[0]));
    }

    logger.info("Number of splits: " + splits.size());
    for (InputSplit split : splits) {
        logger.info(split.getClass().getName());
        for (String location : split.getLocations())
            logger.info(location);
    }
    return Iterables.toArray(splits, InputSplit.class);
}

From source file:com.yolodata.tbana.cascading.splunk.SplunkTapTest.java

License:Open Source License

@Test
public void testSetConfKey() {

    String key = "Some.Unique.key";
    String value = "SomeValue";

    Properties propsWithKey = new Properties();
    propsWithKey.put(key, value);

    SplunkTap tap = new SplunkTap(propsWithKey, inputScheme);
    JobConf conf = new JobConf();

    assertEquals(null, conf.get(key));
    tap.setConfKey(conf, key);
    assertEquals(value, conf.get(key));
}