Example usage for org.apache.hadoop.mapred JobConf set

Introduction

On this page you can find usage examples for the org.apache.hadoop.mapred JobConf.set method, collected from open source projects.

Prototype

public void set(String name, String value) 

Document

Set the value of the name property.
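For orientation, here is a minimal, self-contained sketch of the method in isolation (the property name and value are illustrative, not taken from the examples below):

import org.apache.hadoop.mapred.JobConf;

public class JobConfSetExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // set(String name, String value) stores a string-valued property
        conf.set("example.app.name", "demo");

        // the value can be read back with get(String name)
        System.out.println(conf.get("example.app.name")); // prints "demo"
    }
}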

Usage

From source file: cascading.flow.FlowStep.java

License: Open Source License

private void addComparators(JobConf conf, String property, Map<String, Fields> map) throws IOException {
    Iterator<Fields> fieldsIterator = map.values().iterator();

    if (!fieldsIterator.hasNext())
        return;

    Fields fields = fieldsIterator.next();

    if (fields.hasComparators()) {
        conf.set(property, Util.serializeBase64(fields));
        return;
    }

    // use resolved fields if there are no comparators.
    Set<Scope> previousScopes = getPreviousScopes(getGroup());

    fields = previousScopes.iterator().next().getOutValuesFields();

    if (fields.size() != 0) // allows fields.UNKNOWN to be used
        conf.setInt(property + ".size", fields.size());

    return;
}

From source file: cascading.flow.hadoop.HadoopFlow.java

License: Open Source License

@Override
protected void setConfigProperty(JobConf config, Object key, Object value) {
    // don't let these objects pass, even though toString is called below.
    if (value instanceof Class || value instanceof JobConf)
        return;

    config.set(key.toString(), value.toString());
}

From source file: cascading.flow.hadoop.HadoopFlowStep.java

License: Open Source License

public JobConf createInitializedConfig(FlowProcess<JobConf> flowProcess, JobConf parentConfig) {
    JobConf conf = parentConfig == null ? new JobConf() : HadoopUtil.copyJobConf(parentConfig);

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepDisplayName(conf.getInt("cascading.display.id.truncate", Util.ID_LENGTH)));

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapRunnerClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(flowProcess, conf);

    initFromSink(flowProcess, conf);

    initFromTraps(flowProcess, conf);

    initFromStepConfigDef(conf);

    int numSinkParts = getSink().getScheme().getNumSinkParts();

    if (numSinkParts != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(numSinkParts);
        else
            conf.setNumMapTasks(numSinkParts);
    } else if (getGroup() != null) {
        int gatherPartitions = conf.getNumReduceTasks();

        if (gatherPartitions == 0)
            gatherPartitions = conf.getInt(FlowRuntimeProps.GATHER_PARTITIONS, 0);

        if (gatherPartitions == 0)
            throw new FlowException(getName(),
                    "a default number of gather partitions must be set, see FlowRuntimeProps");

        conf.setNumReduceTasks(gatherPartitions);
    }

    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);
        conf.setPartitionerClass(GroupingPartitioner.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getKeySelectors(), this, getGroup());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors(), this,
                    getGroup());

        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingSortingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    String versionString = Version.getRelease();

    if (versionString != null)
        conf.set("cascading.version", versionString);

    conf.set(CASCADING_FLOW_STEP_ID, getID());
    conf.set("cascading.flow.step.num", Integer.toString(getOrdinal()));

    HadoopUtil.setIsInflow(conf);

    Iterator<FlowNode> iterator = getFlowNodeGraph().getTopologicalIterator();

    String mapState = pack(iterator.next(), conf);
    String reduceState = pack(iterator.hasNext() ? iterator.next() : null, conf);

    // hadoop 20.2 doesn't like dist cache when using local mode
    int maxSize = Short.MAX_VALUE;

    int length = mapState.length() + reduceState.length();

    if (isHadoopLocalMode(conf) || length < maxSize) // seems safe
    {
        conf.set("cascading.flow.step.node.map", mapState);

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce", reduceState);
    } else {
        conf.set("cascading.flow.step.node.map.path",
                HadoopMRUtil.writeStateToDistCache(conf, getID(), "map", mapState));

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce.path",
                    HadoopMRUtil.writeStateToDistCache(conf, getID(), "reduce", reduceState));
    }

    return conf;
}

From source file: cascading.flow.hadoop.HadoopFlowStep.java

License: Open Source License

protected void initFromSources(FlowProcess<JobConf> flowProcess, JobConf conf) {
    // handles case where same tap is used on multiple branches
    // we do not want to init the same tap multiple times
    Set<Tap> uniqueSources = getUniqueStreamedSources();

    JobConf[] streamedJobs = new JobConf[uniqueSources.size()];
    int i = 0;

    for (Tap tap : uniqueSources) {
        if (tap.getIdentifier() == null)
            throw new IllegalStateException("tap may not have null identifier: " + tap.toString());

        streamedJobs[i] = flowProcess.copyConfig(conf);

        streamedJobs[i].set("cascading.step.source", Tap.id(tap));

        tap.sourceConfInit(flowProcess, streamedJobs[i]);

        i++;
    }

    Set<Tap> accumulatedSources = getAllAccumulatedSources();

    for (Tap tap : accumulatedSources) {
        JobConf accumulatedJob = flowProcess.copyConfig(conf);

        tap.sourceConfInit(flowProcess, accumulatedJob);

        Map<String, String> map = flowProcess.diffConfigIntoMap(conf, accumulatedJob);
        conf.set("cascading.node.accumulated.source.conf." + Tap.id(tap), pack(map, conf));

        try {
            if (DistributedCache.getCacheFiles(accumulatedJob) != null)
                DistributedCache.setCacheFiles(DistributedCache.getCacheFiles(accumulatedJob), conf);
        } catch (IOException exception) {
            throw new CascadingException(exception);
        }
    }

    MultiInputFormat.addInputFormat(conf, streamedJobs); //must come last
}

From source file: cascading.flow.hadoop.HadoopUtil.java

License: Open Source License

public static JobConf createJobConf(Map<Object, Object> properties, JobConf defaultJobconf) {
    JobConf jobConf = defaultJobconf == null ? new JobConf() : new JobConf(defaultJobconf);

    if (properties == null)
        return jobConf;

    Set<Object> keys = new HashSet<Object>(properties.keySet());

    // keys will only be grabbed if both key/value are String, so keep orig keys
    if (properties instanceof Properties)
        keys.addAll(((Properties) properties).stringPropertyNames());

    for (Object key : keys) {
        Object value = properties.get(key);

        if (value == null && properties instanceof Properties && key instanceof String)
            value = ((Properties) properties).getProperty((String) key);

        if (value == null) // don't stuff null values
            continue;

        // don't let these objects pass, even though toString is called below.
        if (value instanceof Class || value instanceof JobConf)
            continue;

        jobConf.set(key.toString(), value.toString());
    }

    return jobConf;
}
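
A minimal sketch of invoking the helper above (the property name and value are illustrative):

    Properties properties = new Properties();
    properties.setProperty("mapred.reduce.tasks", "4");

    JobConf jobConf = HadoopUtil.createJobConf(properties, new JobConf());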

From source file: cascading.flow.hadoop.planner.HadoopPlanner.java

License: Open Source License

/**
 * Method copyProperties adds the given Map values to the given JobConf object.
 *
 * @param jobConf    of type JobConf
 * @param properties of type Map
 */
public static void copyProperties(JobConf jobConf, Map<Object, Object> properties) {
    if (properties instanceof Properties) {
        Properties props = (Properties) properties;
        Set<String> keys = props.stringPropertyNames();

        for (String key : keys)
            jobConf.set(key, props.getProperty(key));
    } else {
        for (Map.Entry<Object, Object> entry : properties.entrySet()) {
            if (entry.getValue() != null)
                jobConf.set(entry.getKey().toString(), entry.getValue().toString());
        }
    }
}
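
A minimal sketch of invoking copyProperties (the property name and value are illustrative):

    JobConf jobConf = new JobConf();

    Properties properties = new Properties();
    properties.setProperty("cascading.app.name", "example-app");

    HadoopPlanner.copyProperties(jobConf, properties);
    // jobConf.get("cascading.app.name") now returns "example-app"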

From source file: cascading.flow.stack.StackElement.java

License: Open Source License

private static TapCollector getTrapCollector(Tap trap, JobConf jobConf) {
    TapCollector trapCollector = trapCollectors.get(trap);

    if (trapCollector == null) {
        try {
            jobConf = new JobConf(jobConf);

            int id = jobConf.getInt("cascading.flow.step.id", 0);
            String partname;

            if (jobConf.getBoolean("mapred.task.is.map", true))
                partname = String.format("-m-%05d-", id);
            else
                partname = String.format("-r-%05d-", id);

            jobConf.set("cascading.tapcollector.partname", "%s%spart" + partname + "%05d");

            trapCollector = (TapCollector) trap.openForWrite(jobConf);
            trapCollectors.put(trap, trapCollector);
        } catch (IOException exception) {
            throw new StackException(exception);
        }
    }

    return trapCollector;
}

From source file: cascading.flow.tez.Hadoop2TezFlowStep.java

License: Open Source License

public Vertex createVertex(FlowProcess<TezConfiguration> flowProcess, TezConfiguration initializedConfig,
        FlowNode flowNode) {
    JobConf conf = new JobConf(initializedConfig);

    addInputOutputMapping(conf, flowNode);

    conf.setBoolean("mapred.used.genericoptionsparser", true);

    Map<String, LocalResource> taskLocalResources = new HashMap<>();

    Map<FlowElement, Configuration> sourceConfigs = initFromSources(flowNode, flowProcess, conf,
            taskLocalResources);
    Map<FlowElement, Configuration> sinkConfigs = initFromSinks(flowNode, flowProcess, conf);

    initFromTraps(flowNode, flowProcess, conf);

    initFromNodeConfigDef(flowNode, conf);

    // force step to local mode if any tap is local
    setLocalMode(initializedConfig, conf, null);

    conf.set("cascading.flow.node.num", Integer.toString(flowNode.getOrdinal()));

    int parallelism = getParallelism(flowNode, conf);

    if (parallelism == 0)
        throw new FlowException(getName(),
                "the default number of gather partitions must be set, see cascading.flow.FlowRuntimeProps");

    Vertex vertex = newVertex(flowNode, conf, parallelism);

    if (!taskLocalResources.isEmpty())
        vertex.addTaskLocalFiles(taskLocalResources);

    for (FlowElement flowElement : sourceConfigs.keySet()) {
        if (!(flowElement instanceof Tap))
            continue;

        Configuration sourceConf = sourceConfigs.get(flowElement);

        // not setting the new-api value could result in failures if not set by the Scheme
        if (sourceConf.get("mapred.mapper.new-api") == null)
            HadoopUtil.setNewApi(sourceConf, sourceConf.get("mapred.input.format.class",
                    sourceConf.get("mapreduce.job.inputformat.class")));

        // unfortunately we cannot just load the input format and set it on the builder with also pulling all other
        // values out of the configuration.
        MRInput.MRInputConfigBuilder configBuilder = MRInput.createConfigBuilder(sourceConf, null);

        // grouping splits loses file name info, breaking partition tap default impl
        if (flowElement instanceof PartitionTap) // todo: generify
            configBuilder.groupSplits(false);

        DataSourceDescriptor dataSourceDescriptor = configBuilder.build();

        vertex.addDataSource(FlowElements.id(flowElement), dataSourceDescriptor);
    }

    for (FlowElement flowElement : sinkConfigs.keySet()) {
        if (!(flowElement instanceof Tap))
            continue;

        Configuration sinkConf = sinkConfigs.get(flowElement);

        Class outputFormatClass;
        String outputPath;

        // we have to set sane defaults if not set by the tap
        // typically the case of MultiSinkTap
        String formatClassName = sinkConf.get("mapred.output.format.class",
                sinkConf.get("mapreduce.job.outputformat.class"));

        if (formatClassName == null) {
            outputFormatClass = TextOutputFormat.class; // unused; use the "new" API, it's the default
            outputPath = Hfs.getTempPath(sinkConf).toString(); // unused
        } else {
            outputFormatClass = Util.loadClass(formatClassName);
            outputPath = getOutputPath(sinkConf);
        }

        if (outputPath == null && getOutputPath(sinkConf) == null && isFileOutputFormat(outputFormatClass))
            outputPath = Hfs.getTempPath(sinkConf).toString(); // unused

        MROutput.MROutputConfigBuilder configBuilder = MROutput.createConfigBuilder(sinkConf, outputFormatClass,
                outputPath);

        DataSinkDescriptor dataSinkDescriptor = configBuilder.build();

        vertex.addDataSink(FlowElements.id(flowElement), dataSinkDescriptor);
    }

    addRemoteDebug(flowNode, vertex);
    addRemoteProfiling(flowNode, vertex);

    return vertex;
}

From source file: cascading.flow.tez.Hadoop2TezFlowStep.java

License: Open Source License

private void addInputOutputMapping(JobConf conf, FlowNode flowNode) {
    FlowNodeGraph flowNodeGraph = getFlowNodeGraph();
    Set<ProcessEdge> incomingEdges = flowNodeGraph.incomingEdgesOf(flowNode);

    for (ProcessEdge processEdge : incomingEdges)
        conf.set("cascading.node.source." + processEdge.getID(),
                flowNodeGraph.getEdgeSource(processEdge).getID());

    Set<ProcessEdge> outgoingEdges = flowNodeGraph.outgoingEdgesOf(flowNode);

    for (ProcessEdge processEdge : outgoingEdges)
        conf.set("cascading.node.sink." + processEdge.getID(),
                flowNodeGraph.getEdgeTarget(processEdge).getID());
}

From source file: cascading.flow.tez.Hadoop2TezFlowStep.java

License: Open Source License

protected Map<FlowElement, Configuration> initFromSources(FlowNode flowNode,
        FlowProcess<TezConfiguration> flowProcess, Configuration conf,
        Map<String, LocalResource> taskLocalResources) {
    Set<? extends FlowElement> accumulatedSources = flowNode.getSourceElements(StreamMode.Accumulated);

    for (FlowElement element : accumulatedSources) {
        if (element instanceof Tap) {
            JobConf current = new JobConf(conf);
            Tap tap = (Tap) element;

            if (tap.getIdentifier() == null)
                throw new IllegalStateException("tap may not have null identifier: " + tap.toString());

            tap.sourceConfInit(flowProcess, current);

            Collection<String> paths = current.getStringCollection(CASCADING_LOCAL_RESOURCES + Tap.id(tap));

            if (!paths.isEmpty()) {
                String flowStagingPath = ((Hadoop2TezFlow) getFlow()).getFlowStagingPath();
                String resourceSubPath = Tap.id(tap);
                Map<Path, Path> pathMap = TezUtil.addToClassPath(current, flowStagingPath, resourceSubPath,
                        paths, LocalResourceType.FILE, taskLocalResources, null);

                current.setStrings(CASCADING_REMOTE_RESOURCES + Tap.id(tap),
                        taskLocalResources.keySet().toArray(new String[taskLocalResources.size()]));

                allLocalResources.putAll(taskLocalResources);
                syncPaths.putAll(pathMap);
            }

            Map<String, String> map = flowProcess.diffConfigIntoMap(new TezConfiguration(conf),
                    new TezConfiguration(current));
            conf.set("cascading.node.accumulated.source.conf." + Tap.id(tap), pack(map, conf));

            setLocalMode(conf, current, tap);
        }
    }

    Set<FlowElement> sources = new HashSet<>(flowNode.getSourceElements());

    sources.removeAll(accumulatedSources);

    if (sources.isEmpty())
        throw new IllegalStateException("all sources marked as accumulated");

    Map<FlowElement, Configuration> configs = new HashMap<>();

    for (FlowElement element : sources) {
        JobConf current = new JobConf(conf);

        String id = FlowElements.id(element);

        current.set("cascading.node.source", id);

        if (element instanceof Tap) {
            Tap tap = (Tap) element;

            if (tap.getIdentifier() == null)
                throw new IllegalStateException("tap may not have null identifier: " + tap.toString());

            tap.sourceConfInit(flowProcess, current);

            setLocalMode(conf, current, tap);
        }

        configs.put(element, current);
    }

    return configs;
}