Example usage for org.apache.hadoop.mapred JobConf JobConf

List of usage examples for the org.apache.hadoop.mapred.JobConf constructor

Introduction

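This page collects example usages of the org.apache.hadoop.mapred.JobConf constructor. Note that every snippet below calls the single-argument form with an existing configuration object rather than the boolean form shown in the prototype.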

Prototype

public JobConf(boolean loadDefaults) 

Source Link

Document

A new map/reduce configuration where the behavior of reading from the default resources can be turned off.

Usage

From source file:cascading.flow.hadoop.util.HadoopUtil.java

License:Open Source License

/**
 * Returns the given configuration as a {@link JobConf}.
 * <p>
 * If the argument already is a {@code JobConf}, that same instance is returned and no copy is made;
 * otherwise a new {@code JobConf} is constructed from it.
 *
 * @param configuration the configuration to expose as a JobConf
 * @return the argument itself when it is a JobConf, else a new JobConf built from it
 */
public static JobConf asJobConfInstance(Configuration configuration) {
    return configuration instanceof JobConf ? (JobConf) configuration : new JobConf(configuration);
}

From source file:cascading.flow.hadoop.util.HadoopUtil.java

License:Open Source License

/**
 * Creates a copy of the given JobConf and adds the parent's credentials to the copy.
 *
 * @param parentJobConf the JobConf to copy, must not be null
 * @return a new JobConf built from a Configuration copy of the parent
 * @throws IllegalArgumentException if {@code parentJobConf} is null
 */
public static JobConf copyJobConf(JobConf parentJobConf) {
    if (parentJobConf == null) {
        throw new IllegalArgumentException("parent may not be null");
    }

    // See https://github.com/Cascading/cascading/pull/21
    // Copying through JobConf(JobConf) makes the derived JobConf share its Credentials with the parent.
    // Those Credentials get mutated later (during job submission, from separate threads), so the copy is
    // routed through a plain Configuration and the JobConf(Configuration) constructor, which does not share.
    JobConf copy = new JobConf(new Configuration(parentJobConf));

    // the copy still needs the parent's credentials, just not the same Credentials instance
    copy.getCredentials().addAll(parentJobConf.getCredentials());

    return copy;
}

From source file:cascading.flow.MapReduceFlowTest.java

License:Open Source License

/**
 * Runs a single identity map/reduce job wrapped in a {@code MapReduceFlow} and validates the
 * source before completion and the sink after completion.
 *
 * @throws IOException propagated from the file copy or from opening the flow's source or sink
 */
public void testFlow() throws IOException {
    File localInput = new File(inputFileApache);

    if (!localInput.exists()) {
        fail("data file not found");
    }

    copyFromLocal(inputFileApache);

    JobConf baseConf = MultiMapReducePlanner.getJobConf(getProperties());

    // work on a copy so the planner's default configuration is left untouched
    JobConf jobConf = new JobConf(baseConf);
    jobConf.setJobName("mrflow");

    // identity job: text in, (LongWritable, Text) pairs out
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(jobConf, new Path(inputFileApache));
    FileOutputFormat.setOutputPath(jobConf, new Path(outputPath1));

    Flow mrFlow = new MapReduceFlow("mrflow", jobConf, true);

    validateLength(mrFlow.openSource(), 10);

    mrFlow.complete();

    validateLength(mrFlow.openSink(), 10);
}

From source file:cascading.flow.MapReduceFlowTest.java

License:Open Source License

/**
 * Chains two regular cascading flows and three raw map/reduce flows into one {@code Cascade},
 * hands them to the connector out of order, and validates the sink of the last map/reduce flow
 * once the cascade has completed.
 *
 * @throws IOException propagated from the file copy or from opening the final sink
 */
public void testCascade() throws IOException {
    if (!new File(inputFileApache).exists())
        fail("data file not found");

    copyFromLocal(inputFileApache);

    // Setup two standard cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath4, true), true);
    Flow firstFlow = new FlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow"));

    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath5, true), true);
    Flow secondFlow = new FlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow"));

    JobConf defaultConf = MultiMapReducePlanner.getJobConf(getProperties());

    // each map/reduce flow reads what the previous stage wrote:
    // outputPath5 -> outputPath1 -> outputPath2 -> outputPath3
    JobConf firstConf = createIdentityJobConf(defaultConf, "first-mr", new Path(remove(outputPath5, true)),
            new Path(remove(outputPath1, true)));
    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = createIdentityJobConf(defaultConf, "second-mr", new Path(remove(outputPath1, true)),
            new Path(remove(outputPath2, true)));
    Flow secondMR = new MapReduceFlow(secondConf, true);

    JobConf thirdConf = createIdentityJobConf(defaultConf, "third-mr", new Path(remove(outputPath2, true)),
            new Path(remove(outputPath3, true)));
    Flow thirdMR = new MapReduceFlow(thirdConf, true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    cascade.complete();

    validateLength(thirdMR.openSink(), 10);
}

/**
 * Builds the configuration for one identity map/reduce job used by {@link #testCascade()}.
 *
 * @param defaultConf the configuration to copy; it is not modified
 * @param jobName     the job name to set on the copy
 * @param input       the job's input path
 * @param output      the job's output path
 * @return a new JobConf with identity mapper/reducer, text input/output formats and the given paths
 */
private static JobConf createIdentityJobConf(JobConf defaultConf, String jobName, Path input, Path output) {
    JobConf conf = new JobConf(defaultConf);
    conf.setJobName(jobName);

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);

    return conf;
}

From source file:cascading.flow.stack.StackElement.java

License:Open Source License

/**
 * Returns the collector registered for the given trap in {@code trapCollectors}, opening and
 * registering a new one on first use.
 *
 * @param trap    the trap tap to collect into
 * @param jobConf the configuration to copy when a new collector has to be opened; not modified
 * @return the existing or newly opened collector for the trap
 * @throws StackException wrapping any IOException raised while opening the trap for writing
 */
private static TapCollector getTrapCollector(Tap trap, JobConf jobConf) {
    TapCollector existing = trapCollectors.get(trap);

    if (existing != null)
        return existing;

    try {
        // configure a private copy so the caller's JobConf does not pick up the part name
        JobConf trapConf = new JobConf(jobConf);

        int stepId = trapConf.getInt("cascading.flow.step.id", 0);
        boolean isMapTask = trapConf.getBoolean("mapred.task.is.map", true);

        // "-m-" marks map-side part files, "-r-" reduce-side ones
        String partname = String.format(isMapTask ? "-m-%05d-" : "-r-%05d-", stepId);

        trapConf.set("cascading.tapcollector.partname", "%s%spart" + partname + "%05d");

        TapCollector opened = (TapCollector) trap.openForWrite(trapConf);
        trapCollectors.put(trap, opened);

        return opened;
    } catch (IOException exception) {
        throw new StackException(exception);
    }
}

From source file:cascading.flow.tez.Hadoop2TezFlowStep.java

License:Open Source License

/**
 * Creates the Tez {@code Vertex} for the given flow node.
 * <p>
 * A {@link JobConf} copy of {@code initializedConfig} is initialized from the node's sources, sinks,
 * traps and node config def. Every source and sink element that is a {@code Tap} is then attached
 * to the vertex as a data source or data sink, keyed by {@code FlowElements.id}.
 *
 * @param flowProcess       the current flow process, handed to the source/sink/trap initializers
 * @param initializedConfig the configuration the vertex configuration is copied from
 * @param flowNode          the node to create a vertex for
 * @return the configured vertex
 * @throws FlowException if the parallelism resolved for the node is zero
 */
public Vertex createVertex(FlowProcess<TezConfiguration> flowProcess, TezConfiguration initializedConfig,
        FlowNode flowNode) {
    JobConf conf = new JobConf(initializedConfig);

    addInputOutputMapping(conf, flowNode);

    conf.setBoolean("mapred.used.genericoptionsparser", true);

    Map<String, LocalResource> taskLocalResources = new HashMap<>();

    Map<FlowElement, Configuration> sourceConfigs = initFromSources(flowNode, flowProcess, conf,
            taskLocalResources);
    Map<FlowElement, Configuration> sinkConfigs = initFromSinks(flowNode, flowProcess, conf);

    initFromTraps(flowNode, flowProcess, conf);

    initFromNodeConfigDef(flowNode, conf);

    // force step to local mode if any tap is local
    setLocalMode(initializedConfig, conf, null);

    conf.set("cascading.flow.node.num", Integer.toString(flowNode.getOrdinal()));

    int parallelism = getParallelism(flowNode, conf);

    if (parallelism == 0)
        throw new FlowException(getName(),
                "the default number of gather partitions must be set, see cascading.flow.FlowRuntimeProps");

    Vertex vertex = newVertex(flowNode, conf, parallelism);

    if (!taskLocalResources.isEmpty())
        vertex.addTaskLocalFiles(taskLocalResources);

    // iterate entries directly rather than looking each key up again
    for (Map.Entry<FlowElement, Configuration> sourceEntry : sourceConfigs.entrySet()) {
        FlowElement flowElement = sourceEntry.getKey();

        if (!(flowElement instanceof Tap))
            continue;

        Configuration sourceConf = sourceEntry.getValue();

        // not setting the new-api value could result in failures if not set by the Scheme
        if (sourceConf.get("mapred.mapper.new-api") == null)
            HadoopUtil.setNewApi(sourceConf, sourceConf.get("mapred.input.format.class",
                    sourceConf.get("mapreduce.job.inputformat.class")));

        // unfortunately we cannot just load the input format and set it on the builder with also pulling all other
        // values out of the configuration.
        MRInput.MRInputConfigBuilder configBuilder = MRInput.createConfigBuilder(sourceConf, null);

        // grouping splits loses file name info, breaking partition tap default impl
        if (flowElement instanceof PartitionTap) // todo: generify
            configBuilder.groupSplits(false);

        DataSourceDescriptor dataSourceDescriptor = configBuilder.build();

        vertex.addDataSource(FlowElements.id(flowElement), dataSourceDescriptor);
    }

    for (Map.Entry<FlowElement, Configuration> sinkEntry : sinkConfigs.entrySet()) {
        FlowElement flowElement = sinkEntry.getKey();

        if (!(flowElement instanceof Tap))
            continue;

        Configuration sinkConf = sinkEntry.getValue();

        Class outputFormatClass;
        String outputPath;

        // we have to set sane defaults if not set by the tap
        // typically the case of MultiSinkTap
        String formatClassName = sinkConf.get("mapred.output.format.class",
                sinkConf.get("mapreduce.job.outputformat.class"));

        if (formatClassName == null) {
            outputFormatClass = TextOutputFormat.class; // unused, use "new" api, its the default
            outputPath = Hfs.getTempPath(sinkConf).toString(); // unused
        } else {
            outputFormatClass = Util.loadClass(formatClassName);
            outputPath = getOutputPath(sinkConf);
        }

        // a file based output format still needs some path, so fall back to a temp path
        if (outputPath == null && getOutputPath(sinkConf) == null && isFileOutputFormat(outputFormatClass))
            outputPath = Hfs.getTempPath(sinkConf).toString(); // unused

        MROutput.MROutputConfigBuilder configBuilder = MROutput.createConfigBuilder(sinkConf, outputFormatClass,
                outputPath);

        DataSinkDescriptor dataSinkDescriptor = configBuilder.build();

        vertex.addDataSink(FlowElements.id(flowElement), dataSinkDescriptor);
    }

    addRemoteDebug(flowNode, vertex);
    addRemoteProfiling(flowNode, vertex);

    return vertex;
}

From source file:cascading.flow.tez.Hadoop2TezFlowStep.java

License:Open Source License

/**
 * Initializes the configuration for every source element of the given node.
 * <p>
 * Accumulated tap sources are initialized on a private copy of {@code conf}; the difference to
 * {@code conf} is packed and stored back on {@code conf} under a per-tap key. All remaining
 * sources each get their own JobConf copy, which is returned keyed by element.
 *
 * @param flowNode           the node whose sources are initialized
 * @param flowProcess        the current flow process, passed to {@code Tap.sourceConfInit}
 * @param conf               the node configuration; receives the packed accumulated-source settings
 * @param taskLocalResources receives local resources registered by accumulated taps
 * @return one configuration per non-accumulated source element
 * @throws IllegalStateException if a tap has a null identifier, or if every source is accumulated
 */
protected Map<FlowElement, Configuration> initFromSources(FlowNode flowNode,
        FlowProcess<TezConfiguration> flowProcess, Configuration conf,
        Map<String, LocalResource> taskLocalResources) {
    Set<? extends FlowElement> accumulatedSources = flowNode.getSourceElements(StreamMode.Accumulated);

    for (FlowElement accumulated : accumulatedSources) {
        if (!(accumulated instanceof Tap))
            continue;

        JobConf tapConf = new JobConf(conf);
        Tap tap = (Tap) accumulated;

        if (tap.getIdentifier() == null)
            throw new IllegalStateException("tap may not have null identifier: " + tap.toString());

        tap.sourceConfInit(flowProcess, tapConf);

        Collection<String> localPaths = tapConf.getStringCollection(CASCADING_LOCAL_RESOURCES + Tap.id(tap));

        if (!localPaths.isEmpty()) {
            String stagingPath = ((Hadoop2TezFlow) getFlow()).getFlowStagingPath();
            String subPath = Tap.id(tap);
            Map<Path, Path> synced = TezUtil.addToClassPath(tapConf, stagingPath, subPath, localPaths,
                    LocalResourceType.FILE, taskLocalResources, null);

            String[] resourceNames = taskLocalResources.keySet()
                    .toArray(new String[taskLocalResources.size()]);

            tapConf.setStrings(CASCADING_REMOTE_RESOURCES + Tap.id(tap), resourceNames);

            allLocalResources.putAll(taskLocalResources);
            syncPaths.putAll(synced);
        }

        // only the settings the tap added or changed relative to conf are stored
        Map<String, String> changes = flowProcess.diffConfigIntoMap(new TezConfiguration(conf),
                new TezConfiguration(tapConf));
        conf.set("cascading.node.accumulated.source.conf." + Tap.id(tap), pack(changes, conf));

        setLocalMode(conf, tapConf, tap);
    }

    Set<FlowElement> remaining = new HashSet<>(flowNode.getSourceElements());

    remaining.removeAll(accumulatedSources);

    if (remaining.isEmpty())
        throw new IllegalStateException("all sources marked as accumulated");

    Map<FlowElement, Configuration> result = new HashMap<>();

    for (FlowElement source : remaining) {
        JobConf sourceConf = new JobConf(conf);

        sourceConf.set("cascading.node.source", FlowElements.id(source));

        if (source instanceof Tap) {
            Tap tap = (Tap) source;

            if (tap.getIdentifier() == null)
                throw new IllegalStateException("tap may not have null identifier: " + tap.toString());

            tap.sourceConfInit(flowProcess, sourceConf);

            setLocalMode(conf, sourceConf, tap);
        }

        result.put(source, sourceConf);
    }

    return result;
}

From source file:cascading.flow.tez.Hadoop2TezFlowStep.java

License:Open Source License

/**
 * Creates one JobConf copy of {@code conf} per sink element of the given node.
 * <p>
 * Tap sinks are initialized through {@code Tap.sinkConfInit}; every copy is tagged with the
 * element's id under {@code cascading.node.sink}.
 *
 * @param flowNode    the node whose sinks are initialized
 * @param flowProcess the current flow process, passed to {@code Tap.sinkConfInit}
 * @param conf        the node configuration each sink configuration is copied from
 * @return one configuration per sink element
 * @throws IllegalStateException if a tap sink has a null identifier
 */
protected Map<FlowElement, Configuration> initFromSinks(FlowNode flowNode,
        FlowProcess<? extends Configuration> flowProcess, Configuration conf) {
    Map<FlowElement, Configuration> result = new HashMap<>();

    for (FlowElement sink : flowNode.getSinkElements()) {
        JobConf sinkConf = new JobConf(conf);

        if (sink instanceof Tap) {
            Tap tap = (Tap) sink;

            if (tap.getIdentifier() == null)
                throw new IllegalStateException("tap may not have null identifier: " + sink.toString());

            tap.sinkConfInit(flowProcess, sinkConf);

            setLocalMode(conf, sinkConf, tap);
        }

        sinkConf.set("cascading.node.sink", FlowElements.id(sink));

        result.put(sink, sinkConf);
    }

    return result;
}

From source file:cascading.flow.tez.Hadoop2TezFlowStep.java

License:Open Source License

/**
 * Runs sink initialization for every trap of the given node against a single shared JobConf copy
 * of {@code conf}, and applies {@code setLocalMode} for each trap.
 *
 * @param flowNode    the node whose traps are initialized
 * @param flowProcess the current flow process, passed to {@code Tap.sinkConfInit}
 * @param conf        the node configuration the trap configuration is copied from
 */
protected void initFromTraps(FlowNode flowNode, FlowProcess<? extends Configuration> flowProcess,
        Configuration conf) {
    Map<String, Tap> traps = flowNode.getTrapMap();

    if (traps.isEmpty())
        return;

    // all traps share one copy; it is created only when there is at least one trap
    JobConf trapConf = new JobConf(conf);

    for (Tap trap : traps.values()) {
        trap.sinkConfInit(flowProcess, trapConf);
        setLocalMode(conf, trapConf, trap);
    }
}

From source file:cascading.flow.tez.util.TezUtil.java

License:Open Source License

/**
 * Single place for creating a {@link JobConf} from a {@link Configuration}, in an attempt to
 * localize all {@code new JobConf} calls.
 *
 * @param configuration the configuration to build the JobConf from
 * @return a new JobConf constructed from the given configuration
 */
public static JobConf asJobConf(Configuration configuration) {
    JobConf jobConf = new JobConf(configuration);

    return jobConf;
}