List of usage examples for the org.apache.hadoop.mapred.JobConf constructor JobConf
public JobConf(boolean loadDefaults)
From source file:cascading.flow.hadoop.util.HadoopUtil.java
License:Open Source License
/**
 * Returns the given configuration typed as a {@link JobConf}.
 *
 * <p>When the argument already is a JobConf, that same instance is returned (no copy is made).
 * Otherwise the argument is handed to the {@code JobConf(Configuration)} constructor.
 *
 * @param configuration the configuration to expose as a JobConf
 * @return the argument itself if it is a JobConf, else a new JobConf created from it
 */
public static JobConf asJobConfInstance(Configuration configuration) {
    return configuration instanceof JobConf
        ? (JobConf) configuration
        : new JobConf(configuration);
}
From source file:cascading.flow.hadoop.util.HadoopUtil.java
License:Open Source License
public static JobConf copyJobConf(JobConf parentJobConf) { if (parentJobConf == null) throw new IllegalArgumentException("parent may not be null"); // see https://github.com/Cascading/cascading/pull/21 // The JobConf(JobConf) constructor causes derived JobConfs to share Credentials. We want to avoid this, in // case those Credentials are mutated later on down the road (which they will be, during job submission, in // separate threads!). Using the JobConf(Configuration) constructor avoids Credentials-sharing. final Configuration configurationCopy = new Configuration(parentJobConf); final JobConf jobConf = new JobConf(configurationCopy); jobConf.getCredentials().addAll(parentJobConf.getCredentials()); return jobConf; }
From source file:cascading.flow.MapReduceFlowTest.java
License:Open Source License
/**
 * Wraps a plain identity map/reduce job in a {@code MapReduceFlow} and validates the flow's
 * source before completion and its sink after completion against a length of 10.
 *
 * @throws IOException declared for the file-system and flow operations performed by the test
 */
public void testFlow() throws IOException {
    boolean inputPresent = new File(inputFileApache).exists();
    if (!inputPresent) {
        fail("data file not found");
    }

    copyFromLocal(inputFileApache);

    final String flowName = "mrflow";
    final int expectedLength = 10;

    // Start from the planner's default job configuration and describe an identity job.
    JobConf conf = new JobConf(MultiMapReducePlanner.getJobConf(getProperties()));
    conf.setJobName(flowName);

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputFileApache));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath1));

    Flow flow = new MapReduceFlow(flowName, conf, true);

    validateLength(flow.openSource(), expectedLength);

    flow.complete();

    validateLength(flow.openSink(), expectedLength);
}
From source file:cascading.flow.MapReduceFlowTest.java
License:Open Source License
public void testCascade() throws IOException { if (!new File(inputFileApache).exists()) fail("data file not found"); copyFromLocal(inputFileApache);/*from www .j a va 2 s . co m*/ // Setup two standard cascading flows that will generate the input for the first MapReduceFlow Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false)); Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath4, true), true); Flow firstFlow = new FlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow")); Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath5, true), true); Flow secondFlow = new FlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow")); JobConf defaultConf = MultiMapReducePlanner.getJobConf(getProperties()); JobConf firstConf = new JobConf(defaultConf); firstConf.setJobName("first-mr"); firstConf.setOutputKeyClass(LongWritable.class); firstConf.setOutputValueClass(Text.class); firstConf.setMapperClass(IdentityMapper.class); firstConf.setReducerClass(IdentityReducer.class); firstConf.setInputFormat(TextInputFormat.class); firstConf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(firstConf, new Path(remove(outputPath5, true))); FileOutputFormat.setOutputPath(firstConf, new Path(remove(outputPath1, true))); Flow firstMR = new MapReduceFlow(firstConf, true); JobConf secondConf = new JobConf(defaultConf); secondConf.setJobName("second-mr"); secondConf.setOutputKeyClass(LongWritable.class); secondConf.setOutputValueClass(Text.class); secondConf.setMapperClass(IdentityMapper.class); secondConf.setReducerClass(IdentityReducer.class); secondConf.setInputFormat(TextInputFormat.class); secondConf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(secondConf, new Path(remove(outputPath1, true))); FileOutputFormat.setOutputPath(secondConf, new Path(remove(outputPath2, true))); Flow secondMR = new 
MapReduceFlow(secondConf, true); JobConf thirdConf = new JobConf(defaultConf); thirdConf.setJobName("third-mr"); thirdConf.setOutputKeyClass(LongWritable.class); thirdConf.setOutputValueClass(Text.class); thirdConf.setMapperClass(IdentityMapper.class); thirdConf.setReducerClass(IdentityReducer.class); thirdConf.setInputFormat(TextInputFormat.class); thirdConf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(thirdConf, new Path(remove(outputPath2, true))); FileOutputFormat.setOutputPath(thirdConf, new Path(remove(outputPath3, true))); Flow thirdMR = new MapReduceFlow(thirdConf, true); CascadeConnector cascadeConnector = new CascadeConnector(); // pass out of order Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR); // cascade.writeDOT( "mrcascade.dot" ); cascade.complete(); validateLength(thirdMR.openSink(), 10); }
From source file:cascading.flow.stack.StackElement.java
License:Open Source License
/**
 * Returns the collector registered for the given trap in {@code trapCollectors}, opening and
 * registering a new one on first use.
 *
 * <p>A new collector is opened against a fresh JobConf created from {@code jobConf}, on which
 * {@code cascading.tapcollector.partname} is set to a pattern containing {@code -m-} or
 * {@code -r-} (chosen by {@code mapred.task.is.map}, default true) and the zero-padded value
 * of {@code cascading.flow.step.id} (default 0).
 *
 * @param trap    the trap tap to obtain a collector for
 * @param jobConf the configuration the collector's own JobConf is created from
 * @return the cached or newly opened collector
 * @throws StackException wrapping any IOException raised while opening the collector
 */
private static TapCollector getTrapCollector(Tap trap, JobConf jobConf) {
    TapCollector cached = trapCollectors.get(trap);
    if (cached != null) {
        return cached;
    }

    try {
        JobConf trapConf = new JobConf(jobConf);

        int stepId = trapConf.getInt("cascading.flow.step.id", 0);
        boolean isMapTask = trapConf.getBoolean("mapred.task.is.map", true);
        String partname = String.format(isMapTask ? "-m-%05d-" : "-r-%05d-", stepId);

        // The remaining %s / %05d placeholders are left in the value unformatted here.
        trapConf.set("cascading.tapcollector.partname", "%s%spart" + partname + "%05d");

        TapCollector opened = (TapCollector) trap.openForWrite(trapConf);
        trapCollectors.put(trap, opened);
        return opened;
    } catch (IOException exception) {
        throw new StackException(exception);
    }
}
From source file:cascading.flow.tez.Hadoop2TezFlowStep.java
License:Open Source License
public Vertex createVertex(FlowProcess<TezConfiguration> flowProcess, TezConfiguration initializedConfig, FlowNode flowNode) {/*w w w.j a v a 2s. c o m*/ JobConf conf = new JobConf(initializedConfig); addInputOutputMapping(conf, flowNode); conf.setBoolean("mapred.used.genericoptionsparser", true); Map<String, LocalResource> taskLocalResources = new HashMap<>(); Map<FlowElement, Configuration> sourceConfigs = initFromSources(flowNode, flowProcess, conf, taskLocalResources); Map<FlowElement, Configuration> sinkConfigs = initFromSinks(flowNode, flowProcess, conf); initFromTraps(flowNode, flowProcess, conf); initFromNodeConfigDef(flowNode, conf); // force step to local mode if any tap is local setLocalMode(initializedConfig, conf, null); conf.set("cascading.flow.node.num", Integer.toString(flowNode.getOrdinal())); int parallelism = getParallelism(flowNode, conf); if (parallelism == 0) throw new FlowException(getName(), "the default number of gather partitions must be set, see cascading.flow.FlowRuntimeProps"); Vertex vertex = newVertex(flowNode, conf, parallelism); if (!taskLocalResources.isEmpty()) vertex.addTaskLocalFiles(taskLocalResources); for (FlowElement flowElement : sourceConfigs.keySet()) { if (!(flowElement instanceof Tap)) continue; Configuration sourceConf = sourceConfigs.get(flowElement); // not setting the new-api value could result in failures if not set by the Scheme if (sourceConf.get("mapred.mapper.new-api") == null) HadoopUtil.setNewApi(sourceConf, sourceConf.get("mapred.input.format.class", sourceConf.get("mapreduce.job.inputformat.class"))); // unfortunately we cannot just load the input format and set it on the builder with also pulling all other // values out of the configuration. 
MRInput.MRInputConfigBuilder configBuilder = MRInput.createConfigBuilder(sourceConf, null); // grouping splits loses file name info, breaking partition tap default impl if (flowElement instanceof PartitionTap) // todo: generify configBuilder.groupSplits(false); DataSourceDescriptor dataSourceDescriptor = configBuilder.build(); vertex.addDataSource(FlowElements.id(flowElement), dataSourceDescriptor); } for (FlowElement flowElement : sinkConfigs.keySet()) { if (!(flowElement instanceof Tap)) continue; Configuration sinkConf = sinkConfigs.get(flowElement); Class outputFormatClass; String outputPath; // we have to set sane defaults if not set by the tap // typically the case of MultiSinkTap String formatClassName = sinkConf.get("mapred.output.format.class", sinkConf.get("mapreduce.job.outputformat.class")); if (formatClassName == null) { outputFormatClass = TextOutputFormat.class; // unused, use "new" api, its the default outputPath = Hfs.getTempPath(sinkConf).toString(); // unused } else { outputFormatClass = Util.loadClass(formatClassName); outputPath = getOutputPath(sinkConf); } if (outputPath == null && getOutputPath(sinkConf) == null && isFileOutputFormat(outputFormatClass)) outputPath = Hfs.getTempPath(sinkConf).toString(); // unused MROutput.MROutputConfigBuilder configBuilder = MROutput.createConfigBuilder(sinkConf, outputFormatClass, outputPath); DataSinkDescriptor dataSinkDescriptor = configBuilder.build(); vertex.addDataSink(FlowElements.id(flowElement), dataSinkDescriptor); } addRemoteDebug(flowNode, vertex); addRemoteProfiling(flowNode, vertex); return vertex; }
From source file:cascading.flow.tez.Hadoop2TezFlowStep.java
License:Open Source License
protected Map<FlowElement, Configuration> initFromSources(FlowNode flowNode, FlowProcess<TezConfiguration> flowProcess, Configuration conf, Map<String, LocalResource> taskLocalResources) { Set<? extends FlowElement> accumulatedSources = flowNode.getSourceElements(StreamMode.Accumulated); for (FlowElement element : accumulatedSources) { if (element instanceof Tap) { JobConf current = new JobConf(conf); Tap tap = (Tap) element;//from ww w . j a v a 2s. co m if (tap.getIdentifier() == null) throw new IllegalStateException("tap may not have null identifier: " + tap.toString()); tap.sourceConfInit(flowProcess, current); Collection<String> paths = current.getStringCollection(CASCADING_LOCAL_RESOURCES + Tap.id(tap)); if (!paths.isEmpty()) { String flowStagingPath = ((Hadoop2TezFlow) getFlow()).getFlowStagingPath(); String resourceSubPath = Tap.id(tap); Map<Path, Path> pathMap = TezUtil.addToClassPath(current, flowStagingPath, resourceSubPath, paths, LocalResourceType.FILE, taskLocalResources, null); current.setStrings(CASCADING_REMOTE_RESOURCES + Tap.id(tap), taskLocalResources.keySet().toArray(new String[taskLocalResources.size()])); allLocalResources.putAll(taskLocalResources); syncPaths.putAll(pathMap); } Map<String, String> map = flowProcess.diffConfigIntoMap(new TezConfiguration(conf), new TezConfiguration(current)); conf.set("cascading.node.accumulated.source.conf." 
+ Tap.id(tap), pack(map, conf)); setLocalMode(conf, current, tap); } } Set<FlowElement> sources = new HashSet<>(flowNode.getSourceElements()); sources.removeAll(accumulatedSources); if (sources.isEmpty()) throw new IllegalStateException("all sources marked as accumulated"); Map<FlowElement, Configuration> configs = new HashMap<>(); for (FlowElement element : sources) { JobConf current = new JobConf(conf); String id = FlowElements.id(element); current.set("cascading.node.source", id); if (element instanceof Tap) { Tap tap = (Tap) element; if (tap.getIdentifier() == null) throw new IllegalStateException("tap may not have null identifier: " + tap.toString()); tap.sourceConfInit(flowProcess, current); setLocalMode(conf, current, tap); } configs.put(element, current); } return configs; }
From source file:cascading.flow.tez.Hadoop2TezFlowStep.java
License:Open Source License
protected Map<FlowElement, Configuration> initFromSinks(FlowNode flowNode, FlowProcess<? extends Configuration> flowProcess, Configuration conf) { Set<FlowElement> sinks = flowNode.getSinkElements(); Map<FlowElement, Configuration> configs = new HashMap<>(); for (FlowElement element : sinks) { JobConf current = new JobConf(conf); if (element instanceof Tap) { Tap tap = (Tap) element;//from w w w . j ava2 s . c o m if (tap.getIdentifier() == null) throw new IllegalStateException("tap may not have null identifier: " + element.toString()); tap.sinkConfInit(flowProcess, current); setLocalMode(conf, current, tap); } String id = FlowElements.id(element); current.set("cascading.node.sink", id); configs.put(element, current); } return configs; }
From source file:cascading.flow.tez.Hadoop2TezFlowStep.java
License:Open Source License
/**
 * Runs sink initialization for every trap of the given node.
 *
 * <p>All traps are initialized against a single scratch JobConf created from {@code conf}.
 * That scratch conf is not returned or stored by this method; each trap is also passed to
 * {@code setLocalMode} together with {@code conf}.
 *
 * @param flowNode    the node whose trap map is read
 * @param flowProcess the current flow process
 * @param conf        the node configuration
 */
protected void initFromTraps(FlowNode flowNode, FlowProcess<? extends Configuration> flowProcess, Configuration conf) {
    Map<String, Tap> traps = flowNode.getTrapMap();

    if (traps.isEmpty()) {
        return;
    }

    JobConf trapConf = new JobConf(conf);

    for (Tap trap : traps.values()) {
        trap.sinkConfInit(flowProcess, trapConf);
        setLocalMode(conf, trapConf, trap);
    }
}
From source file:cascading.flow.tez.util.TezUtil.java
License:Open Source License
/**
 * Creates a new {@link JobConf} from the given configuration.
 *
 * <p>Exists as an attempt to localize all {@code new JobConf} calls in one place.
 *
 * @param configuration the configuration to create the JobConf from
 * @return a new JobConf constructed from {@code configuration}
 */
public static JobConf asJobConf(Configuration configuration) {
    JobConf jobConf = new JobConf(configuration);
    return jobConf;
}