List of usage examples for org.apache.hadoop.mapred.JobConf.set
public void set(String name, String value)
Sets the value of the name property.
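A minimal sketch of the call before the real-world examples below; the property name and value are illustrative:

import org.apache.hadoop.mapred.JobConf;

public class JobConfSetExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // set(name, value) stores the value under the given property name
        conf.set("mapred.job.name", "example-job");

        System.out.println(conf.get("mapred.job.name")); // prints "example-job"
    }
}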
From source file: cascading.flow.FlowStep.java
License: Open Source License

private void addComparators(JobConf conf, String property, Map<String, Fields> map) throws IOException {
    Iterator<Fields> fieldsIterator = map.values().iterator();

    if (!fieldsIterator.hasNext())
        return;

    Fields fields = fieldsIterator.next();

    if (fields.hasComparators()) {
        conf.set(property, Util.serializeBase64(fields));
        return;
    }

    // use resolved fields if there are no comparators.
    Set<Scope> previousScopes = getPreviousScopes(getGroup());

    fields = previousScopes.iterator().next().getOutValuesFields();

    if (fields.size() != 0) // allows fields.UNKNOWN to be used
        conf.setInt(property + ".size", fields.size());

    return;
}
From source file: cascading.flow.hadoop.HadoopFlow.java
License: Open Source License

@Override
protected void setConfigProperty(JobConf config, Object key, Object value) {
    // don't let these objects pass, even though toString is called below.
    if (value instanceof Class || value instanceof JobConf)
        return;

    config.set(key.toString(), value.toString());
}
From source file: cascading.flow.hadoop.HadoopFlowStep.java
License: Open Source License

public JobConf createInitializedConfig(FlowProcess<JobConf> flowProcess, JobConf parentConfig) {
    JobConf conf = parentConfig == null ? new JobConf() : HadoopUtil.copyJobConf(parentConfig);

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepDisplayName(conf.getInt("cascading.display.id.truncate", Util.ID_LENGTH)));

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapRunnerClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(flowProcess, conf);

    initFromSink(flowProcess, conf);

    initFromTraps(flowProcess, conf);

    initFromStepConfigDef(conf);

    int numSinkParts = getSink().getScheme().getNumSinkParts();

    if (numSinkParts != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(numSinkParts);
        else
            conf.setNumMapTasks(numSinkParts);
    } else if (getGroup() != null) {
        int gatherPartitions = conf.getNumReduceTasks();

        if (gatherPartitions == 0)
            gatherPartitions = conf.getInt(FlowRuntimeProps.GATHER_PARTITIONS, 0);

        if (gatherPartitions == 0)
            throw new FlowException(getName(),
                    "a default number of gather partitions must be set, see FlowRuntimeProps");

        conf.setNumReduceTasks(gatherPartitions);
    }

    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);
        conf.setPartitionerClass(GroupingPartitioner.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getKeySelectors(), this, getGroup());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors(), this, getGroup());

        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingSortingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    String versionString = Version.getRelease();

    if (versionString != null)
        conf.set("cascading.version", versionString);

    conf.set(CASCADING_FLOW_STEP_ID, getID());
    conf.set("cascading.flow.step.num", Integer.toString(getOrdinal()));

    HadoopUtil.setIsInflow(conf);

    Iterator<FlowNode> iterator = getFlowNodeGraph().getTopologicalIterator();

    String mapState = pack(iterator.next(), conf);
    String reduceState = pack(iterator.hasNext() ? iterator.next() : null, conf);

    // hadoop 20.2 doesn't like dist cache when using local mode
    int maxSize = Short.MAX_VALUE;

    int length = mapState.length() + reduceState.length();

    if (isHadoopLocalMode(conf) || length < maxSize) { // seems safe
        conf.set("cascading.flow.step.node.map", mapState);

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce", reduceState);
    } else {
        conf.set("cascading.flow.step.node.map.path",
                HadoopMRUtil.writeStateToDistCache(conf, getID(), "map", mapState));

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce.path",
                    HadoopMRUtil.writeStateToDistCache(conf, getID(), "reduce", reduceState));
    }

    return conf;
}
From source file: cascading.flow.hadoop.HadoopFlowStep.java
License: Open Source License

protected void initFromSources(FlowProcess<JobConf> flowProcess, JobConf conf) {
    // handles case where same tap is used on multiple branches
    // we do not want to init the same tap multiple times
    Set<Tap> uniqueSources = getUniqueStreamedSources();

    JobConf[] streamedJobs = new JobConf[uniqueSources.size()];
    int i = 0;

    for (Tap tap : uniqueSources) {
        if (tap.getIdentifier() == null)
            throw new IllegalStateException("tap may not have null identifier: " + tap.toString());

        streamedJobs[i] = flowProcess.copyConfig(conf);

        streamedJobs[i].set("cascading.step.source", Tap.id(tap));

        tap.sourceConfInit(flowProcess, streamedJobs[i]);

        i++;
    }

    Set<Tap> accumulatedSources = getAllAccumulatedSources();

    for (Tap tap : accumulatedSources) {
        JobConf accumulatedJob = flowProcess.copyConfig(conf);

        tap.sourceConfInit(flowProcess, accumulatedJob);

        Map<String, String> map = flowProcess.diffConfigIntoMap(conf, accumulatedJob);
        conf.set("cascading.node.accumulated.source.conf." + Tap.id(tap), pack(map, conf));

        try {
            if (DistributedCache.getCacheFiles(accumulatedJob) != null)
                DistributedCache.setCacheFiles(DistributedCache.getCacheFiles(accumulatedJob), conf);
        } catch (IOException exception) {
            throw new CascadingException(exception);
        }
    }

    MultiInputFormat.addInputFormat(conf, streamedJobs); // must come last
}
From source file: cascading.flow.hadoop.HadoopUtil.java
License: Open Source License

public static JobConf createJobConf(Map<Object, Object> properties, JobConf defaultJobconf) {
    JobConf jobConf = defaultJobconf == null ? new JobConf() : new JobConf(defaultJobconf);

    if (properties == null)
        return jobConf;

    Set<Object> keys = new HashSet<Object>(properties.keySet());

    // keys will only be grabbed if both key/value are String, so keep orig keys
    if (properties instanceof Properties)
        keys.addAll(((Properties) properties).stringPropertyNames());

    for (Object key : keys) {
        Object value = properties.get(key);

        if (value == null && properties instanceof Properties && key instanceof String)
            value = ((Properties) properties).getProperty((String) key);

        if (value == null) // don't stuff null values
            continue;

        // don't let these objects pass, even though toString is called below.
        if (value instanceof Class || value instanceof JobConf)
            continue;

        jobConf.set(key.toString(), value.toString());
    }

    return jobConf;
}
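A usage sketch for the helper above, assuming the HadoopUtil class shown is on the classpath; the property names and values are illustrative:

Properties properties = new Properties();
properties.setProperty("mapred.reduce.tasks", "4");
properties.setProperty("cascading.app.name", "demo"); // illustrative key

// passing null for the default config falls back to new JobConf()
JobConf jobConf = HadoopUtil.createJobConf(properties, null);

assert "4".equals(jobConf.get("mapred.reduce.tasks"));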
From source file: cascading.flow.hadoop.planner.HadoopPlanner.java
License: Open Source License

/**
 * Method copyProperties adds the given Map values to the given JobConf object.
 *
 * @param jobConf    of type JobConf
 * @param properties of type Map
 */
public static void copyProperties(JobConf jobConf, Map<Object, Object> properties) {
    if (properties instanceof Properties) {
        Properties props = (Properties) properties;
        Set<String> keys = props.stringPropertyNames();

        for (String key : keys)
            jobConf.set(key, props.getProperty(key));
    } else {
        for (Map.Entry<Object, Object> entry : properties.entrySet()) {
            if (entry.getValue() != null)
                jobConf.set(entry.getKey().toString(), entry.getValue().toString());
        }
    }
}
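A hedged sketch exercising the non-Properties branch of copyProperties above; keys and values are illustrative:

JobConf jobConf = new JobConf();

Map<Object, Object> props = new HashMap<>();
props.put("cascading.app.name", "demo");
props.put("some.null.key", null); // skipped: null values are not copied

// each non-null entry is copied into the JobConf via set(name, value)
HadoopPlanner.copyProperties(jobConf, props);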
From source file: cascading.flow.stack.StackElement.java
License: Open Source License

private static TapCollector getTrapCollector(Tap trap, JobConf jobConf) {
    TapCollector trapCollector = trapCollectors.get(trap);

    if (trapCollector == null) {
        try {
            jobConf = new JobConf(jobConf);

            int id = jobConf.getInt("cascading.flow.step.id", 0);
            String partname;

            if (jobConf.getBoolean("mapred.task.is.map", true))
                partname = String.format("-m-%05d-", id);
            else
                partname = String.format("-r-%05d-", id);

            jobConf.set("cascading.tapcollector.partname", "%s%spart" + partname + "%05d");

            trapCollector = (TapCollector) trap.openForWrite(jobConf);

            trapCollectors.put(trap, trapCollector);
        } catch (IOException exception) {
            throw new StackException(exception);
        }
    }

    return trapCollector;
}
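For reference, a sketch of how the partname template built above expands. The step id and the remaining format arguments are illustrative; how the collector fills the leading placeholders is up to the tap implementation:

int id = 3;
String partname = String.format("-m-%05d-", id);   // "-m-00003-"
String template = "%s%spart" + partname + "%05d";  // "%s%spart-m-00003-%05d"

// illustrative expansion with an empty prefix/separator and sequence number 7
System.out.println(String.format(template, "", "", 7)); // prints "part-m-00003-00007"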
From source file: cascading.flow.tez.Hadoop2TezFlowStep.java
License: Open Source License

public Vertex createVertex(FlowProcess<TezConfiguration> flowProcess, TezConfiguration initializedConfig,
        FlowNode flowNode) {
    JobConf conf = new JobConf(initializedConfig);

    addInputOutputMapping(conf, flowNode);

    conf.setBoolean("mapred.used.genericoptionsparser", true);

    Map<String, LocalResource> taskLocalResources = new HashMap<>();

    Map<FlowElement, Configuration> sourceConfigs = initFromSources(flowNode, flowProcess, conf,
            taskLocalResources);
    Map<FlowElement, Configuration> sinkConfigs = initFromSinks(flowNode, flowProcess, conf);

    initFromTraps(flowNode, flowProcess, conf);

    initFromNodeConfigDef(flowNode, conf);

    // force step to local mode if any tap is local
    setLocalMode(initializedConfig, conf, null);

    conf.set("cascading.flow.node.num", Integer.toString(flowNode.getOrdinal()));

    int parallelism = getParallelism(flowNode, conf);

    if (parallelism == 0)
        throw new FlowException(getName(),
                "the default number of gather partitions must be set, see cascading.flow.FlowRuntimeProps");

    Vertex vertex = newVertex(flowNode, conf, parallelism);

    if (!taskLocalResources.isEmpty())
        vertex.addTaskLocalFiles(taskLocalResources);

    for (FlowElement flowElement : sourceConfigs.keySet()) {
        if (!(flowElement instanceof Tap))
            continue;

        Configuration sourceConf = sourceConfigs.get(flowElement);

        // not setting the new-api value could result in failures if not set by the Scheme
        if (sourceConf.get("mapred.mapper.new-api") == null)
            HadoopUtil.setNewApi(sourceConf,
                    sourceConf.get("mapred.input.format.class", sourceConf.get("mapreduce.job.inputformat.class")));

        // unfortunately we cannot just load the input format and set it on the builder without also pulling
        // all other values out of the configuration.
        MRInput.MRInputConfigBuilder configBuilder = MRInput.createConfigBuilder(sourceConf, null);

        // grouping splits loses file name info, breaking partition tap default impl
        if (flowElement instanceof PartitionTap) // todo: generify
            configBuilder.groupSplits(false);

        DataSourceDescriptor dataSourceDescriptor = configBuilder.build();

        vertex.addDataSource(FlowElements.id(flowElement), dataSourceDescriptor);
    }

    for (FlowElement flowElement : sinkConfigs.keySet()) {
        if (!(flowElement instanceof Tap))
            continue;

        Configuration sinkConf = sinkConfigs.get(flowElement);

        Class outputFormatClass;
        String outputPath;

        // we have to set sane defaults if not set by the tap
        // typically the case of MultiSinkTap
        String formatClassName = sinkConf.get("mapred.output.format.class",
                sinkConf.get("mapreduce.job.outputformat.class"));

        if (formatClassName == null) {
            outputFormatClass = TextOutputFormat.class; // unused; use the "new" api, it's the default
            outputPath = Hfs.getTempPath(sinkConf).toString(); // unused
        } else {
            outputFormatClass = Util.loadClass(formatClassName);
            outputPath = getOutputPath(sinkConf);
        }

        if (outputPath == null && getOutputPath(sinkConf) == null && isFileOutputFormat(outputFormatClass))
            outputPath = Hfs.getTempPath(sinkConf).toString(); // unused

        MROutput.MROutputConfigBuilder configBuilder = MROutput.createConfigBuilder(sinkConf, outputFormatClass,
                outputPath);

        DataSinkDescriptor dataSinkDescriptor = configBuilder.build();

        vertex.addDataSink(FlowElements.id(flowElement), dataSinkDescriptor);
    }

    addRemoteDebug(flowNode, vertex);
    addRemoteProfiling(flowNode, vertex);

    return vertex;
}
From source file: cascading.flow.tez.Hadoop2TezFlowStep.java
License: Open Source License

private void addInputOutputMapping(JobConf conf, FlowNode flowNode) {
    FlowNodeGraph flowNodeGraph = getFlowNodeGraph();
    Set<ProcessEdge> incomingEdges = flowNodeGraph.incomingEdgesOf(flowNode);

    for (ProcessEdge processEdge : incomingEdges)
        conf.set("cascading.node.source." + processEdge.getID(),
                flowNodeGraph.getEdgeSource(processEdge).getID());

    Set<ProcessEdge> outgoingEdges = flowNodeGraph.outgoingEdgesOf(flowNode);

    for (ProcessEdge processEdge : outgoingEdges)
        conf.set("cascading.node.sink." + processEdge.getID(),
                flowNodeGraph.getEdgeTarget(processEdge).getID());
}
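A small sketch of how such a mapping could be read back on the task side; the edge and node ids below are illustrative, not real Cascading ids:

JobConf conf = new JobConf();
conf.set("cascading.node.source.E1", "N42"); // as written by addInputOutputMapping above

// resolve the node feeding edge E1
String sourceNodeId = conf.get("cascading.node.source.E1"); // "N42"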
From source file: cascading.flow.tez.Hadoop2TezFlowStep.java
License: Open Source License

protected Map<FlowElement, Configuration> initFromSources(FlowNode flowNode,
        FlowProcess<TezConfiguration> flowProcess, Configuration conf,
        Map<String, LocalResource> taskLocalResources) {
    Set<? extends FlowElement> accumulatedSources = flowNode.getSourceElements(StreamMode.Accumulated);

    for (FlowElement element : accumulatedSources) {
        if (element instanceof Tap) {
            JobConf current = new JobConf(conf);
            Tap tap = (Tap) element;

            if (tap.getIdentifier() == null)
                throw new IllegalStateException("tap may not have null identifier: " + tap.toString());

            tap.sourceConfInit(flowProcess, current);

            Collection<String> paths = current.getStringCollection(CASCADING_LOCAL_RESOURCES + Tap.id(tap));

            if (!paths.isEmpty()) {
                String flowStagingPath = ((Hadoop2TezFlow) getFlow()).getFlowStagingPath();
                String resourceSubPath = Tap.id(tap);
                Map<Path, Path> pathMap = TezUtil.addToClassPath(current, flowStagingPath, resourceSubPath,
                        paths, LocalResourceType.FILE, taskLocalResources, null);

                current.setStrings(CASCADING_REMOTE_RESOURCES + Tap.id(tap),
                        taskLocalResources.keySet().toArray(new String[taskLocalResources.size()]));

                allLocalResources.putAll(taskLocalResources);
                syncPaths.putAll(pathMap);
            }

            Map<String, String> map = flowProcess.diffConfigIntoMap(new TezConfiguration(conf),
                    new TezConfiguration(current));
            conf.set("cascading.node.accumulated.source.conf." + Tap.id(tap), pack(map, conf));

            setLocalMode(conf, current, tap);
        }
    }

    Set<FlowElement> sources = new HashSet<>(flowNode.getSourceElements());

    sources.removeAll(accumulatedSources);

    if (sources.isEmpty())
        throw new IllegalStateException("all sources marked as accumulated");

    Map<FlowElement, Configuration> configs = new HashMap<>();

    for (FlowElement element : sources) {
        JobConf current = new JobConf(conf);

        String id = FlowElements.id(element);

        current.set("cascading.node.source", id);

        if (element instanceof Tap) {
            Tap tap = (Tap) element;

            if (tap.getIdentifier() == null)
                throw new IllegalStateException("tap may not have null identifier: " + tap.toString());

            tap.sourceConfInit(flowProcess, current);

            setLocalMode(conf, current, tap);
        }

        configs.put(element, current);
    }

    return configs;
}