List of usage examples for org.apache.hadoop.mapred JobConf setLong
public void setLong(String name, long value)

Sets the value of the name property to a long.
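A minimal usage sketch before the project examples below; the property name "example.max.split.size" is made up purely for illustration. setLong stores a long under the given key, and getLong reads it back with a default when the key is unset.

import org.apache.hadoop.mapred.JobConf;

public class JobConfSetLongExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // Store a long-valued property under a hypothetical key.
        conf.setLong("example.max.split.size", 256L * 1024 * 1024);
        // Read it back; the second argument is the default used when the key is unset.
        long maxSplit = conf.getLong("example.max.split.size", 0L);
        System.out.println("max split size = " + maxSplit);
    }
}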
From source file:org.apache.oozie.action.hadoop.LauncherMapperHelper.java
License:Apache License
public static void setupYarnRestartHandling(JobConf launcherJobConf, Configuration actionConf, String launcherTag,
        long launcherTime) throws NoSuchAlgorithmException {
    launcherJobConf.setLong(LauncherMainHadoopUtils.OOZIE_JOB_LAUNCH_TIME, launcherTime);
    // Tags are limited to 100 chars so we need to hash them to make sure
    // (the actionId otherwise doesn't have a max length)
    String tag = getTag(launcherTag);
    // keeping the oozie.child.mapreduce.job.tags instead of mapreduce.job.tags to avoid killing launcher itself.
    // mapreduce.job.tags should only go to child job launch by launcher.
    actionConf.set(LauncherMainHadoopUtils.CHILD_MAPREDUCE_JOB_TAGS, tag);
}
From source file:org.apache.pig.backend.hadoop.executionengine.mapreduceExec.SliceWrapper.java
License:Apache License
public RecordReader<Text, Tuple> makeReader(JobConf job) throws IOException {
    lastConf = job;
    DataStorage store = new HDataStorage(ConfigurationUtil.toProperties(job));
    store.setActiveContainer(store.asContainer("/user/" + job.getUser()));
    wrapped.init(store);

    // Mimic org.apache.hadoop.mapred.FileSplit if feasible...
    String[] locations = wrapped.getLocations();
    if (locations.length > 0) {
        job.set("map.input.file", locations[0]);
        job.setLong("map.input.start", wrapped.getStart());
        job.setLong("map.input.length", wrapped.getLength());
    }

    return new RecordReader<Text, Tuple>() {
        public void close() throws IOException {
            wrapped.close();
        }

        public Text createKey() {
            return new Text();
        }

        public Tuple createValue() {
            return new Tuple();
        }

        public long getPos() throws IOException {
            return wrapped.getPos();
        }

        public float getProgress() throws IOException {
            return wrapped.getProgress();
        }

        public boolean next(Text key, Tuple value) throws IOException {
            return wrapped.next(value);
        }
    };
}
From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.SliceWrapper.java
License:Apache License
public RecordReader<Text, Tuple> makeReader(JobConf job) throws IOException {
    lastConf = job;
    DataStorage store = new HDataStorage(ConfigurationUtil.toProperties(job));

    // if the execution is against Mapred DFS, set
    // working dir to /user/<userid>
    if (pigContext.getExecType() == ExecType.MAPREDUCE)
        store.setActiveContainer(store.asContainer("/user/" + job.getUser()));
    wrapped.init(store);

    job.set("map.target.ops", ObjectSerializer.serialize(targetOps));
    // Mimic org.apache.hadoop.mapred.FileSplit if feasible...
    String[] locations = wrapped.getLocations();
    if (locations.length > 0) {
        job.set("map.input.file", locations[0]);
        job.setLong("map.input.start", wrapped.getStart());
        job.setLong("map.input.length", wrapped.getLength());
    }

    return new RecordReader<Text, Tuple>() {
        TupleFactory tupFac = TupleFactory.getInstance();

        public void close() throws IOException {
            wrapped.close();
        }

        public Text createKey() {
            return null; // we never use the key!
        }

        public Tuple createValue() {
            return tupFac.newTuple();
        }

        public long getPos() throws IOException {
            return wrapped.getPos();
        }

        public float getProgress() throws IOException {
            return wrapped.getProgress();
        }

        public boolean next(Text key, Tuple value) throws IOException {
            return wrapped.next(value);
        }
    };
}
From source file:org.apache.pig.backend.hadoop.executionengine.tez.TezDagBuilder.java
License:Apache License
private Vertex newVertex(TezOperator tezOp, boolean isMap)
        throws IOException, ClassNotFoundException, InterruptedException {
    ProcessorDescriptor procDesc = ProcessorDescriptor.create(tezOp.getProcessorName());

    // Pass physical plans to vertex as user payload.
    JobConf payloadConf = new JobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), false));

    // We do this so that dag.getCredentials(), job.getCredentials(),
    // job.getConfiguration().getCredentials() all reference the same Credentials object
    // Unfortunately there is no setCredentials() on Job
    payloadConf.setCredentials(dag.getCredentials());
    // We won't actually use this job, but we need it to talk with the Load Store funcs
    @SuppressWarnings("deprecation")
    Job job = new Job(payloadConf);
    payloadConf = (JobConf) job.getConfiguration();

    if (tezOp.sampleOperator != null) {
        payloadConf.set(PigProcessor.SAMPLE_VERTEX, tezOp.sampleOperator.getOperatorKey().toString());
    }

    if (tezOp.sortOperator != null) {
        payloadConf.set(PigProcessor.SORT_VERTEX, tezOp.sortOperator.getOperatorKey().toString());
    }

    String tmp;
    long maxCombinedSplitSize = 0;
    if (!tezOp.combineSmallSplits()
            || pc.getProperties().getProperty(PigConfiguration.PIG_SPLIT_COMBINATION, "true").equals("false"))
        payloadConf.setBoolean(PigConfiguration.PIG_NO_SPLIT_COMBINATION, true);
    else if ((tmp = pc.getProperties().getProperty(PigConfiguration.PIG_MAX_COMBINED_SPLIT_SIZE, null)) != null) {
        try {
            maxCombinedSplitSize = Long.parseLong(tmp);
        } catch (NumberFormatException e) {
            log.warn("Invalid numeric format for pig.maxCombinedSplitSize; use the default maximum combined split size");
        }
    }
    if (maxCombinedSplitSize > 0)
        payloadConf.setLong("pig.maxCombinedSplitSize", maxCombinedSplitSize);

    payloadConf.set("pig.inputs", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInp()));
    payloadConf.set("pig.inpSignatures", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInpSignatureLists()));
    payloadConf.set("pig.inpLimits", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInpLimits()));

    // Process stores
    LinkedList<POStore> stores = processStores(tezOp, payloadConf, job);

    payloadConf.set("pig.pigContext", ObjectSerializer.serialize(pc));
    payloadConf.set("udf.import.list", ObjectSerializer.serialize(PigContext.getPackageImportList()));
    payloadConf.set("exectype", "TEZ");
    payloadConf.setBoolean(MRConfiguration.MAPPER_NEW_API, true);
    payloadConf.setClass(MRConfiguration.INPUTFORMAT_CLASS, PigInputFormat.class, InputFormat.class);

    // Set parent plan for all operators in the Tez plan.
    new PhyPlanSetter(tezOp.plan).visit();

    // Set the endOfAllInput flag on the physical plan if certain operators that
    // use this property (such as STREAM) are present in the plan.
    EndOfAllInputSetter.EndOfAllInputChecker checker = new EndOfAllInputSetter.EndOfAllInputChecker(tezOp.plan);
    checker.visit();
    if (checker.isEndOfAllInputPresent()) {
        payloadConf.set(JobControlCompiler.END_OF_INP_IN_MAP, "true");
    }

    // Configure the classes for incoming shuffles to this TezOp
    // TODO: Refactor out resetting input keys, PIG-3957
    List<PhysicalOperator> roots = tezOp.plan.getRoots();
    if (roots.size() == 1 && roots.get(0) instanceof POPackage) {
        POPackage pack = (POPackage) roots.get(0);

        List<PhysicalOperator> succsList = tezOp.plan.getSuccessors(pack);
        if (succsList != null) {
            succsList = new ArrayList<PhysicalOperator>(succsList);
        }
        byte keyType = pack.getPkgr().getKeyType();
        tezOp.plan.remove(pack);
        payloadConf.set("pig.reduce.package", ObjectSerializer.serialize(pack));
        setIntermediateOutputKeyValue(keyType, payloadConf, tezOp);
        POShuffleTezLoad newPack;
        newPack = new POShuffleTezLoad(pack);
        if (tezOp.isSkewedJoin()) {
            newPack.setSkewedJoins(true);
        }
        tezOp.plan.add(newPack);

        // Set input keys for POShuffleTezLoad. This is used to identify
        // the inputs that are attached to the POShuffleTezLoad in the
        // backend.
        Map<Integer, String> localRearrangeMap = new TreeMap<Integer, String>();
        for (TezOperator pred : mPlan.getPredecessors(tezOp)) {
            if (tezOp.sampleOperator != null && tezOp.sampleOperator == pred) {
                // skip sample vertex input
            } else {
                String inputKey = pred.getOperatorKey().toString();
                if (pred.isVertexGroup()) {
                    pred = mPlan.getOperator(pred.getVertexGroupMembers().get(0));
                }
                LinkedList<POLocalRearrangeTez> lrs = PlanHelper.getPhysicalOperators(pred.plan,
                        POLocalRearrangeTez.class);
                for (POLocalRearrangeTez lr : lrs) {
                    if (lr.isConnectedToPackage()
                            && lr.getOutputKey().equals(tezOp.getOperatorKey().toString())) {
                        localRearrangeMap.put((int) lr.getIndex(), inputKey);
                    }
                }
            }
        }
        for (Map.Entry<Integer, String> entry : localRearrangeMap.entrySet()) {
            newPack.addInputKey(entry.getValue());
        }

        if (succsList != null) {
            for (PhysicalOperator succs : succsList) {
                tezOp.plan.connect(newPack, succs);
            }
        }

        setIntermediateOutputKeyValue(pack.getPkgr().getKeyType(), payloadConf, tezOp);
    } else if (roots.size() == 1 && roots.get(0) instanceof POIdentityInOutTez) {
        POIdentityInOutTez identityInOut = (POIdentityInOutTez) roots.get(0);
        // TODO Need to fix multiple input key mapping
        TezOperator identityInOutPred = null;
        for (TezOperator pred : mPlan.getPredecessors(tezOp)) {
            if (!pred.isSampleAggregation()) {
                identityInOutPred = pred;
                break;
            }
        }
        identityInOut.setInputKey(identityInOutPred.getOperatorKey().toString());
    } else if (roots.size() == 1 && roots.get(0) instanceof POValueInputTez) {
        POValueInputTez valueInput = (POValueInputTez) roots.get(0);

        LinkedList<String> scalarInputs = new LinkedList<String>();
        for (POUserFunc userFunc : PlanHelper.getPhysicalOperators(tezOp.plan, POUserFunc.class)) {
            if (userFunc.getFunc() instanceof ReadScalarsTez) {
                scalarInputs.add(((ReadScalarsTez) userFunc.getFunc()).getTezInputs()[0]);
            }
        }
        // Make sure we don't find the scalar
        for (TezOperator pred : mPlan.getPredecessors(tezOp)) {
            if (!scalarInputs.contains(pred.getOperatorKey().toString())) {
                valueInput.setInputKey(pred.getOperatorKey().toString());
                break;
            }
        }
    }
    JobControlCompiler.setOutputFormat(job);

    // set parent plan in all operators. currently the parent plan is really
    // used only when POStream, POSplit are present in the plan
    new PhyPlanSetter(tezOp.plan).visit();

    // Serialize the execution plan
    payloadConf.set(PigProcessor.PLAN, ObjectSerializer.serialize(tezOp.plan));

    UDFContext.getUDFContext().serialize(payloadConf);

    MRToTezHelper.processMRSettings(payloadConf, globalConf);

    if (!pc.inIllustrator) {
        for (POStore store : stores) {
            // unset inputs for POStore, otherwise, map/reduce plan will be unnecessarily deserialized
            store.setInputs(null);
            store.setParentPlan(null);
        }
        // We put them in the reduce because PigOutputCommitter checks the
        // ID of the task to see if it's a map, and if not, calls the reduce
        // committers.
        payloadConf.set(JobControlCompiler.PIG_MAP_STORES, ObjectSerializer.serialize(new ArrayList<POStore>()));
        payloadConf.set(JobControlCompiler.PIG_REDUCE_STORES, ObjectSerializer.serialize(stores));
    }

    if (tezOp.isNeedEstimateParallelism()) {
        payloadConf.setBoolean(PigProcessor.ESTIMATE_PARALLELISM, true);
        log.info("Estimate quantile for sample aggregation vertex " + tezOp.getOperatorKey().toString());
    }

    // Take our assembled configuration and create a vertex
    UserPayload userPayload = TezUtils.createUserPayloadFromConf(payloadConf);
    procDesc.setUserPayload(userPayload);

    Vertex vertex = Vertex.create(tezOp.getOperatorKey().toString(), procDesc, tezOp.getVertexParallelism(),
            isMap ? MRHelpers.getResourceForMRMapper(globalConf) : MRHelpers.getResourceForMRReducer(globalConf));

    Map<String, String> taskEnv = new HashMap<String, String>();
    MRHelpers.updateEnvBasedOnMRTaskEnv(globalConf, taskEnv, isMap);
    vertex.setTaskEnvironment(taskEnv);

    // All these classes are @InterfaceAudience.Private in Hadoop. Switch to Tez methods in TEZ-1012
    // set the timestamps, public/private visibility of the archives and files
    ClientDistributedCacheManager.determineTimestampsAndCacheVisibilities(globalConf);
    // get DelegationToken for each cached file
    ClientDistributedCacheManager.getDelegationTokens(globalConf, job.getCredentials());
    MRApps.setupDistributedCache(globalConf, localResources);
    vertex.addTaskLocalFiles(localResources);

    vertex.setTaskLaunchCmdOpts(isMap ? MRHelpers.getJavaOptsForMRMapper(globalConf)
            : MRHelpers.getJavaOptsForMRReducer(globalConf));

    log.info("For vertex - " + tezOp.getOperatorKey().toString() + ": parallelism="
            + tezOp.getVertexParallelism() + ", memory=" + vertex.getTaskResource().getMemory()
            + ", java opts=" + vertex.getTaskLaunchCmdOpts());

    // Right now there can only be one of each of these. Will need to be
    // more generic when there can be more.
    for (POLoad ld : tezOp.getLoaderInfo().getLoads()) {
        // TODO: These should get the globalConf, or a merged version that
        // keeps settings like pig.maxCombinedSplitSize
        vertex.setLocationHint(
                VertexLocationHint.create(tezOp.getLoaderInfo().getInputSplitInfo().getTaskLocationHints()));
        vertex.addDataSource(ld.getOperatorKey().toString(), DataSourceDescriptor.create(
                InputDescriptor.create(MRInput.class.getName())
                        .setUserPayload(UserPayload.create(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
                                .setConfigurationBytes(TezUtils.createByteStringFromConf(payloadConf))
                                .setSplits(tezOp.getLoaderInfo().getInputSplitInfo().getSplitsProto()).build()
                                .toByteString().asReadOnlyByteBuffer())),
                InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName()),
                dag.getCredentials()));
    }

    for (POStore store : stores) {
        ArrayList<POStore> emptyList = new ArrayList<POStore>();
        ArrayList<POStore> singleStore = new ArrayList<POStore>();
        singleStore.add(store);
        Configuration outputPayLoad = new Configuration(payloadConf);
        outputPayLoad.set(JobControlCompiler.PIG_MAP_STORES, ObjectSerializer.serialize(emptyList));
        outputPayLoad.set(JobControlCompiler.PIG_REDUCE_STORES, ObjectSerializer.serialize(singleStore));
        OutputDescriptor storeOutDescriptor = OutputDescriptor.create(MROutput.class.getName())
                .setUserPayload(TezUtils.createUserPayloadFromConf(outputPayLoad));
        if (tezOp.getVertexGroupStores() != null) {
            OperatorKey vertexGroupKey = tezOp.getVertexGroupStores().get(store.getOperatorKey());
            if (vertexGroupKey != null) {
                getPlan().getOperator(vertexGroupKey).getVertexGroupInfo()
                        .setStoreOutputDescriptor(storeOutDescriptor);
                continue;
            }
        }
        vertex.addDataSink(store.getOperatorKey().toString(), new DataSinkDescriptor(storeOutDescriptor,
                OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), dag.getCredentials()));
    }

    // LoadFunc and StoreFunc add delegation tokens to Job Credentials in
    // setLocation and setStoreLocation respectively. For eg: HBaseStorage
    // InputFormat add delegation token in getSplits and OutputFormat in
    // checkOutputSpecs. For eg: FileInputFormat and FileOutputFormat
    if (stores.size() > 0) {
        new PigOutputFormat().checkOutputSpecs(job);
    }

    // Set the right VertexManagerPlugin
    if (tezOp.getEstimatedParallelism() != -1) {
        if (tezOp.isGlobalSort() || tezOp.isSkewedJoin()) {
            // Set VertexManagerPlugin to PartitionerDefinedVertexManager, which is able
            // to decrease/increase parallelism of sorting vertex dynamically
            // based on the numQuantiles calculated by sample aggregation vertex
            vertex.setVertexManagerPlugin(
                    VertexManagerPluginDescriptor.create(PartitionerDefinedVertexManager.class.getName()));
            log.info("Set VertexManagerPlugin to PartitionerDefinedParallelismVertexManager for vertex "
                    + tezOp.getOperatorKey().toString());
        } else {
            boolean containScatterGather = false;
            boolean containCustomPartitioner = false;
            for (TezEdgeDescriptor edge : tezOp.inEdges.values()) {
                if (edge.dataMovementType == DataMovementType.SCATTER_GATHER) {
                    containScatterGather = true;
                }
                if (edge.partitionerClass != null) {
                    containCustomPartitioner = true;
                }
            }
            if (containScatterGather && !containCustomPartitioner) {
                // Use auto-parallelism feature of ShuffleVertexManager to dynamically
                // reduce the parallelism of the vertex
                VertexManagerPluginDescriptor vmPluginDescriptor = VertexManagerPluginDescriptor
                        .create(ShuffleVertexManager.class.getName());
                Configuration vmPluginConf = ConfigurationUtil.toConfiguration(pc.getProperties(), false);
                vmPluginConf.setBoolean(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL,
                        true);
                if (vmPluginConf.getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
                        InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER) != InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER) {
                    vmPluginConf.setLong(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE,
                            vmPluginConf.getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
                                    InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER));
                }
                vmPluginDescriptor.setUserPayload(TezUtils.createUserPayloadFromConf(vmPluginConf));
                vertex.setVertexManagerPlugin(vmPluginDescriptor);
                log.info("Set auto parallelism for vertex " + tezOp.getOperatorKey().toString());
            }
        }
    }
    // Reset udfcontext jobconf. It is not supposed to be set in the front end
    UDFContext.getUDFContext().addJobConf(null);
    return vertex;
}
From source file:org.apache.sysml.runtime.matrix.data.hadoopfix.DelegatingInputFormat.java
License:Apache License
@SuppressWarnings("unchecked") public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException { // Find the InputFormat and then the RecordReader from the // TaggedInputSplit. TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split; InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils .newInstance(taggedInputSplit.getInputFormatClass(), conf); InputSplit inputSplit = taggedInputSplit.getInputSplit(); if (inputSplit instanceof FileSplit) { FileSplit fileSplit = (FileSplit) inputSplit; conf.set(MRConfigurationNames.MR_MAP_INPUT_FILE, fileSplit.getPath().toString()); conf.setLong(MRConfigurationNames.MR_MAP_INPUT_START, fileSplit.getStart()); conf.setLong(MRConfigurationNames.MR_MAP_INPUT_LENGTH, fileSplit.getLength()); }//from w w w. j a v a 2s .co m return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter); }
From source file:org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.java
License:Apache License
public static void setUpMultipleInputs(JobConf job, byte[] inputIndexes, String[] inputs, InputInfo[] inputInfos,
        int[] brlens, int[] bclens, boolean[] distCacheOnly, boolean setConverter, ConvertTarget target)
        throws Exception {
    if (inputs.length != inputInfos.length)
        throw new Exception("number of inputs and inputInfos does not match");

    //set up names of the input matrices and their inputformat information
    job.setStrings(INPUT_MATRICIES_DIRS_CONFIG, inputs);
    MRJobConfiguration.setMapFunctionInputMatrixIndexes(job, inputIndexes);

    //set up converter infos (converter determined implicitly)
    if (setConverter) {
        for (int i = 0; i < inputs.length; i++)
            setInputInfo(job, inputIndexes[i], inputInfos[i], brlens[i], bclens[i], target);
    }

    //remove redundant inputs and pure broadcast variables
    ArrayList<Path> lpaths = new ArrayList<>();
    ArrayList<InputInfo> liinfos = new ArrayList<>();
    for (int i = 0; i < inputs.length; i++) {
        Path p = new Path(inputs[i]);

        //check and skip redundant inputs
        if (lpaths.contains(p) //path already included
                || distCacheOnly[i]) //input only required in dist cache
        {
            continue;
        }

        lpaths.add(p);
        liinfos.add(inputInfos[i]);
    }

    boolean combineInputFormat = false;
    if (OptimizerUtils.ALLOW_COMBINE_FILE_INPUT_FORMAT) {
        //determine total input sizes
        double totalInputSize = 0;
        for (int i = 0; i < inputs.length; i++)
            totalInputSize += MapReduceTool.getFilesizeOnHDFS(new Path(inputs[i]));

        //set max split size (default blocksize) to 2x blocksize if (1) sort buffer large enough,
        //(2) degree of parallelism not hurt, and only a single input (except broadcasts)
        //(the sort buffer size is relevant for pass-through of, potentially modified, inputs to the reducers)
        //(the single input constraint stems from internal runtime assumptions used to relate meta data to inputs)
        long sizeSortBuff = InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer();
        long sizeHDFSBlk = InfrastructureAnalyzer.getHDFSBlockSize();
        long newSplitSize = sizeHDFSBlk * 2;
        //use generic config api for backwards compatibility
        double spillPercent = Double
                .parseDouble(job.get(MRConfigurationNames.MR_MAP_SORT_SPILL_PERCENT, "1.0"));
        int numPMap = OptimizerUtils.getNumMappers();
        if (numPMap < totalInputSize / newSplitSize && sizeSortBuff * spillPercent >= newSplitSize
                && lpaths.size() == 1) {
            job.setLong(MRConfigurationNames.MR_INPUT_FILEINPUTFORMAT_SPLIT_MAXSIZE, newSplitSize);
            combineInputFormat = true;
        }
    }

    //add inputs to jobs input (incl input format configuration)
    for (int i = 0; i < lpaths.size(); i++) {
        //add input to job inputs (for binaryblock we use CombineSequenceFileInputFormat to reduce task latency)
        if (combineInputFormat && liinfos.get(i) == InputInfo.BinaryBlockInputInfo)
            MultipleInputs.addInputPath(job, lpaths.get(i), CombineSequenceFileInputFormat.class);
        else
            MultipleInputs.addInputPath(job, lpaths.get(i), liinfos.get(i).inputFormatClass);
    }
}
From source file:org.apache.sysml.runtime.matrix.sort.PickFromCompactInputFormat.java
License:Apache License
public static void setZeroValues(JobConf job, MetaDataNumItemsByEachReducer metadata) {
    job.setInt(PARTITION_OF_ZERO, metadata.getPartitionOfZero());
    job.setLong(NUMBER_OF_ZERO, metadata.getNumberOfZero());
}
From source file:org.apache.sysml.runtime.transform.ApplyTfBBMR.java
License:Apache License
public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String spec, String mapsPath,
        String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties,
        long numRows, long numColsBefore, long numColsAfter, int replication, String headerLine) throws Exception {
    CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst);

    long[] rlens = new long[] { numRows };
    long[] clens = new long[] { numColsAfter };
    int[] brlens = new int[] { rblk.brlen };
    int[] bclens = new int[] { rblk.bclen };
    byte[] realIndexes = new byte[] { rblk.input };
    byte[] resultIndexes = new byte[] { rblk.output };

    JobConf job = new JobConf(ApplyTfBBMR.class);
    job.setJobName("ApplyTfBB");

    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfBBMR.class);

    // set relevant classes
    job.setMapperClass(ApplyTfBBMapper.class);

    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath },
            new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL);

    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    MRJobConfiguration.setCSVReblockInstructions(job, rblkInst);

    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInst);

    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            rblkInst, null, otherInst, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false);

    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens, ConfigurationManager.getNumReducers(),
            ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 },
            new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(ApplyTfBBMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);

    Path cachefile = new Path(new Path(partOffsetsFile), "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    // Adding "dummy" string to handle the case of na_strings = ""
    if (inputDataProperties.getNAStrings() != null)
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC, spec);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job);

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    return new JobReturn(ret.stats, runjob.isSuccessful());
}
From source file:org.apache.sysml.runtime.transform.ApplyTfCSVMR.java
License:Apache License
public static JobReturn runJob(String inputPath, String spec, String mapsPath, String tmpPath, String outputPath,
        String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication,
        String headerLine) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf job = new JobConf(ApplyTfCSVMR.class);
    job.setJobName("ApplyTfCSV");

    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfCSVMR.class);

    // set relevant classes
    job.setMapperClass(ApplyTfCSVMapper.class);
    job.setNumReduceTasks(0);

    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);

    Path cachefile = new Path(partOffsetsFile);
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);

    // set input and output properties
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    // delete outputPath, if exists already.
    Path outPath = new Path(outputPath);
    FileSystem fs = FileSystem.get(job);
    fs.delete(outPath, true);
    FileOutputFormat.setOutputPath(job, outPath);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC, spec);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    // Run the job
    RunningJob runjob = JobClient.runJob(job);

    // Since transform CSV produces part files w/ prefix transform-part-*,
    // delete all the "default" part-..... files
    deletePartFiles(fs, outPath);

    MatrixCharacteristics mc = new MatrixCharacteristics();
    return new JobReturn(new MatrixCharacteristics[] { mc }, runjob.isSuccessful());
}
From source file:org.apache.sysml.runtime.transform.GenTfMtdMR.java
License:Apache License
public static long runJob(String inputPath, String txMtdPath, String specWithIDs, String smallestFile,
        String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication,
        String headerLine) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf job = new JobConf(GenTfMtdMR.class);
    job.setJobName("GenTfMTD");

    /* Setup MapReduce Job */
    job.setJarByClass(GenTfMtdMR.class);

    // set relevant classes
    job.setMapperClass(GTFMTDMapper.class);
    job.setReducerClass(GTFMTDReducer.class);

    // set input and output properties
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DistinctValue.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    // delete outputPath, if exists already.
    Path outPath = new Path(txMtdPath);
    FileSystem fs = FileSystem.get(job);
    fs.delete(outPath, true);
    FileOutputFormat.setOutputPath(job, outPath);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC, specWithIDs);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, smallestFile);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, txMtdPath);

    // offsets file to store part-file names and offsets for each input split
    job.set(MRJobConfiguration.TF_OFFSETS_FILE, partOffsetsFile);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    // Run the job
    RunningJob runjob = JobClient.runJob(job);

    Counters c = runjob.getCounters();
    long tx_numRows = c.findCounter(MRJobConfiguration.DataTransformCounters.TRANSFORMED_NUM_ROWS).getCounter();

    return tx_numRows;
}