List of usage examples for org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.addNamedOutput
@SuppressWarnings("unchecked") public static void addNamedOutput(Job job, String namedOutput, Class<? extends OutputFormat> outputFormatClass, Class<?> keyClass, Class<?> valueClass)
From source file: weka.distributed.hadoop.RandomizedDataChunkHadoopJob.java
License: Open Source License
@Override
public boolean runJob() throws DistributedWekaException {
  boolean success = true;
  ClassLoader orig = Thread.currentThread().getContextClassLoader();
  try {
    Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
    try {
      if (!m_cleanOutputDir) {
        // check to see if there are files in the output directory. If so,
        // assume that we don't need to run
        String outputDir = m_mrConfig.getOutputPath() + OUTPUT_SUBDIR;
        Configuration conf = new Configuration();
        m_mrConfig.getHDFSConfig().configureForHadoop(conf, m_env);
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path(outputDir + "/chunk0-r-00000");
        if (fs.exists(p)) {
          if (m_log != null) {
            statusMessage("Output directory is populated with chunk files - no need to execute");
            logMessage("Output directory is populated with chunk files - no need to execute");
          } else {
            System.err.println("Output directory is populated with chunk files - no need to execute");
          }
          return true;
        }
      }

      setJobStatus(JobStatus.RUNNING);

      m_arffHeaderJob.setGenerateCharts(false);
      if (!initializeAndRunArffJob()) {
        return false;
      }

      Instances header = m_arffHeaderJob.getFinalHeader();
      Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(header);
      try {
        WekaClassifierHadoopMapper.setClassIndex(getClassAttribute(), headerNoSummary,
            !m_dontDefaultToLastAttIfClassNotSet);
      } catch (Exception e) {
        throw new DistributedWekaException(e);
      }

      // a summary attribute for getting the total number of instances
      Attribute summaryAttOrig = null;
      for (int i = 0; i < headerNoSummary.numAttributes(); i++) {
        if (headerNoSummary.attribute(i).isNumeric() || headerNoSummary.attribute(i).isNominal()) {
          summaryAttOrig = headerNoSummary.attribute(i);
          break;
        }
      }

      String summaryName = summaryAttOrig.name();
      Attribute summaryAtt = header
          .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX + summaryName);
      if (summaryAtt == null) {
        throw new DistributedWekaException(
            "Was unable to find the summary attribute for " + "attribute: " + summaryName);
      }

      int totalNumInstances = 0;
      if (summaryAttOrig.isNominal()) {
        NominalStats stats = NominalStats.attributeToStats(summaryAtt);
        for (String label : stats.getLabels()) {
          totalNumInstances += stats.getCount(label);
        }
      } else {
        NumericStats stats = NumericStats.attributeToStats(summaryAtt);
        totalNumInstances = (int) stats.getStats()[ArffSummaryNumericMetric.COUNT.ordinal()];
      }

      Configuration conf = new Configuration();

      // set randomize/stratify properties
      // add the aggregated ARFF header to the distributed cache
      String pathToHeader = environmentSubstitute(m_arffHeaderJob.getAggregatedHeaderPath());
      HDFSUtils.addFileToDistributedCache(m_mrConfig.getHDFSConfig(), conf, pathToHeader, m_env);
      String fileNameOnly = pathToHeader.substring(pathToHeader.lastIndexOf("/") + 1,
          pathToHeader.length());

      List<String> randomizeMapOptions = new ArrayList<String>();
      randomizeMapOptions.add("-arff-header");
      randomizeMapOptions.add(fileNameOnly);

      if (!DistributedJobConfig.isEmpty(getClassAttribute())) {
        randomizeMapOptions.add("-class");
        randomizeMapOptions.add(environmentSubstitute(getClassAttribute()));
      }

      if (!DistributedJobConfig.isEmpty(getRandomSeed())) {
        randomizeMapOptions.add("-seed");
        randomizeMapOptions.add(environmentSubstitute(getRandomSeed()));
      }

      if (m_dontDefaultToLastAttIfClassNotSet) {
        randomizeMapOptions.add("-dont-default-class-to-last");
      }

      m_mrConfig.setUserSuppliedProperty(
          RandomizedDataChunkHadoopMapper.RANDOMIZED_DATA_CHUNK_MAP_TASK_OPTIONS,
          environmentSubstitute(Utils
              .joinOptions(randomizeMapOptions.toArray(new String[randomizeMapOptions.size()]))));

      // Need these for row parsing via open-csv
      m_mrConfig.setUserSuppliedProperty(
          CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS,
          environmentSubstitute(getCSVMapTaskOptions()));

      int numChunks = 0;
      if (DistributedJobConfig.isEmpty(getNumRandomizedDataChunks())
          && DistributedJobConfig.isEmpty(getNumInstancesPerRandomizedDataChunk())) {
        throw new DistributedWekaException("Must specify either the number "
            + "of chunks to create or the number of instances per chunk");
      }

      if (!DistributedJobConfig.isEmpty(getNumRandomizedDataChunks())) {
        try {
          numChunks = Integer.parseInt(environmentSubstitute(getNumRandomizedDataChunks()));
        } catch (NumberFormatException ex) {
          throw new DistributedWekaException(ex);
        }
      } else {
        int numInsts = 0;
        try {
          numInsts = Integer.parseInt(environmentSubstitute(getNumInstancesPerRandomizedDataChunk()));
        } catch (NumberFormatException ex) {
          throw new DistributedWekaException(ex);
        }

        if (numInsts <= 0) {
          throw new DistributedWekaException("Number of instances per chunk must be > 0");
        }

        if (numInsts > totalNumInstances) {
          throw new DistributedWekaException("Can't have more instances per chunk than "
              + "there are instances in the dataset!");
        }

        double nc = (double) totalNumInstances / numInsts;
        nc = Math.ceil(nc);
        numChunks = (int) nc;
      }

      if (numChunks <= 1) {
        throw new DistributedWekaException("Can't randomize because number of data chunks <= 1");
      }

      m_mrConfig.setUserSuppliedProperty(RandomizedDataChunkHadoopReducer.NUM_DATA_CHUNKS,
          "" + numChunks);

      // set output path
      String outputPath = m_mrConfig.getOutputPath();
      outputPath += OUTPUT_SUBDIR;
      outputPath = environmentSubstitute(outputPath);
      m_mrConfig.setOutputPath(outputPath);

      // set number of reducers to 1 (otherwise we'll get more
      // chunks than we want!)
      m_mrConfig.setNumberOfReducers("1");

      installWekaLibrariesInHDFS(conf);

      Job job = null;
      try {
        job = m_mrConfig.configureForHadoop(
            "Create randomly shuffled input data chunk job - num chunks: " + numChunks, conf, m_env);
      } catch (ClassNotFoundException e) {
        throw new DistributedWekaException(e);
      }

      // setup multiple outputs
      for (int i = 0; i < numChunks; i++) {
        MultipleOutputs.addNamedOutput(job, "chunk" + i, TextOutputFormat.class, Text.class,
            Text.class);
      }

      // run the job!
      m_mrConfig.deleteOutputDirectory(job, m_env);

      statusMessage("Submitting randomized data chunk job ");
      logMessage("Submitting randomized data chunk job ");

      success = runJob(job);

      setJobStatus(success ? JobStatus.FINISHED : JobStatus.FAILED);

      if (!success) {
        statusMessage("Create randomly shuffled input data chunk job failed - check logs on Hadoop");
        logMessage("Create randomly shuffled input data chunk job failed - check logs on Hadoop");
      } else {
        // need to tidy up in the output directory - for some reason
        // there seems to be a spurious part-r-00000 with size 0 created
        String toDelete = outputPath + "/part-r-00000";
        HDFSUtils.deleteFile(m_mrConfig.getHDFSConfig(), conf, toDelete, m_env);
      }
    } catch (Exception ex) {
      setJobStatus(JobStatus.FAILED);
      throw new DistributedWekaException(ex);
    }
  } finally {
    Thread.currentThread().setContextClassLoader(orig);
  }

  return success;
}
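The loop above registers one named output per data chunk ("chunk0", "chunk1", ...), which is why the earlier short-circuit check looks for a chunk0-r-00000 file in the output directory. On the reduce side those named outputs are written through a MultipleOutputs instance. The sketch below shows that pattern in its simplest form; the real RandomizedDataChunkHadoopReducer is driven by the NUM_DATA_CHUNKS property set above, so the hard-coded chunk count and round-robin routing here are assumptions for illustration only:

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class ChunkWritingReducer extends Reducer<Text, Text, Text, Text> {

  // Hypothetical: the real job passes the chunk count through the job configuration.
  private static final int NUM_CHUNKS = 10;

  private MultipleOutputs<Text, Text> m_mos;
  private int m_next = 0;

  @Override
  protected void setup(Context context) {
    m_mos = new MultipleOutputs<Text, Text>(context);
  }

  @Override
  protected void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    for (Text value : values) {
      // Route each record to one of the named outputs registered with
      // addNamedOutput() in the driver; records for "chunkN" end up in
      // files named chunkN-r-xxxxx in the job's output directory.
      m_mos.write("chunk" + m_next, key, value);
      m_next = (m_next + 1) % NUM_CHUNKS;
    }
  }

  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    // Closing MultipleOutputs flushes the named-output record writers;
    // skipping this typically leaves the chunk files empty.
    m_mos.close();
  }
}

Note that the job above also sets the number of reducers to 1, so every chunkN named output is written by a single reduce task and produces exactly one chunkN-r-00000 file.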