List of usage examples for org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.addNamedOutput
@SuppressWarnings("unchecked") public static void addNamedOutput(Job job, String namedOutput, Class<? extends OutputFormat> outputFormatClass, Class<?> keyClass, Class<?> valueClass)
From source file: weka.distributed.hadoop.RandomizedDataChunkHadoopJob.java
License: Open Source License
@Override
public boolean runJob() throws DistributedWekaException {
  boolean success = true;
  ClassLoader orig = Thread.currentThread().getContextClassLoader();
  try {
    Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
    try {
      if (!m_cleanOutputDir) {
        // check to see if there are files in the output directory. If so,
        // assume that we don't need to run
        String outputDir = m_mrConfig.getOutputPath() + OUTPUT_SUBDIR;
        Configuration conf = new Configuration();
        m_mrConfig.getHDFSConfig().configureForHadoop(conf, m_env);
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path(outputDir + "/chunk0-r-00000");
        if (fs.exists(p)) {
          if (m_log != null) {
            statusMessage("Output directory is populated with chunk files - no need to execute");
            logMessage("Output directory is populated with chunk files - no need to execute");
          } else {
            System.err.println("Output directory is populated with chunk files - no need to execute");
          }
          return true;
        }
      }

      setJobStatus(JobStatus.RUNNING);

      m_arffHeaderJob.setGenerateCharts(false);
      if (!initializeAndRunArffJob()) {
        return false;
      }

      Instances header = m_arffHeaderJob.getFinalHeader();
      Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(header);
      try {
        WekaClassifierHadoopMapper.setClassIndex(getClassAttribute(), headerNoSummary,
            !m_dontDefaultToLastAttIfClassNotSet);
      } catch (Exception e) {
        throw new DistributedWekaException(e);
      }

      // a summary attribute for getting the total number of instances
      Attribute summaryAttOrig = null;
      for (int i = 0; i < headerNoSummary.numAttributes(); i++) {
        if (headerNoSummary.attribute(i).isNumeric() || headerNoSummary.attribute(i).isNominal()) {
          summaryAttOrig = headerNoSummary.attribute(i);
          break;
        }
      }

      String summaryName = summaryAttOrig.name();
      Attribute summaryAtt = header
          .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX + summaryName);
      if (summaryAtt == null) {
        throw new DistributedWekaException(
            "Was unable to find the summary attribute for " + "attribute: " + summaryName);
      }

      int totalNumInstances = 0;
      if (summaryAttOrig.isNominal()) {
        NominalStats stats = NominalStats.attributeToStats(summaryAtt);
        for (String label : stats.getLabels()) {
          totalNumInstances += stats.getCount(label);
        }
      } else {
        NumericStats stats = NumericStats.attributeToStats(summaryAtt);
        totalNumInstances = (int) stats.getStats()[ArffSummaryNumericMetric.COUNT.ordinal()];
      }

      Configuration conf = new Configuration();

      // set randomize/stratify properties
      // add the aggregated ARFF header to the distributed cache
      String pathToHeader = environmentSubstitute(m_arffHeaderJob.getAggregatedHeaderPath());
      HDFSUtils.addFileToDistributedCache(m_mrConfig.getHDFSConfig(), conf, pathToHeader, m_env);
      String fileNameOnly = pathToHeader.substring(pathToHeader.lastIndexOf("/") + 1,
          pathToHeader.length());

      List<String> randomizeMapOptions = new ArrayList<String>();
      randomizeMapOptions.add("-arff-header");
      randomizeMapOptions.add(fileNameOnly);

      if (!DistributedJobConfig.isEmpty(getClassAttribute())) {
        randomizeMapOptions.add("-class");
        randomizeMapOptions.add(environmentSubstitute(getClassAttribute()));
      }

      if (!DistributedJobConfig.isEmpty(getRandomSeed())) {
        randomizeMapOptions.add("-seed");
        randomizeMapOptions.add(environmentSubstitute(getRandomSeed()));
      }

      if (m_dontDefaultToLastAttIfClassNotSet) {
        randomizeMapOptions.add("-dont-default-class-to-last");
      }

      m_mrConfig.setUserSuppliedProperty(
          RandomizedDataChunkHadoopMapper.RANDOMIZED_DATA_CHUNK_MAP_TASK_OPTIONS,
          environmentSubstitute(Utils
              .joinOptions(randomizeMapOptions.toArray(new String[randomizeMapOptions.size()]))));

      // Need these for row parsing via open-csv
      m_mrConfig.setUserSuppliedProperty(
          CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS,
          environmentSubstitute(getCSVMapTaskOptions()));

      int numChunks = 0;
      if (DistributedJobConfig.isEmpty(getNumRandomizedDataChunks())
          && DistributedJobConfig.isEmpty(getNumInstancesPerRandomizedDataChunk())) {
        throw new DistributedWekaException("Must specify either the number "
            + "of chunks to create or the number of instances per chunk");
      }

      if (!DistributedJobConfig.isEmpty(getNumRandomizedDataChunks())) {
        try {
          numChunks = Integer.parseInt(environmentSubstitute(getNumRandomizedDataChunks()));
        } catch (NumberFormatException ex) {
          throw new DistributedWekaException(ex);
        }
      } else {
        int numInsts = 0;
        try {
          numInsts = Integer.parseInt(environmentSubstitute(getNumInstancesPerRandomizedDataChunk()));
        } catch (NumberFormatException ex) {
          throw new DistributedWekaException(ex);
        }

        if (numInsts <= 0) {
          throw new DistributedWekaException("Number of instances per chunk must be > 0");
        }

        if (numInsts > totalNumInstances) {
          throw new DistributedWekaException("Can't have more instances per chunk than "
              + "there are instances in the dataset!");
        }

        double nc = (double) totalNumInstances / numInsts;
        nc = Math.ceil(nc);
        numChunks = (int) nc;
      }

      if (numChunks <= 1) {
        throw new DistributedWekaException("Can't randomize because number of data chunks <= 1");
      }

      m_mrConfig.setUserSuppliedProperty(RandomizedDataChunkHadoopReducer.NUM_DATA_CHUNKS,
          "" + numChunks);

      // set output path
      String outputPath = m_mrConfig.getOutputPath();
      outputPath += OUTPUT_SUBDIR;
      outputPath = environmentSubstitute(outputPath);
      m_mrConfig.setOutputPath(outputPath);

      // set number of reducers to 1 (otherwise we'll get more
      // chunks than we want!)
      m_mrConfig.setNumberOfReducers("1");

      installWekaLibrariesInHDFS(conf);

      Job job = null;
      try {
        job = m_mrConfig.configureForHadoop(
            "Create randomly shuffled input data chunk job - num chunks: " + numChunks, conf, m_env);
      } catch (ClassNotFoundException e) {
        throw new DistributedWekaException(e);
      }

      // setup multiple outputs
      for (int i = 0; i < numChunks; i++) {
        MultipleOutputs.addNamedOutput(job, "chunk" + i, TextOutputFormat.class, Text.class,
            Text.class);
      }

      // run the job!
      m_mrConfig.deleteOutputDirectory(job, m_env);

      statusMessage("Submitting randomized data chunk job ");
      logMessage("Submitting randomized data chunk job ");

      success = runJob(job);

      setJobStatus(success ? JobStatus.FINISHED : JobStatus.FAILED);

      if (!success) {
        statusMessage("Create randomly shuffled input data chunk job failed - check logs on Hadoop");
        logMessage("Create randomly shuffled input data chunk job failed - check logs on Hadoop");
      } else {
        // need to tidy up in the output directory - for some reason
        // there seems to be a spurious part-r-00000 with size 0 created
        String toDelete = outputPath + "/part-r-00000";
        HDFSUtils.deleteFile(m_mrConfig.getHDFSConfig(), conf, toDelete, m_env);
      }
    } catch (Exception ex) {
      setJobStatus(JobStatus.FAILED);
      throw new DistributedWekaException(ex);
    }
  } finally {
    Thread.currentThread().setContextClassLoader(orig);
  }

  return success;
}
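The loop above registers one named output per data chunk ("chunk0", "chunk1", ...), which is why the earlier short-circuit check looks for a chunk0-r-00000 file in the output directory. On the reduce side those named outputs are written through a MultipleOutputs instance. The sketch below shows that pattern in its simplest form; the real RandomizedDataChunkHadoopReducer is driven by the NUM_DATA_CHUNKS property set above, so the hard-coded chunk count and round-robin routing here are assumptions for illustration only:

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class ChunkWritingReducer extends Reducer<Text, Text, Text, Text> {

  // Hypothetical: the real job passes the chunk count through the job configuration.
  private static final int NUM_CHUNKS = 10;

  private MultipleOutputs<Text, Text> m_mos;
  private int m_next = 0;

  @Override
  protected void setup(Context context) {
    m_mos = new MultipleOutputs<Text, Text>(context);
  }

  @Override
  protected void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    for (Text value : values) {
      // Route each record to one of the named outputs registered with
      // addNamedOutput() in the driver; records for "chunkN" end up in
      // files named chunkN-r-xxxxx in the job's output directory.
      m_mos.write("chunk" + m_next, key, value);
      m_next = (m_next + 1) % NUM_CHUNKS;
    }
  }

  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    // Closing MultipleOutputs flushes the named-output record writers;
    // skipping this typically leaves the chunk files empty.
    m_mos.close();
  }
}

Note that the job above also sets the number of reducers to 1, so every chunkN named output is written by a single reduce task and produces exactly one chunkN-r-00000 file.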