Example usage for org.apache.hadoop.mapred JobConf setCombinerClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> theClass) 

Document

Set the user-defined combiner class used to combine map-outputs before being sent to the reducers.
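
Before the project examples below, here is a minimal, self-contained sketch of where setCombinerClass fits in a word-count driver built on the old org.apache.hadoop.mapred API. It is not taken from any of the projects listed on this page; the class, job, and argument names (WordCountWithCombiner, TokenMapper, SumReducer) are hypothetical placeholders. The combiner reuses the reducer class, which is safe here because summing IntWritable counts is commutative and associative, and Hadoop may run the combiner zero, one, or more times per map output.

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public class WordCountWithCombiner {

    /** Emits (word, 1) for every token in the input line. */
    public static class TokenMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                output.collect(word, ONE);
            }
        }
    }

    /** Sums counts; used as both combiner and reducer. */
    public static class SumReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(WordCountWithCombiner.class);
        conf.setJobName("wordcount-with-combiner");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(TokenMapper.class);
        conf.setCombinerClass(SumReducer.class); // combine map outputs locally before the shuffle
        conf.setReducerClass(SumReducer.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}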

Usage

From source file:org.asayler.WikiTitleCount.java

License:Apache License

/**
 * The main driver for wikititlecount map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WikiTitleCount.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    int num_maps = 1;
    int num_reducers = 1;

    conf.setJobName("wikititlecount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    /** Set Default Mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    /** Set Default Reducers */
    num_reducers = (int) (cluster.getMaxReduceTasks() * 0.9);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            other_args.add(args[i]);
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set number of map and reduce tasks */
    conf.setNumMapTasks(num_maps);
    conf.setNumReduceTasks(num_reducers);

    JobClient.runJob(conf);
    return 0;
}

From source file:org.hadoop.tdg.MaxTemperatureDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Max temperature");

    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setMapperClass(MaxTemperatureMapper.class);
    conf.setCombinerClass(MaxTemperatureReducer.class);
    conf.setReducerClass(MaxTemperatureReducer.class);

    JobClient.runJob(conf);
    return 0;
}

From source file:org.mitre.bio.mapred.TotalSequenceLength.java

License:Open Source License

/**
 * Init the job with the given parameters and run it.
 *
 * @param jobConf   the hadoop job configuration
 * @param input     input {@link SequenceFile} path
 * @param output    output path (this will contain ONE part with the length)
 * @param cleanLogs if <code>true</code> remove the logs
 * @return zero if successful
 * @throws java.lang.Exception
 */
public int initJob(JobConf jobConf, String input, String output, boolean cleanLogs) throws Exception {
    JobConf conf = new JobConf(jobConf, TotalSequenceLength.class);
    conf.setJobName("TotalSequenceLength");

    // We can only handle one reducer
    if (conf.getNumReduceTasks() != 1) {
        conf.setNumReduceTasks(1);
        LOG.info("Setting number of reducers to ONE!");
    }

    SequenceFileInputFormat.setInputPaths(conf, new Path(input));
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapperClass(SequenceMapClass.class);
    conf.setOutputKeyClass(IntWritable.class); // map output key class
    conf.setOutputValueClass(IntWritable.class); // map output value class

    conf.setCombinerClass(LengthReduceClass.class);
    conf.setReducerClass(LengthReduceClass.class);
    FileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);

    if (cleanLogs) {
        LOG.info("removing log directory");
        Path path = new Path(output, "_logs");
        FileSystem fs = path.getFileSystem(jobConf);
        fs.delete(path, true);
    }

    return 0;
}

From source file:org.mitre.ccv.mapred.CalculateKmerCounts.java

License:Open Source License

/**
 * Start up a map-reduce job with the given parameters.
 *
 * <P>Setting the system property "kmer.count.parent.fast.map" will result in this using a {@link java.util.Map}
 * to speed up the output of kmers at the expense of memory.
 *
 * @param jobConf
 * @param start     starting window size
 * @param end       ending window size
 * @param input
 * @param output
 * @return
 * @throws java.lang.Exception
 */
public int initJob(JobConf jobConf, int start, int end, String input, String output) throws Exception {
    JobConf conf = new JobConf(jobConf, CalculateKmerCounts.class);
    conf.setJobName("CalculateKmerCounts");

    if (start <= 2)
        throw new IllegalArgumentException("Value of 'start' argument must be larger than 2");

    // Save our window size so that the tasks have access to them
    conf.set(START, Integer.toString(start));
    conf.set(END, Integer.toString(end));
    //conf.set(FAST_MAP, fastMap ? "Y":"N");

    // Set up mapper
    SequenceFileInputFormat.setInputPaths(conf, new Path(input));
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapperClass(KmerCountMap.class);
    conf.setOutputKeyClass(Text.class); // map output key class
    conf.setOutputValueClass(KmerCountWritable.class); // map output value class

    // Set up combiner/reducer
    conf.setCombinerClass(KmerCountReducer.class);
    conf.setReducerClass(KmerCountReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);

    return 0;
}

From source file:org.mitre.ccv.mapred.CalculateKmerRevisedRelativeEntropy.java

License:Open Source License

/**
 * Start up a map-reduce job with the given parameters.
 *
 * @param jobConf
 * @param globalInput   input path of the global k-mer/pi-value pairs
 * @param cvInput       input path of the CompositionVector {@link SequenceFile}s
 * @param output        output path
 * @param cleanLogs     if <code>true</code> remove the logs
 * @return
 * @throws java.lang.Exception
 */
public int initJob(JobConf jobConf, String globalInput, String cvInput, String output, boolean cleanLogs)
        throws Exception {
    JobConf conf = new JobConf(jobConf, CalculateKmerRevisedRelativeEntropy.class);
    conf.setJobName("CalculateKmerRevisedRelativeEntropy");

    /** 
     * Set up paths
     */
    String ts = FileUtils.getSimpleDate();
    String cvOutput = output + "_" + ts + COMPOSITION_VECTORS_KMER_POSTFIX;

    /** commaSeparatedPaths */
    String mergedInput = cvOutput + "," + globalInput;

    /** merged output */
    String mergedOutput = output + "_" + ts + MERGED_KMER_POSTFIX;

    /**
     * First, map all the CompositionVector's k-mers to Text as keys and
     * local k-mer/value pairs (KmerPiValuePairWritables) as values.
     */
    JobConf subConf = new JobConf(conf);
    subConf.setJobName("CalculateKmerRevisedRelativeEntropy-CompositionVectors");
    // setup mapper
    SequenceFileInputFormat.setInputPaths(subConf, cvInput);
    subConf.setInputFormat(SequenceFileInputFormat.class);
    subConf.setMapperClass(CompositionVectorMap.class);
    subConf.setOutputKeyClass(Text.class); // job output key class
    subConf.setOutputValueClass(StringDoublePairWritable.class); // job output value class

    // Uses default reducer (IdentityReducer)
    subConf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(subConf, new Path(cvOutput));
    LOG.info("Converting CompositionVectors to k-mer/pi-value pairs.");
    JobClient.runJob(subConf);

    /**
     * Second, map (merge) all the k-mer/pi-value pairs together in an
     * array of values (KmerPiValueArrayWritables).
     */
    subConf = new JobConf(conf);
    subConf.setJobName("CalculateKmerRevisedRelativeEntropy-Merging");
    // setup mapper
    SequenceFileInputFormat.setInputPaths(subConf, mergedInput);
    subConf.setInputFormat(SequenceFileInputFormat.class);
    subConf.setMapperClass(MergeMap.class);
    subConf.setOutputKeyClass(Text.class);
    subConf.setOutputValueClass(KmerPiValueArrayWritable.class);

    // setup combiner/reducer
    subConf.setCombinerClass(MergeReducer.class);
    subConf.setReducerClass(MergeReducer.class);
    subConf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(subConf, new Path(mergedOutput));
    LOG.info("Merging k-mers/pi-values from CompositionVectors and all sequences (global)");
    JobClient.runJob(subConf);

    /**
     * Third, calculate entropies (map-reduce)
     */
    subConf = new JobConf(conf);
    subConf.setJobName("CalculateKmerRevisedRelativeEntropy-RRE");
    // setup mapper
    SequenceFileInputFormat.setInputPaths(subConf, mergedOutput);
    subConf.setInputFormat(SequenceFileInputFormat.class);
    subConf.setMapperClass(EntropyMap.class);
    subConf.setOutputKeyClass(Text.class);
    subConf.setOutputValueClass(KmerEntropyPairWritable.class);

    // Setup Combiner and Reducer
    subConf.setCombinerClass(EntropyCombiner.class);
    subConf.setReducerClass(EntropyReducer.class);
    if (conf.getBoolean(TEXT_OUTPUT, false)) {
        FileOutputFormat.setOutputPath(subConf, new Path(output));
    } else {
        subConf.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(subConf, new Path(output));
    }

    LOG.info("Calculating entropies");
    JobClient.runJob(subConf);

    /**
     * Remove tmp directories
     */
    Path tmp = new Path(cvOutput);
    FileSystem fs = tmp.getFileSystem(conf);
    fs.delete(tmp, true);
    tmp = new Path(mergedOutput);
    fs.delete(tmp, true);

    return 0;
}

From source file:org.mitre.ccv.mapred.InvertKmerProbabilities.java

License:Open Source License

/**
 * Start up the job with the given parameters.
 *
 * @param jobConf       The {@link JobConf} to use
 * @param input         path to the {@link SequenceFile}s
 * @param output        path to save the output
 * @param cleanLogs     if <code>true</code> remove the logs
 * @return
 * @throws java.lang.Exception
 */
public int initJob(JobConf jobConf, String input, String output, boolean cleanLogs) throws Exception {
    JobConf conf = new JobConf(jobConf, InvertKmerProbabilities.class);
    conf.setJobName("InvertKmerFrequencies");

    // Set up mapper
    SequenceFileInputFormat.setInputPaths(conf, new Path(input));
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapperClass(InverterMapper.class);
    conf.setOutputKeyClass(Text.class); // final output key class
    conf.setOutputValueClass(KmerProbabilityMapWritable.class); // final output value class

    // Set up combiner/reducer
    conf.setCombinerClass(InverterReducer.class);
    conf.setReducerClass(InverterReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);

    if (cleanLogs) {
        LOG.info("removing log directory");
        Path path = new Path(output, "_logs");
        FileSystem fs = path.getFileSystem(jobConf);
        fs.delete(path, true);
    }
    return 0;
}

From source file:org.pentaho.di.job.entries.hadoopjobexecutor.JobEntryHadoopJobExecutor.java

License:Apache License

public Result execute(Result result, int arg1) throws KettleException {
    result.setNrErrors(0);

    Log4jFileAppender appender = null;
    String logFileName = "pdi-" + this.getName(); //$NON-NLS-1$

    String hadoopDistro = System.getProperty("hadoop.distribution.name", hadoopDistribution);
    hadoopDistro = environmentSubstitute(hadoopDistro);
    if (Const.isEmpty(hadoopDistro)) {
        hadoopDistro = "generic";
    }

    try {
        appender = LogWriter.createFileAppender(logFileName, true, false);
        LogWriter.getInstance().addAppender(appender);
        log.setLogLevel(parentJob.getLogLevel());
    } catch (Exception e) {
        logError(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.FailedToOpenLogFile", logFileName, //$NON-NLS-1$
                e.toString()));
        logError(Const.getStackTracker(e));
    }

    try {
        URL resolvedJarUrl = null;
        String jarUrlS = environmentSubstitute(jarUrl);
        if (jarUrlS.indexOf("://") == -1) {
            // default to file://
            File jarFile = new File(jarUrlS);
            resolvedJarUrl = jarFile.toURI().toURL();
        } else {
            resolvedJarUrl = new URL(jarUrlS);
        }

        final String cmdLineArgsS = environmentSubstitute(cmdLineArgs);

        if (log.isDetailed())
            logDetailed(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.ResolvedJar",
                    resolvedJarUrl.toExternalForm()));

        if (isSimple) {
            /*      final AtomicInteger taskCount = new AtomicInteger(0);
                  final AtomicInteger successCount = new AtomicInteger(0);
                  final AtomicInteger failedCount = new AtomicInteger(0); */

            if (log.isDetailed())
                logDetailed(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.SimpleMode"));
            List<Class<?>> classesWithMains = JarUtility
                    .getClassesInJarWithMain(resolvedJarUrl.toExternalForm(), getClass().getClassLoader());
            for (final Class<?> clazz : classesWithMains) {
                Runnable r = new Runnable() {
                    public void run() {
                        try {
                            final ClassLoader cl = Thread.currentThread().getContextClassLoader();
                            try {
                                //                  taskCount.incrementAndGet();
                                Thread.currentThread().setContextClassLoader(clazz.getClassLoader());
                                Method mainMethod = clazz.getMethod("main", new Class[] { String[].class });
                                Object[] args = (cmdLineArgsS != null)
                                        ? new Object[] { cmdLineArgsS.split(" ") }
                                        : new Object[0];
                                mainMethod.invoke(null, args);
                            } finally {
                                Thread.currentThread().setContextClassLoader(cl);
                                //                  successCount.incrementAndGet();
                                //                  taskCount.decrementAndGet();
                            }
                        } catch (Throwable ignored) {
                            // skip, try the next one
                            //                logError(ignored.getMessage());
                            //                failedCount.incrementAndGet();
                            ignored.printStackTrace();
                        }
                    }
                };
                Thread t = new Thread(r);
                t.start();
            }

            // uncomment to implement blocking
            /* if (blocking) {
              while (taskCount.get() > 0 && !parentJob.isStopped()) {
                Thread.sleep(1000);
              }
                    
              if (!parentJob.isStopped()) {
                result.setResult(successCount.get() > 0);
                result.setNrErrors((successCount.get() > 0) ? 0 : 1);
              } else {
                // we can't really know at this stage if 
                // the hadoop job will finish successfully 
                // because we have to stop now
                result.setResult(true); // look on the bright side of life :-)...
                result.setNrErrors(0);
              }
            } else { */
            // non-blocking - just set success equal to no failures arising
            // from invocation
            //          result.setResult(failedCount.get() == 0);
            //          result.setNrErrors(failedCount.get());
            result.setResult(true);
            result.setNrErrors(0);
            /* } */
        } else {
            if (log.isDetailed())
                logDetailed(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.AdvancedMode"));

            URL[] urls = new URL[] { resolvedJarUrl };
            URLClassLoader loader = new URLClassLoader(urls, getClass().getClassLoader());

            JobConf conf = new JobConf();
            String hadoopJobNameS = environmentSubstitute(hadoopJobName);
            conf.setJobName(hadoopJobNameS);

            String outputKeyClassS = environmentSubstitute(outputKeyClass);
            conf.setOutputKeyClass(loader.loadClass(outputKeyClassS));
            String outputValueClassS = environmentSubstitute(outputValueClass);
            conf.setOutputValueClass(loader.loadClass(outputValueClassS));

            if (mapperClass != null) {
                String mapperClassS = environmentSubstitute(mapperClass);
                Class<? extends Mapper> mapper = (Class<? extends Mapper>) loader.loadClass(mapperClassS);
                conf.setMapperClass(mapper);
            }
            if (combinerClass != null) {
                String combinerClassS = environmentSubstitute(combinerClass);
                Class<? extends Reducer> combiner = (Class<? extends Reducer>) loader.loadClass(combinerClassS);
                conf.setCombinerClass(combiner);
            }
            if (reducerClass != null) {
                String reducerClassS = environmentSubstitute(reducerClass);
                Class<? extends Reducer> reducer = (Class<? extends Reducer>) loader.loadClass(reducerClassS);
                conf.setReducerClass(reducer);
            }

            if (inputFormatClass != null) {
                String inputFormatClassS = environmentSubstitute(inputFormatClass);
                Class<? extends InputFormat> inputFormat = (Class<? extends InputFormat>) loader
                        .loadClass(inputFormatClassS);
                conf.setInputFormat(inputFormat);
            }
            if (outputFormatClass != null) {
                String outputFormatClassS = environmentSubstitute(outputFormatClass);
                Class<? extends OutputFormat> outputFormat = (Class<? extends OutputFormat>) loader
                        .loadClass(outputFormatClassS);
                conf.setOutputFormat(outputFormat);
            }

            String hdfsHostnameS = environmentSubstitute(hdfsHostname);
            String hdfsPortS = environmentSubstitute(hdfsPort);
            String jobTrackerHostnameS = environmentSubstitute(jobTrackerHostname);
            String jobTrackerPortS = environmentSubstitute(jobTrackerPort);

            // See if we can auto detect the distribution first
            HadoopConfigurer configurer = HadoopConfigurerFactory.locateConfigurer();

            if (configurer == null) {
                // go with what has been selected by the user
                configurer = HadoopConfigurerFactory.getConfigurer(hadoopDistro);

                // if the user-specified distribution is detectable, make sure it is still
                // the current distribution!
                if (configurer != null && configurer.isDetectable()) {
                    if (!configurer.isAvailable()) {
                        throw new KettleException(BaseMessages.getString(PKG,
                                "JobEntryHadoopJobExecutor.Error.DistroNoLongerPresent",
                                configurer.distributionName()));
                    }
                }
            }
            if (configurer == null) {
                throw new KettleException(BaseMessages.getString(PKG,
                        "JobEntryHadoopJobExecutor.Error.UnknownHadoopDistribution", hadoopDistro));
            }
            logBasic(BaseMessages.getString(PKG, "JobEntryHadoopJobExecutor.Message.DistroConfigMessage",
                    configurer.distributionName()));

            List<String> configMessages = new ArrayList<String>();
            configurer.configure(hdfsHostnameS, hdfsPortS, jobTrackerHostnameS, jobTrackerPortS, conf,
                    configMessages);
            for (String m : configMessages) {
                logBasic(m);
            }

            String inputPathS = environmentSubstitute(inputPath);
            String[] inputPathParts = inputPathS.split(",");
            List<Path> paths = new ArrayList<Path>();
            for (String path : inputPathParts) {
                paths.add(new Path(configurer.getFilesystemURL() + path));
            }
            Path[] finalPaths = paths.toArray(new Path[paths.size()]);

            //FileInputFormat.setInputPaths(conf, new Path(configurer.getFilesystemURL() + inputPathS));
            FileInputFormat.setInputPaths(conf, finalPaths);
            String outputPathS = environmentSubstitute(outputPath);
            FileOutputFormat.setOutputPath(conf, new Path(configurer.getFilesystemURL() + outputPathS));

            // process user defined values
            for (UserDefinedItem item : userDefined) {
                if (item.getName() != null && !"".equals(item.getName()) && item.getValue() != null
                        && !"".equals(item.getValue())) {
                    String nameS = environmentSubstitute(item.getName());
                    String valueS = environmentSubstitute(item.getValue());
                    conf.set(nameS, valueS);
                }
            }

            String workingDirectoryS = environmentSubstitute(workingDirectory);
            conf.setWorkingDirectory(new Path(configurer.getFilesystemURL() + workingDirectoryS));
            conf.setJar(jarUrl);

            String numMapTasksS = environmentSubstitute(numMapTasks);
            String numReduceTasksS = environmentSubstitute(numReduceTasks);
            int numM = 1;
            try {
                numM = Integer.parseInt(numMapTasksS);
            } catch (NumberFormatException e) {
                logError("Can't parse number of map tasks '" + numMapTasksS + "'. Setting num"
                        + "map tasks to 1");
            }
            int numR = 1;
            try {
                numR = Integer.parseInt(numReduceTasksS);
            } catch (NumberFormatException e) {
                logError("Can't parse number of reduce tasks '" + numReduceTasksS + "'. Setting num"
                        + "reduce tasks to 1");
            }

            conf.setNumMapTasks(numM);
            conf.setNumReduceTasks(numR);

            JobClient jobClient = new JobClient(conf);
            RunningJob runningJob = jobClient.submitJob(conf);

            String loggingIntervalS = environmentSubstitute(loggingInterval);
            int logIntv = 60;
            try {
                logIntv = Integer.parseInt(loggingIntervalS);
            } catch (NumberFormatException e) {
                logError("Can't parse logging interval '" + loggingIntervalS + "'. Setting "
                        + "logging interval to 60");
            }
            if (blocking) {
                try {
                    int taskCompletionEventIndex = 0;
                    while (!parentJob.isStopped() && !runningJob.isComplete()) {
                        if (logIntv >= 1) {
                            printJobStatus(runningJob);
                            taskCompletionEventIndex = logTaskMessages(runningJob, taskCompletionEventIndex);
                            Thread.sleep(logIntv * 1000);
                        } else {
                            Thread.sleep(60000);
                        }
                    }

                    if (parentJob.isStopped() && !runningJob.isComplete()) {
                        // We must stop the job running on Hadoop
                        runningJob.killJob();
                        // Indicate this job entry did not complete
                        result.setResult(false);
                    }

                    printJobStatus(runningJob);
                    // Log any messages we may have missed while polling
                    logTaskMessages(runningJob, taskCompletionEventIndex);
                } catch (InterruptedException ie) {
                    logError(ie.getMessage(), ie);
                }

                // Entry is successful if the MR job is successful overall
                result.setResult(runningJob.isSuccessful());
            }

        }
    } catch (Throwable t) {
        t.printStackTrace();
        result.setStopped(true);
        result.setNrErrors(1);
        result.setResult(false);
        logError(t.getMessage(), t);
    }

    if (appender != null) {
        LogWriter.getInstance().removeAppender(appender);
        appender.close();

        ResultFile resultFile = new ResultFile(ResultFile.FILE_TYPE_LOG, appender.getFile(),
                parentJob.getJobname(), getName());
        result.getResultFiles().put(resultFile.getFile().toString(), resultFile);
    }

    return result;
}

From source file:org.pentaho.hadoop.mapreduce.test.MapperAndReducerTest.java

License:Open Source License

public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile,
        String reducerTransformationFile, String hostname, String hdfsPort, String trackerPort)
        throws IOException, KettleException {

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    KettleEnvironment.init();

    // Register Map/Reduce Input and Map/Reduce Output plugin steps
    PluginMainClassType mainClassTypesAnnotation = StepPluginType.class
            .getAnnotation(PluginMainClassType.class);

    Map<Class<?>, String> inputClassMap = new HashMap<Class<?>, String>();
    inputClassMap.put(mainClassTypesAnnotation.value(), HadoopEnterMeta.class.getName());
    PluginInterface inputStepPlugin = new Plugin(new String[] { "HadoopEnterPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Input",
            "Enter a Hadoop Mapper or Reducer transformation", "MRI.png", false, false, inputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, inputStepPlugin);

    Map<Class<?>, String> outputClassMap = new HashMap<Class<?>, String>();
    outputClassMap.put(mainClassTypesAnnotation.value(), HadoopExitMeta.class.getName());
    PluginInterface outputStepPlugin = new Plugin(new String[] { "HadoopExitPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Output",
            "Exit a Hadoop Mapper or Reducer transformation", "MRO.png", false, false, outputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, outputStepPlugin);

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();

    TransMeta transMeta = null;
    TransConfiguration transConfig = null;

    if (mapperTransformationFile != null) {
        conf.setMapRunnerClass(PentahoMapRunnable.class);
        transMeta = new TransMeta(mapperTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-map-xml", transConfig.getXML());
        conf.set("transformation-map-input-stepname", "Injector");
        conf.set("transformation-map-output-stepname", "Output");
    }

    if (combinerTransformationFile != null) {
        conf.setCombinerClass(GenericTransCombiner.class);
        transMeta = new TransMeta(combinerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-combiner-xml", transConfig.getXML());
        conf.set("transformation-combiner-input-stepname", "Injector");
        conf.set("transformation-combiner-output-stepname", "Output");
    }

    if (reducerTransformationFile != null) {
        conf.setReducerClass((Class<? extends Reducer>) GenericTransReduce.class);
        transMeta = new TransMeta(reducerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-reduce-xml", transConfig.getXML());
        conf.set("transformation-reduce-input-stepname", "Injector");
        conf.set("transformation-reduce-output-stepname", "Output");
    }

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path("/"));
    FileOutputFormat.setOutputPath(conf, new Path("/"));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJar(jar.toURI().toURL().toExternalForm());
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    return conf;
}

From source file:org.pentaho.hadoop.mapreduce.test.PentahoMapReduceIntegrationTest.java

License:Apache License

public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile,
        String reducerTransformationFile, String hostname, String hdfsPort, String trackerPort)
        throws IOException, KettleException {

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    KettleEnvironment.init();

    // Register Map/Reduce Input and Map/Reduce Output plugin steps
    PluginMainClassType mainClassTypesAnnotation = StepPluginType.class
            .getAnnotation(PluginMainClassType.class);

    Map<Class<?>, String> inputClassMap = new HashMap<Class<?>, String>();
    inputClassMap.put(mainClassTypesAnnotation.value(), HadoopEnterMeta.class.getName());
    PluginInterface inputStepPlugin = new Plugin(new String[] { "HadoopEnterPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Input",
            "Enter a Hadoop Mapper or Reducer transformation", "MRI.png", false, false, inputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, inputStepPlugin);

    Map<Class<?>, String> outputClassMap = new HashMap<Class<?>, String>();
    outputClassMap.put(mainClassTypesAnnotation.value(), HadoopExitMeta.class.getName());
    PluginInterface outputStepPlugin = new Plugin(new String[] { "HadoopExitPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Output",
            "Exit a Hadoop Mapper or Reducer transformation", "MRO.png", false, false, outputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, outputStepPlugin);

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();

    TransMeta transMeta = null;
    TransConfiguration transConfig = null;

    if (mapperTransformationFile != null) {
        conf.setMapRunnerClass(PentahoMapRunnable.class);
        transMeta = new TransMeta(mapperTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-map-xml", transConfig.getXML());
        conf.set("transformation-map-input-stepname", "Injector");
        conf.set("transformation-map-output-stepname", "Output");
    }

    if (combinerTransformationFile != null) {
        conf.setCombinerClass(GenericTransCombiner.class);
        transMeta = new TransMeta(combinerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-combiner-xml", transConfig.getXML());
        conf.set("transformation-combiner-input-stepname", "Injector");
        conf.set("transformation-combiner-output-stepname", "Output");
    }

    if (reducerTransformationFile != null) {
        conf.setReducerClass(GenericTransReduce.class);
        transMeta = new TransMeta(reducerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-reduce-xml", transConfig.getXML());
        conf.set("transformation-reduce-input-stepname", "Injector");
        conf.set("transformation-reduce-output-stepname", "Output");
    }

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path("/"));
    FileOutputFormat.setOutputPath(conf, new Path("/"));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJar(jar.toURI().toURL().toExternalForm());
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    return conf;
}

From source file:org.pentaho.hadoop.mapreduce.test.TestSubmitMapReduceJob.java

License:Open Source License

@Test
public void submitJob() throws Exception {

    String[] args = { "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/input",
            "hdfs://" + hostname + ":" + hdfsPort + "/junit/wordcount/output" };

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./test-res/pentaho-mapreduce-sample.jar");

    URLClassLoader loader = new URLClassLoader(new URL[] { jar.toURI().toURL() });

    conf.setMapperClass(
            (Class<? extends Mapper>) loader.loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Map"));
    conf.setCombinerClass((Class<? extends Reducer>) loader
            .loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Reduce"));
    conf.setReducerClass((Class<? extends Reducer>) loader
            .loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount$Reduce"));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJarByClass(loader.loadClass("org.pentaho.hadoop.mapreduce.sample.MRWordCount"));
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    JobClient jobClient = new JobClient(conf);
    ClusterStatus status = jobClient.getClusterStatus();
    assertEquals(State.RUNNING, status.getJobTrackerState());

    RunningJob runningJob = jobClient.submitJob(conf);
    System.out.print("Running " + runningJob.getJobName() + "");
    while (!runningJob.isComplete()) {
        System.out.print(".");
        Thread.sleep(500);
    }
    System.out.println();
    System.out.println("Finished " + runningJob.getJobName() + ".");

    FileObject file = fsManager.resolveFile(buildHDFSURL("/junit/wordcount/output/part-00000"));
    String output = IOUtils.toString(file.getContent().getInputStream());
    assertEquals("Bye\t1\nGoodbye\t1\nHadoop\t2\nHello\t2\nWorld\t2\n", output);
}