Example usage for org.apache.hadoop.mapred JobConf setMapRunnerClass

List of usage examples for org.apache.hadoop.mapred JobConf setMapRunnerClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setMapRunnerClass.

Prototype

public void setMapRunnerClass(Class<? extends MapRunnable> theClass) 

Document

Expert: Set the MapRunnable class for the job.
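
Before diving into the real-world examples below, here is a minimal, self-contained sketch of how setMapRunnerClass is typically wired into a job. The class names, paths, and command-line arguments (PassThroughJob, PassThroughRunner, args[0], args[1]) are hypothetical and not taken from any project on this page; the point is simply that a class implementing MapRunnable takes ownership of the record-reading loop that the default MapRunner would otherwise drive for a plain Mapper.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class PassThroughJob {

    /** A MapRunnable drives the input loop itself instead of delegating record-by-record to a Mapper. */
    public static class PassThroughRunner implements MapRunnable<LongWritable, Text, LongWritable, Text> {

        public void configure(JobConf job) {
            // read any job settings the runner needs here
        }

        public void run(RecordReader<LongWritable, Text> input, OutputCollector<LongWritable, Text> output,
                Reporter reporter) throws IOException {
            LongWritable key = input.createKey();
            Text value = input.createValue();
            // because the runner owns the loop, it can batch, multithread, or rate-limit reads as needed
            while (input.next(key, value)) {
                output.collect(key, value);
                reporter.progress();
            }
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf(PassThroughJob.class);
        job.setJobName("pass-through");

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // register the custom runner; this replaces the default MapRunner for every map task
        job.setMapRunnerClass(PassThroughRunner.class);

        FileInputFormat.addInputPath(job, new Path(args[0])); // hypothetical input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // hypothetical output path

        JobClient.runJob(job);
    }
}

Several of the examples that follow use the same pattern with the job class itself implementing MapRunnable and registered as its own runner (OldFetcher.class, SeleniumFetcher.class, S3GetMetdataJob.class, SampleHadoopJob.class), which gives the job direct control over threading and error handling within each input split.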

Usage

From source file:org.apache.nutch.fetcher.OldFetcher.java

License:Apache License

public void fetch(Path segment, int threads) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("OldFetcher: starting at " + sdf.format(start));
        LOG.info("OldFetcher: segment: " + segment);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(OldFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);
    long end = System.currentTimeMillis();
    LOG.info("OldFetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.selenium.fetcher.SeleniumFetcher.java

License:Apache License

public void fetch(Path segment, int threads, String zippedDriverPath) throws IOException, URISyntaxException {

    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    // push the zipped_webdriver binaries onto the DistributedCache
    DistributedCache.addCacheArchive(new URI(zippedDriverPath), job);

    job.set("webdriver.binaries.path", zippedDriverPath);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(SeleniumFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.pig.backend.hadoop.executionengine.mapreduceExec.MapReduceLauncher.java

License:Apache License

/**
 * Submit a Pig job to hadoop.
 * 
 * @param mapFuncs
 *            a list of map functions to apply to the inputs. The cardinality of the list should
 *            be the same as input's cardinality.
 * @param groupFuncs
 *            a list of grouping functions to apply to the inputs. The cardinality of the list
 *            should be the same as input's cardinality.
 * @param reduceFunc
 *            the reduce function.
 * @param mapTasks
 *            the number of map tasks to use.
 * @param reduceTasks
 *            the number of reduce tasks to use.
 * @param input
 *            a list of inputs
 * @param output
 *            the path of the output.
 * @return an indicator of success or failure.
 * @throws IOException
 */
public boolean launchPig(POMapreduce pom) throws IOException {
    JobConf conf = new JobConf(config);
    setJobProperties(conf, pom);
    Properties properties = pom.pigContext.getProperties();
    ConfigurationValidator.validatePigProperties(properties);
    String jobName = properties.getProperty(PigContext.JOB_NAME);
    conf.setJobName(jobName);
    boolean success = false;
    List<String> funcs = new ArrayList<String>();

    if (pom.toMap != null) {
        for (EvalSpec es : pom.toMap)
            funcs.addAll(es.getFuncs());
    }
    if (pom.groupFuncs != null) {
        for (EvalSpec es : pom.groupFuncs)
            funcs.addAll(es.getFuncs());
    }
    if (pom.toReduce != null) {
        funcs.addAll(pom.toReduce.getFuncs());
    }

    // create jobs.jar locally and pass it to hadoop
    File submitJarFile = File.createTempFile("Job", ".jar");
    try {
        FileOutputStream fos = new FileOutputStream(submitJarFile);
        JarManager.createJar(fos, funcs, null, pom.pigContext);
        log.debug("Job jar size = " + submitJarFile.length());
        conf.setJar(submitJarFile.getPath());
        String user = System.getProperty("user.name");
        conf.setUser(user != null ? user : "Pigster");

        conf.set("pig.spill.size.threshold", properties.getProperty("pig.spill.size.threshold"));
        conf.set("pig.spill.gc.activation.size", properties.getProperty("pig.spill.gc.activation.size"));

        if (pom.reduceParallelism != -1) {
            conf.setNumReduceTasks(pom.reduceParallelism);
        }
        if (pom.toMap != null) {
            conf.set("pig.mapFuncs", ObjectSerializer.serialize(pom.toMap));
        }
        if (pom.toCombine != null) {
            conf.set("pig.combineFunc", ObjectSerializer.serialize(pom.toCombine));
            // this is to make sure that combiner is only called once
            // since we can't handle no combine or multiple combines
            conf.setCombineOnceOnly(true);
        }
        if (pom.groupFuncs != null) {
            conf.set("pig.groupFuncs", ObjectSerializer.serialize(pom.groupFuncs));
        }
        if (pom.toReduce != null) {
            conf.set("pig.reduceFunc", ObjectSerializer.serialize(pom.toReduce));
        }
        if (pom.toSplit != null) {
            conf.set("pig.splitSpec", ObjectSerializer.serialize(pom.toSplit));
        }
        if (pom.pigContext != null) {
            conf.set("pig.pigContext", ObjectSerializer.serialize(pom.pigContext));
        }
        conf.setMapRunnerClass(PigMapReduce.class);
        if (pom.toCombine != null) {
            conf.setCombinerClass(PigCombine.class);
            //conf.setCombinerClass(PigMapReduce.class);
        }
        if (pom.quantilesFile != null) {
            conf.set("pig.quantilesFile", pom.quantilesFile);
        } else {
            // this is not a sort job - can use byte comparison to speed up processing
            conf.setOutputKeyComparatorClass(PigWritableComparator.class);
        }
        if (pom.partitionFunction != null) {
            conf.setPartitionerClass(SortPartitioner.class);
        }
        conf.setReducerClass(PigMapReduce.class);
        conf.setInputFormat(PigInputFormat.class);
        conf.setOutputFormat(PigOutputFormat.class);
        // not used starting with 0.15 conf.setInputKeyClass(Text.class);
        // not used starting with 0.15 conf.setInputValueClass(Tuple.class);
        conf.setOutputKeyClass(Tuple.class);
        if (pom.userComparator != null) {
            conf.setOutputKeyComparatorClass(pom.userComparator);
        }
        conf.setOutputValueClass(IndexedTuple.class);
        conf.set("pig.inputs", ObjectSerializer.serialize(pom.inputFileSpecs));

        conf.setOutputPath(new Path(pom.outputFileSpec.getFileName()));
        conf.set("pig.storeFunc", ObjectSerializer.serialize(pom.outputFileSpec.getFuncSpec()));

        // Setup the DistributedCache for this job
        setupDistributedCache(pom.pigContext, conf, pom.properties, "pig.streaming.ship.files", true);
        setupDistributedCache(pom.pigContext, conf, pom.properties, "pig.streaming.cache.files", false);

        // Setup the logs directory for this job
        String jobOutputFileName = pom.pigContext.getJobOutputFile();
        if (jobOutputFileName != null && jobOutputFileName.length() > 0) {
            Path jobOutputFile = new Path(pom.pigContext.getJobOutputFile());
            conf.set("pig.output.dir", jobOutputFile.getParent().toString());
            conf.set("pig.streaming.log.dir", new Path(jobOutputFile, LOG_DIR).toString());
        }

        //
        // Now, actually submit the job (using the submit name)
        //
        JobClient jobClient = execEngine.getJobClient();
        RunningJob status = jobClient.submitJob(conf);
        log.debug("submitted job: " + status.getJobID());

        long sleepTime = 1000;
        double lastQueryProgress = -1.0;
        int lastJobsQueued = -1;
        double lastMapProgress = -1.0;
        double lastReduceProgress = -1.0;
        while (true) {
            try {
                Thread.sleep(sleepTime);
            } catch (Exception e) {
            }

            if (status.isComplete()) {
                success = status.isSuccessful();
                if (log.isDebugEnabled()) {
                    StringBuilder sb = new StringBuilder();
                    sb.append("Job finished ");
                    sb.append((success ? "" : "un"));
                    sb.append("successfully");
                    log.debug(sb.toString());
                }
                if (success) {
                    mrJobNumber++;
                }
                double queryProgress = ((double) mrJobNumber) / ((double) numMRJobs);
                if (queryProgress > lastQueryProgress) {
                    if (log.isInfoEnabled()) {
                        StringBuilder sbProgress = new StringBuilder();
                        sbProgress.append("Pig progress = ");
                        sbProgress.append(((int) (queryProgress * 100)));
                        sbProgress.append("%");
                        log.info(sbProgress.toString());
                    }
                    lastQueryProgress = queryProgress;
                }
                break;
            } else // still running
            {
                double mapProgress = status.mapProgress();
                double reduceProgress = status.reduceProgress();
                if (lastMapProgress != mapProgress || lastReduceProgress != reduceProgress) {
                    if (log.isDebugEnabled()) {
                        StringBuilder sbProgress = new StringBuilder();
                        sbProgress.append("Hadoop job progress: Map=");
                        sbProgress.append((int) (mapProgress * 100));
                        sbProgress.append("% Reduce=");
                        sbProgress.append((int) (reduceProgress * 100));
                        sbProgress.append("%");
                        log.debug(sbProgress.toString());
                    }
                    lastMapProgress = mapProgress;
                    lastReduceProgress = reduceProgress;
                }
                double numJobsCompleted = mrJobNumber;
                double thisJobProgress = (mapProgress + reduceProgress) / 2.0;
                double queryProgress = (numJobsCompleted + thisJobProgress) / ((double) numMRJobs);
                if (queryProgress > lastQueryProgress) {
                    if (log.isInfoEnabled()) {
                        StringBuilder sbProgress = new StringBuilder();
                        sbProgress.append("Pig progress = ");
                        sbProgress.append(((int) (queryProgress * 100)));
                        sbProgress.append("%");
                        log.info(sbProgress.toString());
                    }
                    lastQueryProgress = queryProgress;
                }
            }
        }

        // bug 1030028: if the input file is empty, hadoop doesn't create the output file!
        Path outputFile = conf.getOutputPath();
        String outputName = outputFile.getName();
        int colon = outputName.indexOf(':');
        if (colon != -1) {
            outputFile = new Path(outputFile.getParent(), outputName.substring(0, colon));
        }

        try {
            ElementDescriptor descriptor = ((HDataStorage) (pom.pigContext.getDfs()))
                    .asElement(outputFile.toString());

            if (success && !descriptor.exists()) {

                // create an empty output file
                PigFile f = new PigFile(outputFile.toString(), false);
                f.store(BagFactory.getInstance().newDefaultBag(), new PigStorage(), pom.pigContext);
            }
        } catch (DataStorageException e) {
            throw WrappedIOException.wrap("Failed to obtain descriptor for " + outputFile.toString(), e);
        }

        if (!success) {
            // go find the error messages
            getErrorMessages(jobClient.getMapTaskReports(status.getJobID()), "map");
            getErrorMessages(jobClient.getReduceTaskReports(status.getJobID()), "reduce");
        } else {
            long timeSpent = 0;

            // NOTE: this call is crashing due to a bug in Hadoop; the bug is known and the patch has not been applied yet.
            TaskReport[] mapReports = jobClient.getMapTaskReports(status.getJobID());
            TaskReport[] reduceReports = jobClient.getReduceTaskReports(status.getJobID());
            for (TaskReport r : mapReports) {
                timeSpent += (r.getFinishTime() - r.getStartTime());
            }
            for (TaskReport r : reduceReports) {
                timeSpent += (r.getFinishTime() - r.getStartTime());
            }
            totalHadoopTimeSpent += timeSpent;
        }
    } catch (Exception e) {
        // Do we need different handling for different exceptions?
        e.printStackTrace();
        throw WrappedIOException.wrap(e);
    } finally {
        submitJarFile.delete();
    }
    return success;
}

From source file:org.archive.access.nutch.jobs.ImportArcs.java

License:LGPL

public void importArcs(final Path arcUrlsDir, final Path segment, final String collection) throws IOException {
    LOG.info("ImportArcs segment: " + segment + ", src: " + arcUrlsDir);

    final JobConf job = new JobConf(getConf(), this.getClass());

    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    job.setInputPath(arcUrlsDir);

    //job.setMapRunnerClass(job.getClass("wax.import.maprunner", ARCMapRunner.class));
    //job.setMapperClass(job.getClass("wax.import.mapper", this.getClass()));
    job.setMapRunnerClass(ARCMapRunner.class); // compatible with hadoop 0.14 TODO MC
    job.setMapperClass(this.getClass());

    job.setInputFormat(TextInputFormat.class);

    job.setOutputPath(segment);
    job.setOutputFormat(WaxFetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FetcherOutput.class);

    // Pass the collection name out to the tasks IF non-null.
    if ((collection != null) && (collection.length() > 0)) {
        job.set(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY, collection);
    }
    job.setJobName("import " + arcUrlsDir + " " + segment);

    JobClient.runJob(job);
    LOG.info("ImportArcs: done");
}

From source file:org.commoncrawl.hadoop.io.S3GetMetdataJob.java

License:Open Source License

public static void main(String[] args) {

    String accessKey = args[0];
    String secretKey = args[1];

    String paths[] = {
            // "2008/06",
            // "2008/07",
            // "2008/08",
            // "2008/09",
            // "2008/10",
            // "2008/11",
            "2009" };

    for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {

        LOG.info("Processing Path:" + paths[pathIndex]);

        JobConf job = new JobConf(S3GetMetdataJob.class);

        Path tempDir = new Path(
                job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
        System.out.println("Output Path is:" + tempDir);

        job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

        // setup s3 properties
        JetS3tARCSource.setMaxRetries(job, 1);
        // set up S3 credentials ...
        JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
        JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
        ARCSplitCalculator.setFilesPerSplit(job, 25);
        // set up arc reader properties
        ArcFileReader.setIOTimeoutValue(30000);
        // set input prefixes ...
        JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
        // and S3 bucket name ...
        JetS3tARCSource.setBucketName(job, "commoncrawl");
        // and setup arc source for ArcInputFormat
        ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
        // and set up input format ...
        job.setInputFormat(ARCInputFormat.class);
        // set mapper ...
        job.setMapRunnerClass(S3GetMetdataJob.class);
        // setup reducer (identity in this case ... )
        job.setReducerClass(IdentityReducer.class);
        // standard output format ...
        job.setOutputFormat(SequenceFileOutputFormat.class);
        // set output path
        job.setOutputPath(tempDir);
        // map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CrawlURLMetadata.class);
        // reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlURLMetadata.class);
        // double the number of reducers ...
        // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

        // run the job ...
        try {
            LOG.info("Starting Job:" + job.getJobName());
            JobClient.runJob(job);
            LOG.info("Finished Job:" + job.getJobName());

            Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
            LOG.info("Copying Job Output to:" + finalPath);
            FileSystem fs = FileSystem.get(job);

            try {
                fs.mkdirs(finalPath.getParent());
                fs.rename(tempDir, finalPath);
                LOG.info("Copied Job Output to:" + finalPath);
            } finally {
                // fs.close();
            }

        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            e.printStackTrace();
        }
    }
}

From source file:org.commoncrawl.hadoop.template.SampleHadoopJob.java

License:Open Source License

/**
 * main routine
 * 
 * @param args
 */
public static void main(String[] args) {

    // amazon access key - passed on command line
    String accessKey = args[0];
    // amazon secret key - passed on command line
    String secretKey = args[1];
    // regular expression to match against - passed in command line
    String regEx = args[2];
    // group number to extract
    int groupNumber = Integer.parseInt(args[3]);

    /** ARC file names start with year then month **/
    // we want to process all files uploaded in 2009
    // so, we will use the prefix string "2009",
    // but you could, for example, pass in a more restrictive
    // pattern such as "2008/06".

    String inputPrefix = "2009";

    LOG.info("Processing Path:" + inputPrefix);

    // allocate job config
    JobConf job = new JobConf(SampleHadoopJob.class);
    // set job name
    job.setJobName("Sample RegEx Job against path:" + inputPrefix);
    // set regular expression attributes
    job.set("mapred.mapper.regex", regEx);
    job.setInt("mapred.mapper.regex.group", groupNumber);

    // create temp file path
    Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/temp-" + System.currentTimeMillis());

    LOG.info("Output for job " + job.getJobName() + " is:" + tempDir);

    // we are going to be using the JetS3tARCSource as an input source to
    // the ArcInputFormat. This input source uses the multi-threaded JetS3t
    // library to request data from S3.

    /** setup s3 properties **/

    // set the number of retries per ARC file.
    // we are setting this number to one, so if an IOException
    // occurs when processing an ARCFile, we are going to silently skip it
    // and continue processing the next ARC file. You should set this to be
    // a number LESS than mapred.max.tracker.failures (as defined in your
    // job config or hadoop-site.xml). Otherwise, your entire job could
    // fail if it encounters a bad ARC file in the bucket, or if the S3 service
    // exhibits a failure condition specific to a single key or set of keys.
    JetS3tARCSource.setMaxRetries(job, 1);

    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);

    // set the number of files per split
    // set this number higher if the bucket contains lots of files, to reduce
    // the burden on the map-reduce system from tracking too many file splits.
    ARCSplitCalculator.setFilesPerSplit(job, 25);

    /** set up arc reader properties **/

    // again, set the timeout to something reasonable, so that your entire job
    // will not hang if a single GET request fails to complete in a reasonable
    // amount of time
    ArcFileReader.setIOTimeoutValue(30000);
    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, inputPrefix);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ArcInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);

    // now inform the job that it needs to use the ARCInputFormat
    job.setInputFormat(ARCInputFormat.class);

    // set up our map runner class
    // we use a map runner instead of a mapper here to give us an extra level of
    // control over how we handle errors. When running a large job against
    // the crawl corpus, which may contain hundreds of thousands of ARC files, it
    // is extremely important to reduce the risks of abnormal job termination.
    job.setMapRunnerClass(SampleHadoopJob.class);

    // setup reducer (identity in this case ... )
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    job.setOutputPath(tempDir);
    // map output types
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // run the job ...
    try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
    }
}

From source file:org.pentaho.hadoop.mapreduce.test.MapperAndReducerTest.java

License:Open Source License

public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile,
        String reducerTransformationFile, String hostname, String hdfsPort, String trackerPort)
        throws IOException, KettleException {

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    KettleEnvironment.init();

    // Register Map/Reduce Input and Map/Reduce Output plugin steps
    PluginMainClassType mainClassTypesAnnotation = StepPluginType.class
            .getAnnotation(PluginMainClassType.class);

    Map<Class<?>, String> inputClassMap = new HashMap<Class<?>, String>();
    inputClassMap.put(mainClassTypesAnnotation.value(), HadoopEnterMeta.class.getName());
    PluginInterface inputStepPlugin = new Plugin(new String[] { "HadoopEnterPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Input",
            "Enter a Hadoop Mapper or Reducer transformation", "MRI.png", false, false, inputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, inputStepPlugin);

    Map<Class<?>, String> outputClassMap = new HashMap<Class<?>, String>();
    outputClassMap.put(mainClassTypesAnnotation.value(), HadoopExitMeta.class.getName());
    PluginInterface outputStepPlugin = new Plugin(new String[] { "HadoopExitPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Output",
            "Exit a Hadoop Mapper or Reducer transformation", "MRO.png", false, false, outputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, outputStepPlugin);

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();

    TransMeta transMeta = null;
    TransConfiguration transConfig = null;

    if (mapperTransformationFile != null) {
        conf.setMapRunnerClass(PentahoMapRunnable.class);
        transMeta = new TransMeta(mapperTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-map-xml", transConfig.getXML());
        conf.set("transformation-map-input-stepname", "Injector");
        conf.set("transformation-map-output-stepname", "Output");
    }

    if (combinerTransformationFile != null) {
        conf.setCombinerClass(GenericTransCombiner.class);
        transMeta = new TransMeta(combinerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-combiner-xml", transConfig.getXML());
        conf.set("transformation-combiner-input-stepname", "Injector");
        conf.set("transformation-combiner-output-stepname", "Output");
    }

    if (reducerTransformationFile != null) {
        conf.setReducerClass((Class<? extends Reducer>) GenericTransReduce.class);
        transMeta = new TransMeta(reducerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-reduce-xml", transConfig.getXML());
        conf.set("transformation-reduce-input-stepname", "Injector");
        conf.set("transformation-reduce-output-stepname", "Output");
    }

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path("/"));
    FileOutputFormat.setOutputPath(conf, new Path("/"));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJar(jar.toURI().toURL().toExternalForm());
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    return conf;
}

From source file:org.pentaho.hadoop.mapreduce.test.PentahoMapReduceIntegrationTest.java

License:Apache License

public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile,
        String reducerTransformationFile, String hostname, String hdfsPort, String trackerPort)
        throws IOException, KettleException {

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    KettleEnvironment.init();

    // Register Map/Reduce Input and Map/Reduce Output plugin steps
    PluginMainClassType mainClassTypesAnnotation = StepPluginType.class
            .getAnnotation(PluginMainClassType.class);

    Map<Class<?>, String> inputClassMap = new HashMap<Class<?>, String>();
    inputClassMap.put(mainClassTypesAnnotation.value(), HadoopEnterMeta.class.getName());
    PluginInterface inputStepPlugin = new Plugin(new String[] { "HadoopEnterPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Input",
            "Enter a Hadoop Mapper or Reducer transformation", "MRI.png", false, false, inputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, inputStepPlugin);

    Map<Class<?>, String> outputClassMap = new HashMap<Class<?>, String>();
    outputClassMap.put(mainClassTypesAnnotation.value(), HadoopExitMeta.class.getName());
    PluginInterface outputStepPlugin = new Plugin(new String[] { "HadoopExitPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Output",
            "Exit a Hadoop Mapper or Reducer transformation", "MRO.png", false, false, outputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, outputStepPlugin);

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();

    TransMeta transMeta = null;
    TransConfiguration transConfig = null;

    if (mapperTransformationFile != null) {
        conf.setMapRunnerClass(PentahoMapRunnable.class);
        transMeta = new TransMeta(mapperTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-map-xml", transConfig.getXML());
        conf.set("transformation-map-input-stepname", "Injector");
        conf.set("transformation-map-output-stepname", "Output");
    }

    if (combinerTransformationFile != null) {
        conf.setCombinerClass(GenericTransCombiner.class);
        transMeta = new TransMeta(combinerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-combiner-xml", transConfig.getXML());
        conf.set("transformation-combiner-input-stepname", "Injector");
        conf.set("transformation-combiner-output-stepname", "Output");
    }

    if (reducerTransformationFile != null) {
        conf.setReducerClass(GenericTransReduce.class);
        transMeta = new TransMeta(reducerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-reduce-xml", transConfig.getXML());
        conf.set("transformation-reduce-input-stepname", "Injector");
        conf.set("transformation-reduce-output-stepname", "Output");
    }

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path("/"));
    FileOutputFormat.setOutputPath(conf, new Path("/"));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJar(jar.toURI().toURL().toExternalForm());
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    return conf;
}

From source file:sa.edu.kaust.fwindex.BuildIntDocVectorsForwardIndex.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }
    String inPath = args[0];
    String outPath = args[1];

    JobConf conf = new JobConf(getConf(), BuildIntDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = 10;
    sLogger.info("Tool: BuildIntDocVectorsIndex");

    String intDocVectorsPath = inPath;
    String forwardIndexPath = outPath;

    if (!fs.exists(new Path(intDocVectorsPath))) {
        sLogger.info("Error: IntDocVectors don't exist!");
        return 0;
    }

    if (fs.exists(new Path(forwardIndexPath))) {
        sLogger.info("IntDocVectorsForwardIndex already exists: skipping!");
        return 0;
    }

    conf.set("ForwardIndexPath", forwardIndexPath);

    conf.setJobName("BuildIntDocVectorsForwardIndex");

    Path inputPath = new Path(intDocVectorsPath);
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(TermDF.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}

From source file:sa.edu.kaust.twitter.index.BuildPostingsForwardIndex.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(BuildPostingsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = 10;
    sLogger.info("Tool: PostingsForwardIndex");

    String postingsPath = args[0];
    String forwardIndexPath = args[1];

    if (!fs.exists(new Path(postingsPath))) {
        sLogger.info("Error: IntDocVectors don't exist!");
        return 0;
    }

    // delete the output directory if it exists already
    //FileSystem.get(conf).delete(new Path(forwardIndexPath), true);
    if (fs.exists(new Path(forwardIndexPath))) {
        sLogger.info("PostingsForwardIndex already exists: skipping!");
        return 0;
    }

    conf.set("ForwardIndexPath", forwardIndexPath);

    conf.setJobName("BuildPostingsForwardIndex");

    Path inputPath = new Path(postingsPath);
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}