List of usage examples for org.apache.hadoop.mapred JobConf getJobName
public String getJobName()
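Before the full examples below, here is a minimal, self-contained sketch (not taken from any of the listed source files) showing the basic setJobName()/getJobName() round trip on a JobConf; the class name JobNameExample is just a placeholder.

import org.apache.hadoop.mapred.JobConf;

public class JobNameExample {
    public static void main(String[] args) {
        // create a job configuration; JobNameExample is a placeholder driver class
        JobConf job = new JobConf(JobNameExample.class);

        // set a name for the job, then read it back with getJobName();
        // if no name has been set, getJobName() typically returns an empty string
        job.setJobName("example-job");
        System.out.println("Configured job name: " + job.getJobName());
    }
}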
From source file:org.commoncrawl.hadoop.io.S3GetMetdataJob.java
License:Open Source License
public static void main(String[] args) {
    String accessKey = args[0];
    String secretKey = args[1];

    String paths[] = {
            // "2008/06",
            // "2008/07",
            // "2008/08",
            // "2008/09",
            // "2008/10",
            // "2008/11",
            "2009" };

    for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {

        LOG.info("Processing Path:" + paths[pathIndex]);

        JobConf job = new JobConf(S3GetMetdataJob.class);

        Path tempDir = new Path(
                job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
        System.out.println("Output Path is:" + tempDir);

        job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

        // setup s3 properties
        JetS3tARCSource.setMaxRetries(job, 1);
        // set up S3 credentials ...
        JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
        JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
        ARCSplitCalculator.setFilesPerSplit(job, 25);
        // set up arc reader properties
        ArcFileReader.setIOTimeoutValue(30000);
        // set input prefixes ...
        JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
        // and S3 bucket name ...
        JetS3tARCSource.setBucketName(job, "commoncrawl");
        // and setup arc source for ArcInputFormat
        ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
        // and set up input format ...
        job.setInputFormat(ARCInputFormat.class);
        // set mapper ...
        job.setMapRunnerClass(S3GetMetdataJob.class);
        // setup reducer (identity in this case ... )
        job.setReducerClass(IdentityReducer.class);
        // standard output format ...
        job.setOutputFormat(SequenceFileOutputFormat.class);
        // set output path
        job.setOutputPath(tempDir);
        // map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CrawlURLMetadata.class);
        // reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlURLMetadata.class);
        // double the number of reducers ...
        // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

        // run the job ...
        try {
            LOG.info("Starting Job:" + job.getJobName());
            JobClient.runJob(job);
            LOG.info("Finished Job:" + job.getJobName());

            Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
            LOG.info("Copying Job Output to:" + finalPath);
            FileSystem fs = FileSystem.get(job);
            try {
                fs.mkdirs(finalPath.getParent());
                fs.rename(tempDir, finalPath);
                LOG.info("Copied Job Output to:" + finalPath);
            } finally {
                // fs.close();
            }
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            e.printStackTrace();
        }
    }
}
From source file:org.commoncrawl.hadoop.template.SampleHadoopJob.java
License:Open Source License
/**
 * main routine
 *
 * @param args
 */
public static void main(String[] args) {

    // amazon access key - passed on command line
    String accessKey = args[0];
    // amazon secret key - passed on command line
    String secretKey = args[1];
    // regular expression to match against - passed on command line
    String regEx = args[2];
    // group number to extract
    int groupNumber = Integer.parseInt(args[3]);

    /** arc file names start with year then month **/
    // we want to process all files uploaded in 2009,
    // so we will use the prefix string "2009",
    // but you could, for example, pass in a more restrictive
    // pattern such as "2008/06".
    String inputPrefix = "2009";

    LOG.info("Processing Path:" + inputPrefix);

    // allocate job config
    JobConf job = new JobConf(SampleHadoopJob.class);
    // set job name
    job.setJobName("Sample RegEx Job against path:" + inputPrefix);
    // set regular expression attributes
    job.set("mapred.mapper.regex", regEx);
    job.setInt("mapred.mapper.regex.group", groupNumber);

    // create temp file path
    Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/temp-" + System.currentTimeMillis());
    LOG.info("Output for job " + job.getJobName() + " is:" + tempDir);

    // we are going to be using the JetS3tARCSource as an input source to
    // the ArcInputFormat. This input source uses the multi-threaded JetS3t
    // library to request data from S3.

    /** setup s3 properties **/

    // set the number of retries per ARC file.
    // we are setting this number to one, so if an IOException
    // occurs when processing an ARC file, we are going to silently skip it
    // and continue processing the next ARC file. You should set this to be
    // a number LESS than mapred.max.tracker.failures (as defined in your
    // job config or hadoop-site.xml). Otherwise, your entire job could
    // fail if it encounters a bad ARC file in the bucket, or if the S3 service
    // exhibits a failure condition specific to a single key or set of keys.
    JetS3tARCSource.setMaxRetries(job, 1);

    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);

    // set the number of files per split
    // set this number higher if the bucket contains lots of files, to reduce
    // the burden on the map-reduce system from tracking too many file splits.
    ARCSplitCalculator.setFilesPerSplit(job, 25);

    /** set up arc reader properties **/

    // again, set the timeout to something reasonable, so that your entire job
    // will not hang if a single GET request fails to complete in a reasonable
    // amount of time
    ArcFileReader.setIOTimeoutValue(30000);
    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, inputPrefix);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ArcInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
    // now inform the job that it needs to use the ARCInputFormat
    job.setInputFormat(ARCInputFormat.class);

    // set up our map runner class
    // we use a map runner instead of a mapper here to give us an extra level of
    // control over how we handle errors. When running a large job against
    // the crawl corpus, which may contain hundreds of thousands of ARC files, it
    // is extremely important to reduce the risks of abnormal job termination.
    job.setMapRunnerClass(SampleHadoopJob.class);

    // setup reducer (identity in this case ... )
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    job.setOutputPath(tempDir);
    // map output types
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // run the job ...
    try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
    }
}
From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopUtils.java
License:Open Source License
/**
 * Creates JobInfo from hadoop configuration.
 *
 * @param cfg Hadoop configuration.
 * @return Job info.
 * @throws GridException If failed.
 */
public static GridHadoopDefaultJobInfo createJobInfo(Configuration cfg) throws GridException {
    JobConf jobConf = new JobConf(cfg);

    boolean hasCombiner = jobConf.get("mapred.combiner.class") != null
            || jobConf.get(MRJobConfig.COMBINE_CLASS_ATTR) != null;

    int numReduces = jobConf.getNumReduceTasks();

    jobConf.setBooleanIfUnset("mapred.mapper.new-api", jobConf.get(OLD_MAP_CLASS_ATTR) == null);

    if (jobConf.getUseNewMapper()) {
        String mode = "new map API";

        ensureNotSet(jobConf, "mapred.input.format.class", mode);
        ensureNotSet(jobConf, OLD_MAP_CLASS_ATTR, mode);

        if (numReduces != 0)
            ensureNotSet(jobConf, "mapred.partitioner.class", mode);
        else
            ensureNotSet(jobConf, "mapred.output.format.class", mode);
    } else {
        String mode = "map compatibility";

        ensureNotSet(jobConf, MRJobConfig.INPUT_FORMAT_CLASS_ATTR, mode);
        ensureNotSet(jobConf, MRJobConfig.MAP_CLASS_ATTR, mode);

        if (numReduces != 0)
            ensureNotSet(jobConf, MRJobConfig.PARTITIONER_CLASS_ATTR, mode);
        else
            ensureNotSet(jobConf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
    }

    if (numReduces != 0) {
        jobConf.setBooleanIfUnset("mapred.reducer.new-api", jobConf.get(OLD_REDUCE_CLASS_ATTR) == null);

        if (jobConf.getUseNewReducer()) {
            String mode = "new reduce API";

            ensureNotSet(jobConf, "mapred.output.format.class", mode);
            ensureNotSet(jobConf, OLD_REDUCE_CLASS_ATTR, mode);
        } else {
            String mode = "reduce compatibility";

            ensureNotSet(jobConf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
            ensureNotSet(jobConf, MRJobConfig.REDUCE_CLASS_ATTR, mode);
        }
    }

    Map<String, String> props = new HashMap<>();

    for (Map.Entry<String, String> entry : jobConf)
        props.put(entry.getKey(), entry.getValue());

    return new GridHadoopDefaultJobInfo(jobConf.getJobName(), jobConf.getUser(), hasCombiner, numReduces,
            props);
}
From source file:org.macau.util.FuzzyJoinDriver.java
License:Apache License
/**
 * Runs the job and prints its basic information: the start time,
 * the finish time, and the running time (finish time - start time).
 *
 * @param job
 * @throws IOException
 */
public static void run(JobConf job) throws IOException {
    job.setJarByClass(FuzzyJoinDriver.class);
    //
    // print info
    //
    String ret = "FuzzyJoinDriver(" + job.getJobName() + ")\n" + " Input Path: {";
    Path inputs[] = FileInputFormat.getInputPaths(job);
    for (int ctr = 0; ctr < inputs.length; ctr++) {
        if (ctr > 0) {
            ret += "\n ";
        }
        ret += inputs[ctr].toString();
    }
    ret += "}\n";
    ret += " Output Path: " + FileOutputFormat.getOutputPath(job) + "\n" + " Map Jobs: " + job.getNumMapTasks()
            + "\n" + " Reduce Jobs: " + job.getNumReduceTasks() + "\n" + " Properties: {";
    String[][] properties = new String[][] {
            new String[] { FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE },
            new String[] { FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
                    "" + FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE },
            new String[] { FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE },
            new String[] { TOKENS_PACKAGE_PROPERTY, TOKENS_PACKAGE_VALUE },
            new String[] { TOKENS_LENGTHSTATS_PROPERTY, "" + TOKENS_LENGTHSTATS_VALUE },
            new String[] { RIDPAIRS_GROUP_CLASS_PROPERTY, RIDPAIRS_GROUP_CLASS_VALUE },
            new String[] { RIDPAIRS_GROUP_FACTOR_PROPERTY, "" + RIDPAIRS_GROUP_FACTOR_VALUE },
            new String[] { FuzzyJoinConfig.DATA_TOKENS_PROPERTY, "" },
            new String[] { DATA_JOININDEX_PROPERTY, "" }, };
    for (int crt = 0; crt < properties.length; crt++) {
        if (crt > 0) {
            ret += "\n ";
        }
        ret += properties[crt][0] + "=" + job.get(properties[crt][0], properties[crt][1]);
    }
    ret += "}";
    System.out.println(ret);
    //
    // run job
    //
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println(
            "The job took " + (end_time.getTime() - startTime.getTime()) / (float) 1000.0 + " seconds.");
}
From source file:org.pooledtimeseries.MeanChiSquareDistanceCalculation.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration baseConf = new Configuration();
    baseConf.set("mapreduce.job.maps", "96");
    baseConf.set("mapred.tasktracker.map.tasks.maximum", "96");

    JobConf conf = new JobConf(baseConf, MeanChiSquareDistanceCalculation.class);
    System.out.println("Before Map:" + conf.getNumMapTasks());
    conf.setNumMapTasks(96);
    System.out.println("After Map:" + conf.getNumMapTasks());

    conf.setJobName("mean_chi_square_calculation");
    System.out.println("Track:" + baseConf.get("mapred.job.tracker"));
    System.out.println("Job Name- " + conf.getJobName());
    System.out.println(baseConf.get("mapreduce.job.maps"));

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(DoubleWritable.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setOutputFormat(TextOutputFormat.class);
    conf.setInputFormat(CartesianInputFormat.class);
    CartesianInputFormat.setLeftInputInfo(conf, SequenceFileInputFormat.class, args[0]);
    CartesianInputFormat.setRightInputInfo(conf, SequenceFileInputFormat.class, args[0]);
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    JobClient.runJob(conf);
}
From source file:org.pooledtimeseries.SimilarityCalculation.java
License:Apache License
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf();

    System.out.println("Before Map:" + conf.getNumMapTasks());
    conf.setNumMapTasks(196);
    System.out.println("After Map:" + conf.getNumMapTasks());

    conf.setJobName("similarity_calc");
    conf.set("meanDistsFilePath", args[2]);
    System.out.println("Job Name: " + conf.getJobName());

    conf.setJarByClass(SimilarityCalculation.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setInputFormat(CartesianInputFormat.class);
    CartesianInputFormat.setLeftInputInfo(conf, SequenceFileInputFormat.class, args[0]);
    CartesianInputFormat.setRightInputInfo(conf, SequenceFileInputFormat.class, args[0]);

    conf.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setMapperClass(Map.class);

    JobClient.runJob(conf);
}
From source file:voldemort.store.readonly.mr.utils.HadoopUtils.java
License:Apache License
/**
 * Add the given object to the distributed cache for this job
 *
 * @param serializable A Serializable object to add to the JobConf
 * @param job The JobConf
 */
public static <T extends Serializable> void setSerializableInCache(JobConf job, T serializable) {
    try {
        // TODO: MED /tmp should be changed by conf.getTempDir() or something
        Path workDir = new Path(
                String.format("/tmp/%s/%s/_join.temporary", job.getJobName(), System.currentTimeMillis()));
        Path tempPath = new Path(workDir, "serializable.dat");
        tempPath.getFileSystem(job).deleteOnExit(tempPath);
        job.set("serializables.file", tempPath.toUri().getPath());

        ObjectOutputStream objectStream = new ObjectOutputStream(tempPath.getFileSystem(job).create(tempPath));
        objectStream.writeObject(serializable);
        objectStream.close();

        DistributedCache.addCacheFile(new URI(tempPath.toUri().getPath() + "#" + tempPath.getName()), job);
    } catch (URISyntaxException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}