Example usage for org.apache.hadoop.mapred JobConf setOutputKeyComparatorClass

List of usage examples for org.apache.hadoop.mapred JobConf setOutputKeyComparatorClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setOutputKeyComparatorClass.

Prototype

public void setOutputKeyComparatorClass(Class<? extends RawComparator> theClass) 

Document

Set the RawComparator comparator used to compare keys.
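
A minimal sketch of the call in isolation: the class passed to setOutputKeyComparatorClass must be a RawComparator (a WritableComparator subclass qualifies), and the framework uses it to sort the map output keys before they reach the reducers. The DescendingLongComparator below is a placeholder written for illustration, not taken from the examples that follow.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.JobConf;

// Hypothetical comparator: sorts LongWritable keys in descending order.
public class DescendingLongComparator extends WritableComparator {

    public DescendingLongComparator() {
        super(LongWritable.class, true); // true: instantiate keys so compare() gets deserialized objects
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return -super.compare(a, b); // invert the natural ascending order
    }

    // Wiring it into a job configuration (placeholder job settings):
    public static void configure(JobConf conf) {
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputKeyComparatorClass(DescendingLongComparator.class);
    }
}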

Usage

From source file:edu.stolaf.cs.wmrserver.HadoopEngine.java

License:Apache License

public void submit(JobRequest request, long submissionID, File mapperFile, File reducerFile, File packageDir,
        Path inputPath) throws ValidationException, NotFoundException, CompilationException, InternalException {
    // Generate job output path
    Path outputDir = new Path(_homeDir, "out");
    Path outputPath;
    try {
        FileSystem fs = outputDir.getFileSystem(new Configuration());
        outputPath = JobServiceHandler.getNonexistantPath(outputDir, request.getName(), fs);
    } catch (IOException ex) {
        throw JobServiceHandler.wrapException("Could not construct output path.", ex);
    }

    JobConf conf = new JobConf();
    conf.setJobName(request.getName());

    // Set mapper and number of tasks if specified
    StreamJob.setStreamMapper(conf, mapperFile.toString());
    if (request.isSetMapTasks())
        conf.setNumMapTasks(request.getMapTasks());

    // Set reducer and number of tasks if specified
    StreamJob.setStreamReducer(conf, reducerFile.toString());
    if (request.isSetReduceTasks())
        conf.setNumReduceTasks(request.getReduceTasks());

    // Create and set job JAR, including necessary files
    ArrayList<String> jarFiles = new ArrayList<String>();
    jarFiles.add(packageDir.toString());
    String jarPath;
    try {
        jarPath = StreamJob.createJobJar(conf, jarFiles, _tempDir);
    } catch (IOException ex) {
        throw JobServiceHandler.wrapException("Could not create job jar.", ex);
    }
    if (jarPath != null)
        conf.setJar(jarPath);

    // TODO: This is a hack. Rewrite streaming to use DistributedCache.
    //conf.setPattern("mapreduce.job.jar.unpack.pattern",
    //              Pattern.compile(".*"));

    // Set I/O formats and paths
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    // Use numeric sort if appropriate
    conf.setBoolean(CONF_NUMERIC, request.isNumericSort());
    if (request.isNumericSort()) {
        conf.setOutputKeyComparatorClass(KeyFieldBasedComparator.class);
        conf.setPartitionerClass(KeyFieldBasedPartitioner.class);
        conf.setKeyFieldComparatorOptions("-n");
        conf.setKeyFieldPartitionerOptions("-n");
    }

    // Set other job information
    conf.set(CONF_USER, request.getUser());
    conf.set(CONF_LANGUAGE, request.getLanguage());
    conf.set(CONF_MAPPER, request.getMapper());
    conf.set(CONF_REDUCER, request.getReducer());

    // Attempt to submit the job

    RunningJob job;
    try {
        JobClient client = new JobClient(new JobConf());
        job = client.submitJob(conf);
    } catch (IOException ex) {
        throw JobServiceHandler.wrapException("There was a serious error while attempting to submit the job.",
                ex);
    }

    try {
        SubmissionDatabase.setSubmitted(submissionID);
        SubmissionDatabase.setHadoopID(submissionID, job.getID().toString());
    } catch (SQLException ex) {
        throw JobServiceHandler.wrapException("Could not update submission in database.", ex);
    }
}
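
The example above passes "-n", so the entire key is compared numerically. KeyFieldBasedComparator also accepts Unix sort(1)-style field specifications; the sketch below is illustrative only (the class name and field positions are assumptions, not from the example above) and sorts on the second tab-separated key field, numerically and in reverse, while partitioning on the first field.

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.KeyFieldBasedComparator;
import org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner;

public class NumericFieldSortConfig {

    // Illustrative only: sort on the second key field, group by the first.
    public static void configure(JobConf conf) {
        conf.setOutputKeyComparatorClass(KeyFieldBasedComparator.class);
        conf.setPartitionerClass(KeyFieldBasedPartitioner.class);
        // Compare only the second key field: numeric (n), descending (r).
        conf.setKeyFieldComparatorOptions("-k2,2nr");
        // Partition on the first key field so rows sharing it reach the same reducer.
        conf.setKeyFieldPartitionerOptions("-k1,1");
    }
}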

From source file:mahout.analysis.MahoutOA.java

public static void main(String[] args) throws Exception {

    Path outPath = new Path(OUTPUT_PATH);
    JobConf conf = new JobConf(MahoutOA.class);
    conf.setJobName("Recommended books count");

    conf.setMapperClass(MahoutOAMapper.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setReducerClass(MahoutOAReducer.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, outPath);

    try {
        FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
        if (dfs.exists(outPath)) {
            dfs.delete(outPath, true);
        }
        JobClient.runJob(conf);
    } catch (Exception e) {
        e.printStackTrace();
    }

    JobConf conf1 = new JobConf(MahoutOA.class);
    Path outPathFinal = new Path(args[1]);

    conf1.setJobName("Recommended books count ordered");

    conf1.setMapperClass(MahoutOASortMapper.class);

    conf1.setMapOutputKeyClass(IntWritable.class);
    conf1.setMapOutputValueClass(Text.class);
    conf1.setOutputKeyComparatorClass(MahoutOAComparator.class);

    conf1.setReducerClass(MahoutOASortReducer.class);

    FileInputFormat.setInputPaths(conf1, new Path(OUTPUT_PATH));
    FileOutputFormat.setOutputPath(conf1, outPathFinal);

    try {
        FileSystem dfs = FileSystem.get(outPathFinal.toUri(), conf);
        if (dfs.exists(outPathFinal)) {
            dfs.delete(outPathFinal, true);
        }
        JobClient.runJob(conf1);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
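
MahoutOAComparator itself is not part of this listing. For the second job above to emit the highest counts first, it would typically reverse the natural IntWritable ordering; the byte-level implementation below is an assumed sketch of such a comparator, not the original class.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparator;

// Assumed shape of MahoutOAComparator: compare serialized IntWritable keys
// directly at the byte level and reverse the result so larger counts sort first.
public class MahoutOAComparator extends WritableComparator {

    public MahoutOAComparator() {
        super(IntWritable.class);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        int left = readInt(b1, s1); // IntWritable serializes as 4 big-endian bytes
        int right = readInt(b2, s2);
        return Integer.compare(right, left); // descending
    }
}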

From source file:net.peacesoft.nutch.crawl.ReGenerator.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or
 * not is read from the crawl.generate.filter property in the configuration
 * files. If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    try {
        Path tempDir = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, force);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ReGenerator: starting at " + sdf.format(start));
        LOG.info("ReGenerator: Selecting best-scoring urls due for fetch.");
        LOG.info("ReGenerator: filtering: " + filter);
        LOG.info("ReGenerator: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("ReGenerator: topN: " + topN);
        }

        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info(
                    "ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        // map to inverted subset due for fetch, sort by score
        JobConf job = new NutchJob(getConf());
        job.setJobName("generate: select from " + dbDir);

        if (numLists == -1) { // for politeness make
            numLists = job.getNumMapTasks(); // a partition per fetch task
        }
        if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
            // override
            LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        job.setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        job.setLong(GENERATOR_TOP_N, topN);
        job.setBoolean(GENERATOR_FILTER, filter);
        job.setBoolean(GENERATOR_NORMALISE, norm);
        job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);

        job.setMapperClass(Selector.class);
        job.setPartitionerClass(Selector.class);
        job.setReducerClass(Selector.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(FloatWritable.class);
        job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        job.setOutputValueClass(SelectorEntry.class);
        job.setOutputFormat(GeneratorOutputFormat.class);

        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            throw e;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            LOG.warn("ReGenerator: exception while partitioning segments, exiting ...");
            fs.delete(tempDir, true);
            return null;
        }

        if (generatedSegments.size() == 0) {
            LOG.warn("ReGenerator: 0 records selected for fetching, exiting ...");
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            Path tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = new NutchJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbUpdater.class);
            job.setReducerClass(CrawlDbUpdater.class);
            job.setOutputFormat(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                JobClient.runJob(job);
                CrawlDb.install(job, dbDir);
            } catch (IOException e) {
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir, true);
                fs.delete(tempDir2, true);
                throw e;
            }
            fs.delete(tempDir2, true);
        }

        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));

        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    } catch (Exception ex) {
        LOG.error("ReGenerator generate error: " + ex.toString(), ex);
        return null;
    }
}

From source file:org.apache.avro.mapred.AvroJob.java

License:Apache License

private static void configureAvroShuffle(JobConf job) {
    job.setOutputKeyComparatorClass(AvroKeyComparator.class);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(AvroValue.class);

    // add AvroSerialization to io.serializations
    Collection<String> serializations = job.getStringCollection("io.serializations");
    if (!serializations.contains(AvroSerialization.class.getName())) {
        serializations.add(AvroSerialization.class.getName());
        job.setStrings("io.serializations", serializations.toArray(new String[0]));
    }
}

From source file:org.apache.avro.mapred.tether.TetherJob.java

License:Apache License

private static void setupTetherJob(JobConf job) throws IOException {
    job.setMapRunnerClass(TetherMapRunner.class);
    job.setPartitionerClass(TetherPartitioner.class);
    job.setReducerClass(TetherReducer.class);

    job.setInputFormat(TetherInputFormat.class);
    job.setOutputFormat(TetherOutputFormat.class);

    job.setOutputKeyClass(TetherData.class);
    job.setOutputKeyComparatorClass(TetherKeyComparator.class);
    job.setMapOutputValueClass(NullWritable.class);

    // set the map output key class to TetherData
    job.setMapOutputKeyClass(TetherData.class);

    // add TetherKeySerialization to io.serializations
    Collection<String> serializations = job.getStringCollection("io.serializations");
    if (!serializations.contains(TetherKeySerialization.class.getName())) {
        serializations.add(TetherKeySerialization.class.getName());
        job.setStrings("io.serializations", serializations.toArray(new String[0]));
    }

    // determine whether the executable should be added to the cache.
    if (job.getBoolean(TETHER_EXEC_CACHED, false)) {
        DistributedCache.addCacheFile(getExecutable(job), job);
    }
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureDriver.java

License:Apache License

@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);
    conf.setJobName("Bayes Feature Driver running over input: " + input);
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    conf.setPartitionerClass(FeaturePartitioner.class);
    conf.setOutputKeyComparatorClass(FeatureLabelComparator.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);

    conf.setMapperClass(BayesFeatureMapper.class);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureCombiner.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);
    conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values

    HadoopUtil.overwriteOutput(output);
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);

}

From source file:org.apache.nutch.crawl.Generator.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 * 
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 * 
 * @return Path to generated segment or null if no entries were selected
 * 
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
        LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}

From source file:org.apache.nutch.tools.FreeGenerator.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
        System.err.println("\tinputDir\tinput directory containing one or more input files.");
        System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
        System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
        System.err.println("\t-filter\trun current URLFilters on input URLs");
        System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
        return -1;
    }
    boolean filter = false;
    boolean normalize = false;
    if (args.length > 2) {
        for (int i = 2; i < args.length; i++) {
            if (args[i].equals("-filter")) {
                filter = true;
            } else if (args[i].equals("-normalize")) {
                normalize = true;
            } else {
                LOG.error("Unknown argument: " + args[i] + ", exiting ...");
                return -1;
            }
        }
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("FreeGenerator: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());
    job.setBoolean(FILTER_KEY, filter);
    job.setBoolean(NORMALIZE_KEY, normalize);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormat(TextInputFormat.class);
    job.setMapperClass(FG.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Generator.SelectorEntry.class);
    job.setPartitionerClass(URLPartitioner.class);
    job.setReducerClass(FG.class);
    String segName = Generator.generateSegmentName();
    job.setNumReduceTasks(job.getNumMapTasks());
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(Generator.HashComparator.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        LOG.error("FAILED: " + StringUtils.stringifyException(e));
        return -1;
    }
    long end = System.currentTimeMillis();
    LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
    return 0;
}

From source file:org.apache.pig.backend.hadoop.executionengine.mapreduceExec.MapReduceLauncher.java

License:Apache License

/**
 * Submit a Pig job to hadoop.
 * 
 * @param mapFuncs
 *            a list of map functions to apply to the inputs. The cardinality of the list should
 *            be the same as input's cardinality.
 * @param groupFuncs
 *            a list of grouping functions to apply to the inputs. The cardinality of the list
 *            should be the same as input's cardinality.
 * @param reduceFunc
 *            the reduce function.
 * @param mapTasks
 *            the number of map tasks to use.
 * @param reduceTasks
 *            the number of reduce tasks to use.
 * @param input
 *            a list of inputs
 * @param output
 *            the path of the output.
 * @return an indicator of success or failure.
 * @throws IOException
 */
public boolean launchPig(POMapreduce pom) throws IOException {
    JobConf conf = new JobConf(config);
    setJobProperties(conf, pom);
    Properties properties = pom.pigContext.getProperties();
    ConfigurationValidator.validatePigProperties(properties);
    String jobName = properties.getProperty(PigContext.JOB_NAME);
    conf.setJobName(jobName);
    boolean success = false;
    List<String> funcs = new ArrayList<String>();

    if (pom.toMap != null) {
        for (EvalSpec es : pom.toMap)
            funcs.addAll(es.getFuncs());
    }
    if (pom.groupFuncs != null) {
        for (EvalSpec es : pom.groupFuncs)
            funcs.addAll(es.getFuncs());
    }
    if (pom.toReduce != null) {
        funcs.addAll(pom.toReduce.getFuncs());
    }

    // create jobs.jar locally and pass it to hadoop
    File submitJarFile = File.createTempFile("Job", ".jar");
    try {
        FileOutputStream fos = new FileOutputStream(submitJarFile);
        JarManager.createJar(fos, funcs, null, pom.pigContext);
        log.debug("Job jar size = " + submitJarFile.length());
        conf.setJar(submitJarFile.getPath());
        String user = System.getProperty("user.name");
        conf.setUser(user != null ? user : "Pigster");

        conf.set("pig.spill.size.threshold", properties.getProperty("pig.spill.size.threshold"));
        conf.set("pig.spill.gc.activation.size", properties.getProperty("pig.spill.gc.activation.size"));

        if (pom.reduceParallelism != -1) {
            conf.setNumReduceTasks(pom.reduceParallelism);
        }
        if (pom.toMap != null) {
            conf.set("pig.mapFuncs", ObjectSerializer.serialize(pom.toMap));
        }
        if (pom.toCombine != null) {
            conf.set("pig.combineFunc", ObjectSerializer.serialize(pom.toCombine));
            // this is to make sure that combiner is only called once
            // since we can't handle no combine or multiple combines
            conf.setCombineOnceOnly(true);
        }
        if (pom.groupFuncs != null) {
            conf.set("pig.groupFuncs", ObjectSerializer.serialize(pom.groupFuncs));
        }
        if (pom.toReduce != null) {
            conf.set("pig.reduceFunc", ObjectSerializer.serialize(pom.toReduce));
        }
        if (pom.toSplit != null) {
            conf.set("pig.splitSpec", ObjectSerializer.serialize(pom.toSplit));
        }
        if (pom.pigContext != null) {
            conf.set("pig.pigContext", ObjectSerializer.serialize(pom.pigContext));
        }
        conf.setMapRunnerClass(PigMapReduce.class);
        if (pom.toCombine != null) {
            conf.setCombinerClass(PigCombine.class);
            //conf.setCombinerClass(PigMapReduce.class);
        }
        if (pom.quantilesFile != null) {
            conf.set("pig.quantilesFile", pom.quantilesFile);
        } else {
            // this is not a sort job - can use byte comparison to speed up processing
            conf.setOutputKeyComparatorClass(PigWritableComparator.class);
        }
        if (pom.partitionFunction != null) {
            conf.setPartitionerClass(SortPartitioner.class);
        }
        conf.setReducerClass(PigMapReduce.class);
        conf.setInputFormat(PigInputFormat.class);
        conf.setOutputFormat(PigOutputFormat.class);
        // not used starting with 0.15 conf.setInputKeyClass(Text.class);
        // not used starting with 0.15 conf.setInputValueClass(Tuple.class);
        conf.setOutputKeyClass(Tuple.class);
        if (pom.userComparator != null) {
            conf.setOutputKeyComparatorClass(pom.userComparator);
        }
        conf.setOutputValueClass(IndexedTuple.class);
        conf.set("pig.inputs", ObjectSerializer.serialize(pom.inputFileSpecs));

        conf.setOutputPath(new Path(pom.outputFileSpec.getFileName()));
        conf.set("pig.storeFunc", ObjectSerializer.serialize(pom.outputFileSpec.getFuncSpec()));

        // Setup the DistributedCache for this job
        setupDistributedCache(pom.pigContext, conf, pom.properties, "pig.streaming.ship.files", true);
        setupDistributedCache(pom.pigContext, conf, pom.properties, "pig.streaming.cache.files", false);

        // Setup the logs directory for this job
        String jobOutputFileName = pom.pigContext.getJobOutputFile();
        if (jobOutputFileName != null && jobOutputFileName.length() > 0) {
            Path jobOutputFile = new Path(pom.pigContext.getJobOutputFile());
            conf.set("pig.output.dir", jobOutputFile.getParent().toString());
            conf.set("pig.streaming.log.dir", new Path(jobOutputFile, LOG_DIR).toString());
        }

        //
        // Now, actually submit the job (using the submit name)
        //
        JobClient jobClient = execEngine.getJobClient();
        RunningJob status = jobClient.submitJob(conf);
        log.debug("submitted job: " + status.getJobID());

        long sleepTime = 1000;
        double lastQueryProgress = -1.0;
        int lastJobsQueued = -1;
        double lastMapProgress = -1.0;
        double lastReduceProgress = -1.0;
        while (true) {
            try {
                Thread.sleep(sleepTime);
            } catch (Exception e) {
            }

            if (status.isComplete()) {
                success = status.isSuccessful();
                if (log.isDebugEnabled()) {
                    StringBuilder sb = new StringBuilder();
                    sb.append("Job finished ");
                    sb.append((success ? "" : "un"));
                    sb.append("successfully");
                    log.debug(sb.toString());
                }
                if (success) {
                    mrJobNumber++;
                }
                double queryProgress = ((double) mrJobNumber) / ((double) numMRJobs);
                if (queryProgress > lastQueryProgress) {
                    if (log.isInfoEnabled()) {
                        StringBuilder sbProgress = new StringBuilder();
                        sbProgress.append("Pig progress = ");
                        sbProgress.append(((int) (queryProgress * 100)));
                        sbProgress.append("%");
                        log.info(sbProgress.toString());
                    }
                    lastQueryProgress = queryProgress;
                }
                break;
            } else // still running
            {
                double mapProgress = status.mapProgress();
                double reduceProgress = status.reduceProgress();
                if (lastMapProgress != mapProgress || lastReduceProgress != reduceProgress) {
                    if (log.isDebugEnabled()) {
                        StringBuilder sbProgress = new StringBuilder();
                        sbProgress.append("Hadoop job progress: Map=");
                        sbProgress.append((int) (mapProgress * 100));
                        sbProgress.append("% Reduce=");
                        sbProgress.append((int) (reduceProgress * 100));
                        sbProgress.append("%");
                        log.debug(sbProgress.toString());
                    }
                    lastMapProgress = mapProgress;
                    lastReduceProgress = reduceProgress;
                }
                double numJobsCompleted = mrJobNumber;
                double thisJobProgress = (mapProgress + reduceProgress) / 2.0;
                double queryProgress = (numJobsCompleted + thisJobProgress) / ((double) numMRJobs);
                if (queryProgress > lastQueryProgress) {
                    if (log.isInfoEnabled()) {
                        StringBuilder sbProgress = new StringBuilder();
                        sbProgress.append("Pig progress = ");
                        sbProgress.append(((int) (queryProgress * 100)));
                        sbProgress.append("%");
                        log.info(sbProgress.toString());
                    }
                    lastQueryProgress = queryProgress;
                }
            }
        }

        // bug 1030028: if the input file is empty; hadoop doesn't create the output file!
        Path outputFile = conf.getOutputPath();
        String outputName = outputFile.getName();
        int colon = outputName.indexOf(':');
        if (colon != -1) {
            outputFile = new Path(outputFile.getParent(), outputName.substring(0, colon));
        }

        try {
            ElementDescriptor descriptor = ((HDataStorage) (pom.pigContext.getDfs()))
                    .asElement(outputFile.toString());

            if (success && !descriptor.exists()) {

                // create an empty output file
                PigFile f = new PigFile(outputFile.toString(), false);
                f.store(BagFactory.getInstance().newDefaultBag(), new PigStorage(), pom.pigContext);
            }
        } catch (DataStorageException e) {
            throw WrappedIOException.wrap("Failed to obtain descriptor for " + outputFile.toString(), e);
        }

        if (!success) {
            // go find the error messages
            getErrorMessages(jobClient.getMapTaskReports(status.getJobID()), "map");
            getErrorMessages(jobClient.getReduceTaskReports(status.getJobID()), "reduce");
        } else {
            long timeSpent = 0;

            // NOTE: this call is crashing due to a bug in Hadoop; the bug is known and the patch has not been applied yet.
            TaskReport[] mapReports = jobClient.getMapTaskReports(status.getJobID());
            TaskReport[] reduceReports = jobClient.getReduceTaskReports(status.getJobID());
            for (TaskReport r : mapReports) {
                timeSpent += (r.getFinishTime() - r.getStartTime());
            }
            for (TaskReport r : reduceReports) {
                timeSpent += (r.getFinishTime() - r.getStartTime());
            }
            totalHadoopTimeSpent += timeSpent;
        }
    } catch (Exception e) {
        // Do we need different handling for different exceptions
        e.printStackTrace();
        throw WrappedIOException.wrap(e);
    } finally {
        submitJarFile.delete();
    }
    return success;
}

From source file:org.apache.sysml.runtime.controlprogram.parfor.ResultMergeRemoteMR.java

License:Apache License

@SuppressWarnings({ "unused", "deprecation" })
protected void executeMerge(String fname, String fnameNew, String[] srcFnames, InputInfo ii, OutputInfo oi,
        long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-RMMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job = new JobConf(ResultMergeRemoteMR.class);
    job.setJobName(jobname + _pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    //warning for textcell/binarycell without compare
    boolean withCompare = (fname != null);
    if ((oi == OutputInfo.TextCellOutputInfo || oi == OutputInfo.BinaryCellOutputInfo) && !withCompare
            && ResultMergeLocalFile.ALLOW_COPY_CELLFILES)
        LOG.warn("Result merge for " + OutputInfo.outputInfoToString(oi)
                + " without compare can be realized more efficiently with LOCAL_FILE than REMOTE_MR.");

    try {
        Path pathCompare = null;
        Path pathNew = new Path(fnameNew);

        /////
        //configure the MR job
        if (withCompare) {
            FileSystem fs = IOUtilFunctions.getFileSystem(pathNew, job);
            pathCompare = new Path(fname).makeQualified(fs);
            MRJobConfiguration.setResultMergeInfo(job, pathCompare.toString(), ii,
                    LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen,
                    bclen);
        } else
            MRJobConfiguration.setResultMergeInfo(job, "null", ii,
                    LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, bclen,
                    bclen);

        //set mappers, reducers, combiners
        job.setMapperClass(ResultMergeRemoteMapper.class);
        job.setReducerClass(ResultMergeRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            //setup partitioning, grouping, sorting for composite key (old API)
            job.setPartitionerClass(ResultMergeRemotePartitioning.class); //partitioning
            job.setOutputValueGroupingComparator(ResultMergeRemoteGrouping.class); //grouping
            job.setOutputKeyComparatorClass(ResultMergeRemoteSorting.class); //sorting

            job.setMapOutputKeyClass(ResultMergeTaggedMatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixBlock.class);
        }

        //set input format 
        job.setInputFormat(ii.inputFormatClass);

        //set the input path 
        Path[] paths = null;
        if (withCompare) {
            paths = new Path[srcFnames.length + 1];
            paths[0] = pathCompare;
            for (int i = 1; i < paths.length; i++)
                paths[i] = new Path(srcFnames[i - 1]);
        } else {
            paths = new Path[srcFnames.length];
            for (int i = 0; i < paths.length; i++)
                paths[i] = new Path(srcFnames[i]);
        }
        FileInputFormat.setInputPaths(job, paths);

        //set output format
        job.setOutputFormat(oi.outputFormatClass);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        FileOutputFormat.setOutputPath(job, pathNew);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = _numReducers;
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            reducerGroups = Math.max(rlen / brlen, 1) * Math.max(clen / bclen, 1);
        else //textcell/binarycell
            reducerGroups = Math.max((rlen * clen) / StagingFileUtils.CELL_BUFFER_SIZE, 1);
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //set up custom map/reduce configurations 
        DMLConfig config = ConfigurationManager.getDMLConfig();
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS, "true");
        //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS_CODEC, "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, _replication);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job   

        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}
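
The SystemML job above pairs setOutputKeyComparatorClass (sorting) with setOutputValueGroupingComparator (grouping), which is the old-API composite-key, or secondary-sort, pattern. A condensed, hypothetical version of that wiring using plain Text keys of the form "group<TAB>value" might look like the sketch below; the class names and key layout are assumptions, and a matching Partitioner on the group part (analogous to ResultMergeRemotePartitioning above) would also be needed.

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.JobConf;

// Hypothetical secondary-sort wiring: assumes every key is a Text of the
// form "group\tvalue" where the second field is numeric.
public class SecondarySortConfig {

    // Full ordering: by group first, then by the numeric suffix.
    public static class CompositeSortComparator extends WritableComparator {
        public CompositeSortComparator() {
            super(Text.class, true);
        }

        @SuppressWarnings("rawtypes")
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            String[] x = a.toString().split("\t", 2);
            String[] y = b.toString().split("\t", 2);
            int byGroup = x[0].compareTo(y[0]);
            if (byGroup != 0)
                return byGroup;
            return Long.compare(Long.parseLong(x[1]), Long.parseLong(y[1]));
        }
    }

    // Grouping: only the group part decides which keys share one reduce() call.
    public static class GroupComparator extends WritableComparator {
        public GroupComparator() {
            super(Text.class, true);
        }

        @SuppressWarnings("rawtypes")
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            String groupA = a.toString().split("\t", 2)[0];
            String groupB = b.toString().split("\t", 2)[0];
            return groupA.compareTo(groupB);
        }
    }

    public static void configure(JobConf job) {
        job.setMapOutputKeyClass(Text.class);
        job.setOutputKeyComparatorClass(CompositeSortComparator.class); // sorting
        job.setOutputValueGroupingComparator(GroupComparator.class); // grouping
        // A custom Partitioner on the group part is also required but omitted here.
    }
}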