Example usage for org.apache.hadoop.mapred JobConf setMapRunnerClass

List of usage examples for org.apache.hadoop.mapred JobConf setMapRunnerClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setMapRunnerClass.

Prototype

public void setMapRunnerClass(Class<? extends MapRunnable> theClass) 

Source Link

Document

Expert: Set the MapRunnable class for the job.

Usage

From source file:it.crs4.pydoop.pipes.Submitter.java

License:Apache License

private static void setupPipesJob(JobConf conf) throws IOException {
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        conf.setMapRunnerClass(PipesMapRunner.class);
        // Save the user's partitioner and hook in our's.
        setJavaPartitioner(conf, conf.getPartitionerClass());
        conf.setPartitionerClass(PipesPartitioner.class);
    }//from w  ww .jav a2  s.c  om
    if (!getIsJavaReducer(conf)) {
        conf.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            conf.setOutputFormat(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass(Submitter.INPUT_FORMAT, conf.getInputFormat().getClass(), InputFormat.class);
        conf.setInputFormat(PipesNonJavaInputFormat.class);
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        throw new IllegalArgumentException("No application program defined.");
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    if (exec.contains("#")) {
        // set default gdb commands for map and reduce task 
        String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
        setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        IOException ie = new IOException("Problem parsing execable URI " + exec);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}

From source file:ivory.index.BuildIntPostingsForwardIndex.java

License:Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntPostingsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String indexPath = conf.get("Ivory.IndexPath");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildIntPostingsForwardIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);

    conf.setJobName("BuildIntPostingsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getPostingsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    Path postingsIndexPath = new Path(env.getPostingsIndexData());

    if (fs.exists(postingsIndexPath)) {
        sLogger.info("Postings forward index path already exists!");
        return 0;
    }//from  ww  w  .  j  a v  a  2 s  .c  o m
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}

From source file:ivory.preprocess.BuildIntDocVectorsForwardIndex.java

License:Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();
    boolean buildWeighted = conf.getBoolean("Ivory.BuildWeighted", false);

    sLogger.info("Tool: BuildIntDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - BuildWeighted: " + buildWeighted);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    String intDocVectorsPath;//from w w  w.j a  v a  2 s  .c  o m
    String forwardIndexPath;
    if (buildWeighted) {
        intDocVectorsPath = env.getWeightedIntDocVectorsDirectory();
        forwardIndexPath = env.getWeightedIntDocVectorsForwardIndex();
    } else {
        intDocVectorsPath = env.getIntDocVectorsDirectory();
        forwardIndexPath = env.getIntDocVectorsForwardIndex();
    }

    if (!fs.exists(new Path(intDocVectorsPath))) {
        sLogger.info("Error: IntDocVectors don't exist!");
        return 0;
    }

    if (fs.exists(new Path(forwardIndexPath))) {
        sLogger.info("IntDocVectorIndex already exists: skipping!");
        return 0;
    }

    conf.setJobName("BuildIntDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(intDocVectorsPath);
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}

From source file:ivory.preprocess.BuildTermDocVectorsForwardIndex.java

License:Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildTermDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildTermDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    if (!fs.exists(new Path(env.getTermDocVectorsDirectory()))) {
        sLogger.info("Error: TermDocVectors don't exist!");
        return 0;
    }/*ww w.jav  a2  s.co  m*/

    if (fs.exists(new Path(env.getTermDocVectorsForwardIndex()))) {
        sLogger.info("TermDocVectorIndex already exists: skipping!");
        return 0;
    }

    conf.setJobName("BuildTermDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}

From source file:net.peacesoft.nutch.crawl.ReFetcher.java

License:Apache License

public void fetch(Path segment, int threads) throws IOException {

    checkConfiguration();//from www . ja  va2 s  .c om

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("ReFetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(ReFetcher.InputFormat.class);

    job.setMapRunnerClass(ReFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:net.sf.katta.indexing.IndexerJob.java

License:Apache License

public void startIndexer(String path, String finalDestination, int numOfShards) throws IOException {
    // create job conf with class pointing into job jar.
    JobConf jobConf = new JobConf(IndexerJob.class);
    jobConf.setJobName("indexer");
    jobConf.setMapRunnerClass(Indexer.class);
    // alternative use a text file and a TextInputFormat
    jobConf.setInputFormat(SequenceFileInputFormat.class);

    Path input = new Path(path);
    FileInputFormat.setInputPaths(jobConf, input);
    // we just set the output path to make hadoop happy.
    FileOutputFormat.setOutputPath(jobConf, new Path(finalDestination));
    // setting the folder where lucene indexes will be copied when finished.
    jobConf.set("finalDestination", finalDestination);
    // important to switch spec exec off.
    // We dont want to have something duplicated.
    jobConf.setSpeculativeExecution(false);

    // The num of map tasks is equal to the num of input splits.
    // The num of input splits by default is equal to the num of hdf blocks
    // for the input file(s). To get the right num of shards we need to
    // calculate the best input split size.

    FileSystem fs = FileSystem.get(input.toUri(), jobConf);
    FileStatus[] status = fs.globStatus(input);
    long size = 0;
    for (FileStatus fileStatus : status) {
        size += fileStatus.getLen();/*w  w  w  . j av a2s .  c o  m*/
    }
    long optimalSplisize = size / numOfShards;
    jobConf.set("mapred.min.split.size", "" + optimalSplisize);

    // give more mem to lucene tasks.
    jobConf.set("mapred.child.java.opts", "-Xmx2G");
    jobConf.setNumMapTasks(1);
    jobConf.setNumReduceTasks(0);
    JobClient.runJob(jobConf);
}

From source file:org.apache.avro.mapred.tether.TetherJob.java

License:Apache License

private static void setupTetherJob(JobConf job) throws IOException {
    job.setMapRunnerClass(TetherMapRunner.class);
    job.setPartitionerClass(TetherPartitioner.class);
    job.setReducerClass(TetherReducer.class);

    job.setInputFormat(TetherInputFormat.class);
    job.setOutputFormat(TetherOutputFormat.class);

    job.setOutputKeyClass(TetherData.class);
    job.setOutputKeyComparatorClass(TetherKeyComparator.class);
    job.setMapOutputValueClass(NullWritable.class);

    // set the map output key class to TetherData
    job.setMapOutputKeyClass(TetherData.class);

    // add TetherKeySerialization to io.serializations
    Collection<String> serializations = job.getStringCollection("io.serializations");
    if (!serializations.contains(TetherKeySerialization.class.getName())) {
        serializations.add(TetherKeySerialization.class.getName());
        job.setStrings("io.serializations", serializations.toArray(new String[0]));
    }/*from w ww .  ja va 2s  .  c  o  m*/

    // determine whether the executable should be added to the cache.
    if (job.getBoolean(TETHER_EXEC_CACHED, false)) {
        DistributedCache.addCacheFile(getExecutable(job), job);
    }
}

From source file:org.apache.nutch.fetcher.Fetcher.java

License:Apache License

public void fetch(Path segment, int threads) throws IOException {

    checkConfiguration();// ww  w .  ja  v  a2 s .  c o m

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(Fetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.fetcher.Fetcher2.java

License:Apache License

public void fetch(Path segment, int threads, boolean parsing) throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting");
        LOG.info("Fetcher: segment: " + segment);
    }/* www.  j av  a2 s .c om*/

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
    job.setBoolean("fetcher.parse", parsing);

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    job.setInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(Fetcher2.class);

    job.setOutputPath(segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FetcherOutput.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: done");
    }
}

From source file:org.apache.nutch.fetcher.NIOFetcher.java

License:Apache License

public void fetch(Path segment) throws IOException {

    checkConfiguration();/*  ww  w.j  a  va  2 s . c o m*/

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("NIOFetcher: starting at " + sdf.format(start));
        LOG.info("NIOFetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is
    // enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("NIOFetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("NIOFetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(FetcherInputFormat.class);

    job.setMapRunnerClass(NIOFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("NIOFetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}