Example usage for org.apache.hadoop.mapreduce MRJobConfig NUM_MAPS

List of usage examples for org.apache.hadoop.mapreduce MRJobConfig NUM_MAPS

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce MRJobConfig NUM_MAPS.

Prototype

String NUM_MAPS

To view the source code for org.apache.hadoop.mapreduce MRJobConfig NUM_MAPS.

Click Source Link

Usage

From source file:alluxio.checker.MapReduceIntegrationChecker.java

License:Apache License

/**
 * Implements MapReduce with Alluxio integration checker.
 *
 * @return 0 for success, 2 for unable to find Alluxio classes, 1 otherwise
 *///from   w  w w.  j  a va  2 s  .co  m
private int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String numMaps = new GenericOptionsParser(conf, args).getRemainingArgs()[0];
    conf.set(MRJobConfig.NUM_MAPS, numMaps);
    createHdfsFilesystem(conf);

    Job job = Job.getInstance(conf, "MapReduceIntegrationChecker");
    job.setJarByClass(MapReduceIntegrationChecker.class);
    job.setMapperClass(CheckerMapper.class);
    job.setCombinerClass(CheckerReducer.class);
    job.setReducerClass(CheckerReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(EmptyInputFormat.class);
    FileOutputFormat.setOutputPath(job, mOutputFilePath);

    try {
        if (!job.waitForCompletion(true)) {
            return 1;
        }
        Status resultStatus = generateReport();
        return resultStatus.equals(Status.SUCCESS) ? 0
                : (resultStatus.equals(Status.FAIL_TO_FIND_CLASS) ? 2 : 1);
    } finally {
        if (mFileSystem.exists(mOutputFilePath)) {
            mFileSystem.delete(mOutputFilePath, true);
        }
        mFileSystem.close();
    }
}

From source file:co.cask.hydrator.plugin.db.batch.source.DBSource.java

License:Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    sourceConfig.substituteMacros(context);
    LOG.debug(//from   ww w . ja va2 s .  co m
            "pluginType = {}; pluginName = {}; connectionString = {}; importQuery = {}; "
                    + "boundingQuery = {}",
            sourceConfig.jdbcPluginType, sourceConfig.jdbcPluginName, sourceConfig.connectionString,
            sourceConfig.getImportQuery(), sourceConfig.getBoundingQuery());
    Configuration hConf = new Configuration();
    hConf.clear();

    // Load the plugin class to make sure it is available.
    Class<? extends Driver> driverClass = context.loadPluginClass(getJDBCPluginId());
    if (sourceConfig.user == null && sourceConfig.password == null) {
        DBConfiguration.configureDB(hConf, driverClass.getName(), sourceConfig.connectionString);
    } else {
        DBConfiguration.configureDB(hConf, driverClass.getName(), sourceConfig.connectionString,
                sourceConfig.user, sourceConfig.password);
    }
    DataDrivenETLDBInputFormat.setInput(hConf, DBRecord.class, sourceConfig.getImportQuery(),
            sourceConfig.getBoundingQuery(), sourceConfig.getEnableAutoCommit());
    if (sourceConfig.numSplits == null || sourceConfig.numSplits != 1) {
        hConf.set(DBConfiguration.INPUT_ORDER_BY_PROPERTY, sourceConfig.splitBy);
    }
    if (sourceConfig.numSplits != null) {
        hConf.setInt(MRJobConfig.NUM_MAPS, sourceConfig.numSplits);
    }
    context.setInput(Input.of(sourceConfig.referenceName,
            new SourceInputFormatProvider(DataDrivenETLDBInputFormat.class, hConf)));
}

From source file:co.nubetech.apache.hadoop.mapred.DataDrivenDBInputFormat.java

License:Apache License

/** {@inheritDoc} */
public List<InputSplit> getSplits(Configuration job) throws IOException {

    int targetNumTasks = job.getInt(MRJobConfig.NUM_MAPS, 1);
    if (1 == targetNumTasks) {
        // There's no need to run a bounding vals query; just return a split
        // that separates nothing. This can be considerably more optimal for
        // a//from ww  w  .ja  v  a2 s . co  m
        // large table with no index.
        List<InputSplit> singletonSplit = new ArrayList<InputSplit>();
        singletonSplit.add(
                new org.apache.hadoop.mapreduce.lib.db.DataDrivenDBInputFormat.DataDrivenDBInputSplit("1=1",
                        "1=1"));
        return singletonSplit;
    }

    ResultSet results = null;
    Statement statement = null;
    Connection connection = getConnection();
    try {
        statement = connection.createStatement();

        results = statement.executeQuery(getBoundingValsQuery());
        results.next();

        // Based on the type of the results, use a different mechanism
        // for interpolating split points (i.e., numeric splits, text
        // splits,
        // dates, etc.)
        int sqlDataType = results.getMetaData().getColumnType(1);
        DBSplitter splitter = getSplitter(sqlDataType);
        if (null == splitter) {
            throw new IOException("Unknown SQL data type: " + sqlDataType);
        }

        //return convertSplit(splitter.split(job, results, getDBConf()
        //      .getInputOrderBy()));
        return splitter.split(job, results, getDBConf().getInputOrderBy());
    } catch (SQLException e) {
        throw new IOException(e.getMessage());
    } finally {
        // More-or-less ignore SQL exceptions here, but log in case we need
        // it.
        try {
            if (null != results) {
                results.close();
            }
        } catch (SQLException se) {
            LOG.debug("SQLException closing resultset: " + se.toString());
        }

        try {
            if (null != statement) {
                statement.close();
            }
        } catch (SQLException se) {
            LOG.debug("SQLException closing statement: " + se.toString());
        }

        try {
            connection.commit();
            closeConnection();
        } catch (SQLException se) {
            LOG.debug("SQLException committing split transaction: " + se.toString());
        }
    }
}

From source file:com.hortonworks.pso.data.generator.mapreduce.DataGenInputFormat.java

License:Apache License

/**
 * Create the desired number of splits, dividing the number of rows
 * between the mappers./*w w w .j  a  v a2 s  . c o  m*/
 */
public List<InputSplit> getSplits(JobContext job) {
    long totalRows = getNumberOfRows(job);
    int numSplits = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
    LOG.info("Generating " + totalRows + " using " + numSplits);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    long currentRow = 0;
    for (int split = 0; split < numSplits; ++split) {
        long goal = (long) Math.ceil(totalRows * (double) (split + 1) / numSplits);
        splits.add(new DataGenInputSplit(currentRow, goal - currentRow));
        currentRow = goal;
    }
    return splits;
}

From source file:com.msd.gin.halyard.tools.HalyardParallelExport.java

License:Apache License

private static void printHelp(Options options) {
    new HelpFormatter().printHelp(100, "pexport",
            "Exports graph or table data from Halyard RDF store, using parallalel SPARQL query", options,
            "Example: pexport [-D" + MRJobConfig.NUM_MAPS + "=10] [-D" + MRJobConfig.QUEUE_NAME
                    + "=proofofconcepts] -s my_dataset -q '\nPREFIX hlyd: <http://gin.msd.com/halyard/>\nselect * where {?s ?p ?o .\nFILTER (hlyd:parallel_split_by (?s))}' -t hdfs:/my_folder/my_data{0}.csv.gz",
            true);/*from w ww. j  av  a  2s.c om*/
}

From source file:com.phantom.hadoop.examples.dancing.DistributedPentomino.java

License:Apache License

public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    if (args.length == 0) {
        System.out.println("Usage: pentomino <output> [-depth #] [-height #] [-width #]");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }// ww  w  .j  a  va  2s  . c o  m
    // check for passed parameters, otherwise use defaults
    int width = conf.getInt(Pentomino.WIDTH, PENT_WIDTH);
    int height = conf.getInt(Pentomino.HEIGHT, PENT_HEIGHT);
    int depth = conf.getInt(Pentomino.DEPTH, PENT_DEPTH);
    for (int i = 0; i < args.length; i++) {
        if (args[i].equalsIgnoreCase("-depth")) {
            depth = Integer.parseInt(args[++i].trim());
        } else if (args[i].equalsIgnoreCase("-height")) {
            height = Integer.parseInt(args[++i].trim());
        } else if (args[i].equalsIgnoreCase("-width")) {
            width = Integer.parseInt(args[++i].trim());
        }
    }
    // now set the values within conf for M/R tasks to read, this
    // will ensure values are set preventing MAPREDUCE-4678
    conf.setInt(Pentomino.WIDTH, width);
    conf.setInt(Pentomino.HEIGHT, height);
    conf.setInt(Pentomino.DEPTH, depth);
    Class<? extends Pentomino> pentClass = conf.getClass(Pentomino.CLASS, OneSidedPentomino.class,
            Pentomino.class);
    int numMaps = conf.getInt(MRJobConfig.NUM_MAPS, DEFAULT_MAPS);
    Path output = new Path(args[0]);
    Path input = new Path(output + "_input");
    FileSystem fileSys = FileSystem.get(conf);
    try {
        Job job = new Job(conf);
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);
        job.setJarByClass(PentMap.class);

        job.setJobName("dancingElephant");
        Pentomino pent = ReflectionUtils.newInstance(pentClass, conf);
        pent.initialize(width, height);
        long inputSize = createInputDirectory(fileSys, input, pent, depth);
        // for forcing the number of maps
        FileInputFormat.setMaxInputSplitSize(job, (inputSize / numMaps));

        // the keys are the prefix strings
        job.setOutputKeyClass(Text.class);
        // the values are puzzle solutions
        job.setOutputValueClass(Text.class);

        job.setMapperClass(PentMap.class);
        job.setReducerClass(Reducer.class);

        job.setNumReduceTasks(1);

        return (job.waitForCompletion(true) ? 0 : 1);
    } finally {
        fileSys.delete(input, true);
    }
}

From source file:com.phantom.hadoop.examples.RandomTextWriter.java

License:Apache License

/**
 * This is the main routine for launching a distributed random write job. It
 * runs 10 maps/node and each node writes 1 gig of data to a DFS file. The
 * reduce doesn't do anything.// w ww  .ja v  a 2s.c  om
 * 
 * @throws IOException
 */
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        return printUsage();
    }

    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
    long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
        System.err.println("Cannot have " + BYTES_PER_MAP + " set to 0");
        return -2;
    }
    long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
            numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
        numMaps = 1;
        conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
    }
    conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

    Job job = new Job(conf);

    job.setJarByClass(RandomTextWriter.class);
    job.setJobName("random-text-writer");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(RandomWriter.RandomInputFormat.class);
    job.setMapperClass(RandomTextMapper.class);

    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    job.setOutputFormatClass(outputFormatClass);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));

    System.out.println("Running " + numMaps + " maps.");

    // reducer NONE
    job.setNumReduceTasks(0);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

    return ret;
}

From source file:com.phantom.hadoop.examples.RandomWriter.java

License:Apache License

/**
 * This is the main routine for launching a distributed random write job. It
 * runs 10 maps/node and each node writes 1 gig of data to a DFS file. The
 * reduce doesn't do anything./* w ww . j  a v a 2s. com*/
 * 
 * @throws IOException
 */
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        System.out.println("Usage: writer <out-dir>");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    Path outDir = new Path(args[0]);
    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
    long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
        System.err.println("Cannot have" + BYTES_PER_MAP + " set to 0");
        return -2;
    }
    long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
            numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
        numMaps = 1;
        conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
    }
    conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

    Job job = new Job(conf);

    job.setJarByClass(RandomWriter.class);
    job.setJobName("random-writer");
    FileOutputFormat.setOutputPath(job, outDir);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setInputFormatClass(RandomInputFormat.class);
    job.setMapperClass(RandomMapper.class);
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    System.out.println("Running " + numMaps + " maps.");

    // reducer NONE
    job.setNumReduceTasks(0);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

    return ret;
}

From source file:org.apache.hadoop.examples.dancing.DistributedPentomino.java

License:Apache License

public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    if (args.length == 0) {
        System.out.println("Usage: pentomino <output> [-depth #] [-height #] [-width #]");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }//from w  ww  . j a  v a2  s.c o  m
    // check for passed parameters, otherwise use defaults
    int width = conf.getInt(Pentomino.WIDTH, PENT_WIDTH);
    int height = conf.getInt(Pentomino.HEIGHT, PENT_HEIGHT);
    int depth = conf.getInt(Pentomino.DEPTH, PENT_DEPTH);
    for (int i = 0; i < args.length; i++) {
        if (args[i].equalsIgnoreCase("-depth")) {
            depth = Integer.parseInt(args[++i].trim());
        } else if (args[i].equalsIgnoreCase("-height")) {
            height = Integer.parseInt(args[++i].trim());
        } else if (args[i].equalsIgnoreCase("-width")) {
            width = Integer.parseInt(args[++i].trim());
        }
    }
    // now set the values within conf for M/R tasks to read, this
    // will ensure values are set preventing MAPREDUCE-4678
    conf.setInt(Pentomino.WIDTH, width);
    conf.setInt(Pentomino.HEIGHT, height);
    conf.setInt(Pentomino.DEPTH, depth);
    Class<? extends Pentomino> pentClass = conf.getClass(Pentomino.CLASS, OneSidedPentomino.class,
            Pentomino.class);
    int numMaps = conf.getInt(MRJobConfig.NUM_MAPS, DEFAULT_MAPS);
    Path output = new Path(args[0]);
    Path input = new Path(output + "_input");
    FileSystem fileSys = FileSystem.get(conf);
    try {
        Job job = Job.getInstance(conf);
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);
        job.setJarByClass(PentMap.class);

        job.setJobName("dancingElephant");
        Pentomino pent = ReflectionUtils.newInstance(pentClass, conf);
        pent.initialize(width, height);
        long inputSize = createInputDirectory(fileSys, input, pent, depth);
        // for forcing the number of maps
        FileInputFormat.setMaxInputSplitSize(job, (inputSize / numMaps));

        // the keys are the prefix strings
        job.setOutputKeyClass(Text.class);
        // the values are puzzle solutions
        job.setOutputValueClass(Text.class);

        job.setMapperClass(PentMap.class);
        job.setReducerClass(Reducer.class);

        job.setNumReduceTasks(1);

        return (job.waitForCompletion(true) ? 0 : 1);
    } finally {
        fileSys.delete(input, true);
    }
}

From source file:org.apache.hadoop.examples.RandomTextWriter.java

License:Apache License

/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.//from  w w  w .j ava2s  . c  om
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        return printUsage();
    }

    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
    long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
        System.err.println("Cannot have " + BYTES_PER_MAP + " set to 0");
        return -2;
    }
    long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
            numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
        numMaps = 1;
        conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
    }
    conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

    Job job = Job.getInstance(conf);

    job.setJarByClass(RandomTextWriter.class);
    job.setJobName("random-text-writer");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(RandomWriter.RandomInputFormat.class);
    job.setMapperClass(RandomTextMapper.class);

    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    job.setOutputFormatClass(outputFormatClass);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));

    System.out.println("Running " + numMaps + " maps.");

    // reducer NONE
    job.setNumReduceTasks(0);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

    return ret;
}