Example usage for org.apache.hadoop.mapred JobConf setNumMapTasks

List of usage examples for org.apache.hadoop.mapred JobConf setNumMapTasks

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setNumMapTasks.

Prototype

public void setNumMapTasks(int n) 

Document

Set the number of map tasks for this job.
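Before the full examples below, here is a minimal sketch of the typical call pattern: configure a map-only job with the old mapred API and pass the desired parallelism to setNumMapTasks. The mapper, formats, paths, and the task count of 16 are placeholders chosen for illustration, and the value is only a hint to the framework, since the configured InputFormat ultimately determines how many map tasks actually run.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class SetNumMapTasksExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SetNumMapTasksExample.class);
        job.setJobName("setNumMapTasks-example");

        // Map-only identity job; input/output paths come from the command line.
        job.setMapperClass(IdentityMapper.class);
        job.setNumReduceTasks(0);

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Request roughly 16 map tasks; the InputFormat may still
        // produce a different number of splits, so this is only a hint.
        job.setNumMapTasks(16);

        JobClient.runJob(job);
    }
}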

Usage

From source file:org.apache.sysml.runtime.matrix.CleanupMR.java

License:Apache License

public static boolean runJob(DMLConfig conf) throws Exception {
    boolean ret = false;

    try {
        JobConf job;
        job = new JobConf(CleanupMR.class);
        job.setJobName("Cleanup-MR");

        //set up SystemML local tmp dir
        String dir = conf.getTextValue(DMLConfig.LOCAL_TMP_DIR);
        MRJobConfiguration.setSystemMLLocalTmpDir(job, dir);

        //set mappers, reducers 
        int numNodes = InfrastructureAnalyzer.getRemoteParallelNodes();
        job.setMapperClass(CleanupMapper.class); //map-only
        job.setNumMapTasks(numNodes); //numMappers
        job.setNumReduceTasks(0);

        //set input/output format, input path
        String inFileName = conf.getTextValue(DMLConfig.SCRATCH_SPACE) + "/cleanup_tasks";
        job.setInputFormat(NLineInputFormat.class);
        job.setOutputFormat(NullOutputFormat.class);

        Path path = new Path(inFileName);
        FileInputFormat.setInputPaths(job, path);
        writeCleanupTasksToFile(path, numNodes);

        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);

        /////
        // execute the MR job         
        RunningJob runjob = JobClient.runJob(job);

        ret = runjob.isSuccessful();
    } catch (Exception ex) {
        //don't raise an exception, just log an error message
        LOG.error("Failed to run cleanup MR job. ", ex);
    }

    return ret;
}

From source file:org.apache.sysml.runtime.matrix.DataGenMR.java

License:Apache License

/**
 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
 *
 * @param inst MR job instruction
 * @param dataGenInstructions array of data gen instructions
 * @param instructionsInMapper instructions in mapper
 * @param aggInstructionsInReducer aggregate instructions in reducer
 * @param otherInstructionsInReducer other instructions in reducer
 * @param numReducers number of reducers
 * @param replication file replication
 * @param resultIndexes result indexes for each random object
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs output file for each random object
 * @param outputInfos output information for each random object
 * @return matrix characteristics for each random object
 * @throws Exception if Exception occurs
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper,
        String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication,
        byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos)
        throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");

    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);

    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];

    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;

    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];

        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();

        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);

        if (mrtype == MRINSTRUCTION_TYPE.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));

                //for obj reuse and preventing repeated buffer re-allocations
                StringBuilder sb = new StringBuilder();

                //seed generation
                Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
                LongStream nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i],
                        randInst.getSparsity());
                PrimitiveIterator.OfLong nnzIter = nnz.iterator();
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    for (long c = 0; c < clens[i]; c += bclens[i]) {
                        long curBlockColSize = Math.min(bclens[i], (clens[i] - c));

                        sb.append((r / brlens[i]) + 1);
                        sb.append(',');
                        sb.append((c / bclens[i]) + 1);
                        sb.append(',');
                        sb.append(curBlockRowSize);
                        sb.append(',');
                        sb.append(curBlockColSize);
                        sb.append(',');
                        sb.append(nnzIter.nextLong());
                        sb.append(',');
                        sb.append(bigrand.nextLong());
                        pw.println(sb.toString());
                        sb.setLength(0);
                        numblocks++;
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            maxsparsity = 1.0; //always dense

            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;

            //handle default 1 to -1 for special case of from>to
            incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);

            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");

            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

            // Compute the number of rows in the sequence
            long numrows = UtilFunctions.getSeqLength(from, to, incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException(
                            "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: "
                                    + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }

            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException(
                        "Unexpected error while processing sequence instruction. Number of columns (" + clens[i]
                                + ") must be equal to 1.");
            else
                clens[i] = 1;

            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                StringBuilder sb = new StringBuilder();

                double temp = from;
                double block_from, block_to;
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

                    // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval) 
                    long bid_i = ((r / brlens[i]) + 1);
                    long bid_j = 1;
                    block_from = temp;
                    block_to = temp + (curBlockRowSize - 1) * incr;
                    temp = block_to + incr; // next block starts from here

                    sb.append(bid_i);
                    sb.append(',');
                    sb.append(bid_j);
                    sb.append(',');
                    sb.append(block_from);
                    sb.append(',');
                    sb.append(block_to);
                    sb.append(',');
                    sb.append(incr);

                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    dataGenInsStr = dataGenInsStr.substring(1);//remove the first ","
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
                ConvertTarget.BLOCK);

        //set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

        //set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

        //set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

        //set up the instructions that will happen in the reducer, after the aggregation instructions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

        //set up the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations 
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //determine degree of parallelism (nmappers: 1<=n<=capacity)
        //TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        //correction max number of mappers on yarn clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmapers = Math
                .max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmapers);

        //set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer,
                resultIndexes);

        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null,
                otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;

        //set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);

        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }

        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform")
                || instructionsInMapper.contains("groupedagg");

        //set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos,
                true, mayContainCtable);

        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }

        //set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);

        //configure reducer
        job.setReducerClass(GMRReducer.class);
        //job.setReducerClass(PassThroughReducer.class);

        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        runjob = JobClient.runJob(job);

        /* Process different counters */

        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }

        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);

    } finally {
        for (String input : inputs)
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file:org.apache.tez.mapreduce.examples.MapredWordCount.java

License:Apache License

/**
 * The main driver for word count map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), MapredWordCount.class);
    conf.setJobName("wordcount");
    LOG.info("Running WordCount job using mapred apis");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            LOG.error("Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            LOG.error("Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        LOG.error("Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}

From source file:org.archive.hadoop.jobs.ArchiveFileExtractor.java

License:Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("Archive File Extractor");

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // turn off speculative execution
    job.setBoolean("mapred.map.tasks.speculative.execution", false);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    //tolerate task exceptions
    job.setBoolean("soft", false);

    int arg = 0;
    int numMaps = 10;

    String DEFAULT_WARC_PATTERN = "software: %s Extractor\r\n" + "format: WARC File Format 1.0\r\n"
            + "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"
            + "publisher: Internet Archive\r\n" + "created: %s\r\n\r\n";

    String warcHeaderString = String.format(DEFAULT_WARC_PATTERN, IAUtils.COMMONS_VERSION,
            DateUtils.getLog17Date(System.currentTimeMillis()));

    while (arg < args.length - 1) {
        if (args[arg].equals("-soft")) {
            job.setBoolean("soft", true);
            arg++;
        } else if (args[arg].equals("-mappers")) {
            arg++;
            numMaps = Integer.parseInt(args[arg]);
            job.setNumMapTasks(numMaps);
            arg++;
        } else if (args[arg].equals("-timestamp14")) {
            arg++;
            String timestamp14 = DateUtils.get14DigitDate(DateUtils.parse14DigitDate(args[arg]));
            job.set("timestamp14", timestamp14);
            arg++;
        } else if (args[arg].equals("-warc-header-local-file")) {
            arg++;
            File f = new File(args[arg]);
            FileInputStream fis = new FileInputStream(f);
            warcHeaderString = IOUtils.toString(fis, "UTF-8");
            arg++;
        } else if (args[arg].equals("-hmacname")) {
            arg++;
            String hmacName = args[arg];
            job.set("hmacName", hmacName);
            arg++;
        } else if (args[arg].equals("-hmacsignature")) {
            arg++;
            String hmacSignature = args[arg];
            job.set("hmacSignature", hmacSignature);
            arg++;
        } else if (args[arg].equals("-timeout")) {
            arg++;
            int taskTimeout = Integer.parseInt(args[arg]);
            job.setInt("mapred.task.timeout", taskTimeout);
            arg++;
        } else if (args[arg].equals("-failpct")) {
            arg++;
            int failPct = Integer.parseInt(args[arg]);
            job.setInt("mapred.max.map.failures.percent", failPct);
            arg++;
        } else {
            break;
        }
    }

    job.set("warcHeaderString", warcHeaderString);

    if (args.length - 2 != arg) {
        printUsage();
        return 1;
    }

    Path inputPath = new Path(args[arg]);
    arg++;

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    Path outputPath = new Path(outputDir);

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(ArchiveFileExtractorMapper.class);
    job.setJarByClass(ArchiveFileExtractor.class);

    TextInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}

From source file:org.archive.wayback.hadoop.CDXSort.java

License:Apache License

/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {

    boolean compressOutput = false;
    boolean dereferenceInputs = false;
    boolean canonicalize = false;
    boolean funkyInput = false;

    JobConf jobConf = new JobConf(getConf(), CDXSort.class);
    jobConf.setJobName("cdxsort");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    List<String> otherArgs = new ArrayList<String>();

    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("--compress-output".equals(args[i])) {
                compressOutput = true;
            } else if ("--funky-input".equals(args[i])) {
                funkyInput = true;
            } else if ("--dereference-inputs".equals(args[i])) {
                dereferenceInputs = true;
            } else if ("--canonicalize".equals(args[i])) {
                canonicalize = true;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Make sure there are exactly 3 parameters left: split input output
    if (otherArgs.size() != 3) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 3.");
        return printUsage();
    }

    String splitPath = otherArgs.get(0);
    String inputPath = otherArgs.get(1);
    String outputPath = otherArgs.get(2);

    // load the split file, find and set the number of reduces
    AlphaPartitioner partitioner = new AlphaPartitioner();
    File localSplitFile = new File(splitPath);
    FileInputStream fis = new FileInputStream(localSplitFile);
    InputStreamReader isr = new InputStreamReader(fis, ByteOp.UTF8);
    BufferedReader bis = new BufferedReader(isr);
    //      try {
    //         partitioner.loadBoundaries(bis);
    //      } catch (IOException except) {
    //         System.err.println("ERROR: Problem loading file " + splitPath);
    //         return printUsage(); // exits
    //      }
    //      jobConf.setNumReduceTasks(partitioner.getNumPartitions());
    //
    //      // copy the split file into the FS, add to the DistributedCache:
    ////      AlphaPartitioner.setPartitionFile(jobConf, localSplitFile);
    //      AlphaPartitioner.setSplitCache(jobConf, localSplitFile);
    //      System.err.println("uploaded split file to FS and DistributedCache");
    //
    //      // Set job configs:
    //      jobConf.setInputFormat(TextInputFormat.class);
    //
    //      jobConf.setOutputFormat(TextOutputFormat.class);
    //      if (canonicalize) {
    //         jobConf.setMapperClass(CDXCanonicalizerMapClass.class);
    //      } else {
    //         jobConf.setMapperClass(CDXMapClass.class);
    //      }
    //      jobConf.setOutputKeyClass(Text.class);
    //      jobConf.setOutputValueClass(Text.class);
    //      jobConf.set("mapred.textoutputformat.separator", " ");
    //      jobConf.setPartitionerClass(AlphaPartitioner.class);

    int inputCount = 0;
    // Set job input:
    if (dereferenceInputs) {

        // SO SLOW... can't add one at a time...
        //         FileReader is2 = new FileReader(new File(inputPath));
        //         BufferedReader bis2 = new BufferedReader(is2);
        //         while (true) {
        //            String line = bis2.readLine();
        //            if (line == null) {
        //               break;
        //            }
        //            FileInputFormat.addInputPath(jobConf, new Path(line));
        //            inputCount++;
        //            System.err.println("Added path(" + inputCount + "): " + line);
        //         }

        // PASS 2:
        //         FileReader is2 = new FileReader(new File(inputPath));
        //         BufferedReader bis2 = new BufferedReader(is2);
        //         ArrayList<String> list = new ArrayList<String>();
        //         
        //         while (true) {
        //            String line = bis2.readLine();
        //            if (line == null) {
        //               break;
        //            }
        //            list.add(line);
        //            inputCount++;
        //         }
        //         Path arr[] = new Path[list.size()];
        //         for(int i=0; i < list.size(); i++) {
        //            arr[i] = new Path(list.get(i));
        //         }
        //         FileInputFormat.setInputPaths(jobConf, arr);

        // PASS 3:
        if (funkyInput) {
            jobConf.setMapperClass(FunkyDeReffingCDXCanonicalizerMapClass.class);
        } else {
            jobConf.setMapperClass(DeReffingCDXCanonicalizerMapClass.class);
        }
        FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
        inputCount = 1;

    } else {
        FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
        inputCount = 1;
    }

    // Set job output:
    FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));

    if (compressOutput) {
        FileOutputFormat.setCompressOutput(jobConf, true);
        FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class);
    }

    //      System.out.println("Running on " + cluster.getTaskTrackers()
    //            + " nodes, processing " + inputCount + " files/directories"
    //            + " into " + outputPath + " with "
    //            + partitioner.getNumPartitions() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}

From source file:org.asayler.WikiTitleCount.java

License:Apache License

/**
 * The main driver for wikititlecount map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WikiTitleCount.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    int num_maps = 1;
    int num_reducers = 1;

    conf.setJobName("wikititlecount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    /** Set Default Mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    /** Set Default Reducers */
    num_reducers = (int) (cluster.getMaxReduceTasks() * 0.9);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            other_args.add(args[i]);
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set Mappers and Reducer */
    conf.setNumMapTasks(num_maps);
    conf.setNumReduceTasks(num_reducers);

    JobClient.runJob(conf);
    return 0;
}

From source file:org.asayler.WikiTitleSort.java

License:Apache License

/**
 * The main driver for the wikititlesort map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WikiTitleSort.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    int num_maps = 1;
    final int num_reducers = 1;

    conf.setJobName("wikititlesort");

    conf.setMapperClass(MapClass.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    /** Set Default Mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            other_args.add(args[i]);
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set Mappers and Reducer */
    conf.setNumMapTasks(num_maps);
    conf.setNumReduceTasks(num_reducers);

    JobClient.runJob(conf);
    return 0;
}

From source file:org.cloudata.core.PerformanceTest.java

License:Apache License

private void runNIsMoreThanOne(final String cmd) throws IOException {
    checkTable();

    // Run a mapreduce job.  Run as many maps as asked-for clients.
    // Before we start up the job, write out an input file with instruction
    // per client regards which row they are to start on.
    Path inputDir = writeInputFile(this.conf);
    this.conf.set(EvaluationMapTask.CMD_KEY, cmd);
    JobConf job = new JobConf(this.conf, this.getClass());
    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormat(TextInputFormat.class);
    job.setJobName("Cloudata Performance Evaluation");
    job.setMapperClass(EvaluationMapTask.class);
    job.setMaxMapAttempts(1);
    job.setMaxReduceAttempts(1);
    job.setNumMapTasks(this.N * 10); // Ten maps per client.
    job.setNumReduceTasks(1);
    job.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(inputDir, "outputs"));
    JobClient.runJob(job);
}

From source file:org.cloudata.core.testjob.performance.ManyTableJob.java

License:Apache License

public static Path putData() throws IOException {
    CloudataConf nconf = new CloudataConf();

    JobConf jobConf = new JobConf(ManyTableJob.class);
    jobConf.set("user.name", nconf.getUserId());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    jobConf.setJobName("ManyTableJob_Put" + "(" + new Date() + ")");

    jobConf.setLong("mapred.task.timeout", 30 * 60 * 1000);

    Path outputPath = new Path("ManyTableJob_KEY_" + System.currentTimeMillis());

    FileOutputFormat.setOutputPath(jobConf, outputPath);

    //<MAP>
    jobConf.setMapperClass(ManyTablePutMap.class);
    jobConf.setInputFormat(SimpleInputFormat.class);
    jobConf.setNumMapTasks(numOfTables);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setNumReduceTasks(0);
    //</REDUCE>

    try {
        //Run Job
        JobClient.runJob(jobConf);
        return outputPath;
    } finally {
        //delete temp output path
        FileSystem fs = FileSystem.get(jobConf);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}

From source file:org.cloudata.core.testjob.performance.TestMultiThreadCTable.java

License:Apache License

public static Path putData(String outputDir) throws IOException {
    CloudataConf nconf = new CloudataConf();

    JobConf jobConf = new JobConf(TestMultiThreadCTable.class);
    jobConf.set("user.name", nconf.getUserId());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    jobConf.setJobName("TestMultiThreadNTable_" + "(" + new Date() + ")");

    jobConf.setLong("mapred.task.timeout", 30 * 60 * 1000);

    Path outputPath = new Path(outputDir);

    FileOutputFormat.setOutputPath(jobConf, outputPath);

    JobClient jobClient = new JobClient();

    int numOfRowPerMap = 100000 / jobClient.getClusterStatus().getMaxMapTasks();
    jobConf.setInt("numOfRowPerMap", numOfRowPerMap);
    //<MAP>
    jobConf.setMapperClass(PutDataMap.class);
    jobConf.setInputFormat(SimpleInputFormat.class);
    jobConf.setNumMapTasks(jobClient.getClusterStatus().getMaxMapTasks());
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setNumReduceTasks(0);
    //</REDUCE>

    try {
        //Run Job
        JobClient.runJob(jobConf);
        return outputPath;
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}