List of usage examples for the org.apache.hadoop.mapreduce.Job constructor
Job(Configuration conf) throws IOException
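Before the per-project examples below, here is a minimal, self-contained sketch of the pattern they all share: build a Configuration, pass it to the Job constructor, configure the job, and wait for completion. It is not taken from any of the listed projects; the class name and the input/output paths passed via args are placeholders. Note that Job(Configuration) is deprecated in newer Hadoop releases in favor of Job.getInstance(Configuration), but it is the constructor these examples use.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobConstructorExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Construct the job from an existing Configuration (deprecated in favor of Job.getInstance).
        Job job = new Job(conf);
        job.setJobName("identity-passthrough");
        job.setJarByClass(JobConstructorExample.class);
        // No mapper/reducer classes are set: the default Mapper and Reducer pass records through.
        // The default TextInputFormat produces LongWritable offsets and Text lines.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}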
From source file:com.savy3.nonequijoin.MapOutputSampler.java
License:Apache License
/**
 * Driver for InputSampler from the command line. Configures a JobConf
 * instance and calls {@link #writePartitionFile}.
 */
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    ArrayList<String> otherArgs = new ArrayList<String>();
    Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-inFormat".equals(args[i])) {
                job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
            } else if ("-keyClass".equals(args[i])) {
                job.setMapOutputKeyClass(Class.forName(args[++i]).asSubclass(WritableComparable.class));
            } else if ("-splitSample".equals(args[i])) {
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new SplitSampler<K, V>(numSamples, maxSplits);
            } else if ("-splitRandom".equals(args[i])) {
                System.out.println("Random sampling");
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else if ("-splitInterval".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new IntervalSampler<K, V>(pcnt, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    if (job.getNumReduceTasks() <= 1) {
        System.err.println("Sampler requires more than one reducer");
        return printUsage();
    }
    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }
    if (null == sampler) {
        sampler = new RandomSampler<K, V>(0.1, 10000, 10);
    }
    System.out.println("before paths");
    Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
    TotalOrderPartitioner.setPartitionFile(getConf(), outf);
    for (String s : otherArgs) {
        FileInputFormat.addInputPath(job, new Path(s));
    }
    MapOutputSampler.<K, V>writePartitionFile(job, sampler);
    return 0;
}
From source file:com.sematext.hbase.hut.TestHBaseHut.java
License:Apache License
public static void processUpdatesWithMrJob(Configuration configuration, HTable hTable, int mapBufferSize,
        long mapBufferSizeInBytes, int minRecordsToCompact, UpdateProcessor up)
        throws IOException, InterruptedException, ClassNotFoundException {
    System.out.println("Table contents BEFORE processing with MR job:");
    System.out.println(DebugUtil.getContentAsText(hTable));

    configuration.set("hut.mr.buffer.size", String.valueOf(mapBufferSize));
    configuration.set("hut.mr.buffer.size.bytes", String.valueOf(mapBufferSizeInBytes));
    configuration.set("hut.processor.minRecordsToCompact", String.valueOf(minRecordsToCompact));

    Job job = new Job(configuration);
    UpdatesProcessingMrJob.initJob(Bytes.toString(hTable.getTableName()), new Scan(), up, job);

    boolean success = job.waitForCompletion(true);
    Assert.assertTrue(success);

    System.out.println("Table contents AFTER processing with MR job:");
    System.out.println(DebugUtil.getContentAsText(hTable));
}
From source file:com.sematext.hbase.hut.TestHBaseHut.java
License:Apache License
@Test
public void testPartialUpdatesProcessingMrJob() throws IOException, InterruptedException, ClassNotFoundException {
    try {
        // Writing data
        for (int i = 0; i < 15; i++) {
            byte[] company;
            if (i % 2 == 0) {
                company = FORD;
            } else {
                company = CHRYSLER;
            }
            recordSale(hTable, company, i);
            Thread.sleep(200);
        }
        recordSale(hTable, TOYOTA, 23);

        System.out.println(DebugUtil.getContentAsText(hTable));

        Configuration configuration = testingUtility.getConfiguration();
        configuration.set("hut.mr.buffer.size", String.valueOf(10));
        configuration.set("hut.processor.tsMod", String.valueOf(300));

        Job job = new Job(configuration);
        UpdatesProcessingMrJob.initJob(TABLE_NAME, new Scan(), new StockSaleUpdateProcessor(), job);

        boolean success = job.waitForCompletion(true);
        Assert.assertTrue(success);

        // TODO: add code verification of proper partial compaction instead of manually observing in output
        System.out.println(DebugUtil.getContentAsText(hTable));

        StockSaleUpdateProcessor updateProcessor = new StockSaleUpdateProcessor();
        verifyLastSalesWithCompation(hTable, updateProcessor, FORD, new int[] { 14, 12, 10, 8, 6 });
        verifyLastSalesWithCompation(hTable, updateProcessor, CHRYSLER, new int[] { 13, 11, 9, 7, 5 });
        verifyLastSalesWithCompation(hTable, updateProcessor, TOYOTA, new int[] { 23 });
    } finally {
        // TODO: do we really need try/finally block here?
        testingUtility.shutdownMiniMapReduceCluster();
    }
}
From source file:com.sequenceiq.yarntest.mr.QuasiMonteCarlo.java
License:Apache License
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static JobID submitPiEstimationMRApp(String jobName, int numMaps, long numPoints, Path tmpDir,
        Configuration conf) throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(conf);
    // setup job conf
    job.setJobName(jobName);
    job.setJarByClass(QuasiMonteCarlo.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(QmcMapper.class);
    job.setReducerClass(QmcReducer.class);
    job.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    job.setSpeculativeExecution(false);

    // setup input/output directories
    final Path inDir = new Path(tmpDir, "in");
    final Path outDir = new Path(tmpDir, "out");
    FileInputFormat.setInputPaths(job, inDir);
    FileOutputFormat.setOutputPath(job, outDir);

    final FileSystem fs = FileSystem.get(conf);
    if (fs.exists(tmpDir)) {
        fs.delete(tmpDir, true);
        // throw new IOException("Tmp directory " + fs.makeQualified(tmpDir)
        //     + " already exists. Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    // try {
    // generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
        final Path file = new Path(inDir, "part" + i);
        final LongWritable offset = new LongWritable(i * numPoints);
        final LongWritable size = new LongWritable(numPoints);
        final SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, LongWritable.class,
                LongWritable.class, CompressionType.NONE);
        try {
            writer.append(offset, size);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i);
    }

    // start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    job.submit();
    // final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
    // System.out.println("Job Finished in " + duration + " seconds");
    return job.getJobID();
    // } finally {
    //     fs.delete(tmpDir, true);
    // }
}
From source file:com.shmsoft.dmass.main.MRFreeEedProcess.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // inventory dir holds all package (zip) files resulting from stage
    String projectFileName = args[0];
    String outputPath = args[1];
    logger.info("Running Hadoop job");
    logger.info("Input project file = " + projectFileName);
    logger.info("Output path = " + outputPath);

    // Hadoop configuration class
    Configuration configuration = getConf();
    // No speculative execution! Do not process the same file twice
    configuration.set("mapred.reduce.tasks.speculative.execution", "false");

    // TODO even in local mode, the first argument should not be the inventory
    // but write a complete project file instead
    Project project = Project.getProject();
    if (project == null || project.isEmpty()) {
        // configure Hadoop input files
        System.out.println("Reading project file " + projectFileName);
        project = new Project().loadFromFile(new File(projectFileName));
        Project.setProject(project);
    }
    project.setProperty(ParameterProcessing.OUTPUT_DIR_HADOOP, outputPath);

    // send complete project information to all mappers and reducers
    configuration.set(ParameterProcessing.PROJECT, project.toString());

    Settings.load();
    configuration.set(ParameterProcessing.SETTINGS_STR, Settings.getSettings().toString());
    configuration.set(ParameterProcessing.METADATA_FILE,
            Files.toString(new File(ColumnMetadata.metadataNamesFile), Charset.defaultCharset()));

    Job job = new Job(configuration);
    job.setJarByClass(MRFreeEedProcess.class);
    job.setJobName("MRFreeEedProcess");

    // Hadoop processes key-value pairs
    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(MapWritable.class);

    // set map and reduce classes
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    // Hadoop TextInputFormat class
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // String delim = "\u0001";
    // configuration.set("mapred.textoutputformat.separator", delim);
    // configuration.set("mapreduce.output.textoutputformat.separator", delim);

    logger.debug("project.isEnvHadoop() = {} ", project.isEnvHadoop());
    String inputPath = projectFileName;
    if (project.isEnvHadoop() || Settings.getSettings().isHadoopDebug()) {
        inputPath = formInputPath(project);
    }

    logger.debug("Ready to run, inputPath = {}, outputPath = {}", inputPath, outputPath);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    SHMcloudLogging.init(false);

    if (Settings.getSettings().isHadoopDebug()) {
        if (new File(outputPath).exists()) {
            Util.deleteDirectory(new File(outputPath));
        }
    }

    SolrIndex.getInstance().init();

    boolean success = job.waitForCompletion(true);
    if (project.isEnvHadoop() && project.isFsS3()) {
        transferResultsToS3(outputPath);
    }
    SolrIndex.getInstance().destroy();
    return success ? 0 : 1;
}
From source file:com.shopzilla.hadoop.mapreduce.MiniMRClusterContextMRTest.java
License:Apache License
@Test
public void testWordCount() throws Exception {
    Path input = new Path("/user/test/keywords_data");
    Path output = new Path("/user/test/word_count");

    Job job = new Job(configuration);
    job.setJobName("Word Count Test");
    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(SumReducer.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    assertTrue("All files from /data classpath directory should have been copied into HDFS",
            miniMRClusterContext.getFileSystem().exists(input));

    job.waitForCompletion(true);
    assertTrue("Output file should have been created",
            miniMRClusterContext.getFileSystem().exists(output));

    final LinkedList<String> expectedLines = new LinkedList<String>();
    expectedLines.add("goodbye\t1");
    expectedLines.add("hello\t1");
    expectedLines.add("world\t2");

    miniMRClusterContext.processData(output, new Function<String, Void>() {
        @Override
        public Void apply(String line) {
            assertEquals(expectedLines.pop(), line);
            return null;
        }
    });
    assertEquals(0, expectedLines.size());
}
From source file:com.splout.db.benchmark.IdentityJob.java
License:Apache License
@Override
public int run(String[] params) throws Exception {
    // Validate params etc
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Identity Job");
    try {
        jComm.parse(params);
    } catch (ParameterException e) {
        System.err.println(e.getMessage());
        jComm.usage();
        System.exit(-1);
    }

    Path outP = new Path(outputPath);
    HadoopUtils.deleteIfExists(FileSystem.get(conf), outP);

    if (pangoolSchema == null) {
        // Use plain Hadoop API
        Job job = new Job(conf);
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outP);
        job.waitForCompletion(true);
    } else {
        if (groupBy == null) {
            System.err.println("If pangoolSchema is used, groupBy must also be used.");
            jComm.usage();
            System.exit(-1);
        }

        Schema schema = new Schema("sch", Fields.parse(pangoolSchema));
        Path inputP = new Path(inputPath);

        // Use Pangool API - parse CSV, etc
        TupleMRBuilder builder = new TupleMRBuilder(conf);
        TupleTextInputFormat parsingInputFormat = new TupleTextInputFormat(schema, skipHeading, false,
                separator.charAt(0), quotes.charAt(0), escape.charAt(0), FieldSelector.NONE, null);
        TupleTextOutputFormat outputFormat = new TupleTextOutputFormat(schema, false, separator.charAt(0),
                quotes.charAt(0), escape.charAt(0));

        builder.addIntermediateSchema(schema);
        builder.addInput(inputP, parsingInputFormat, new IdentityTupleMapper());
        builder.setGroupByFields(groupBy);
        builder.setOutput(outP, outputFormat, ITuple.class, NullWritable.class);
        builder.setTupleReducer(new IdentityTupleReducer());
        builder.setJarByClass(this.getClass());
        builder.createJob().waitForCompletion(true);
    }
    return 1;
}
From source file:com.splout.db.hadoop.SchemaSampler.java
License:Apache License
public static Schema sample(Configuration conf, Path input, InputFormat<ITuple, NullWritable> inputFormat)
        throws IOException, InterruptedException {
    Schema schema = null;

    // sample schema from input path given the provided InputFormat
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    FileInputFormat.setInputPaths(job, input);
    // get first inputSplit
    List<InputSplit> inputSplits = inputFormat.getSplits(job);
    if (inputSplits == null || inputSplits.size() == 0) {
        throw new IOException(
                "Given input format doesn't produce any input split. Can't sample first record. PATH: " + input);
    }
    InputSplit inputSplit = inputSplits.get(0);
    TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
    TaskAttemptContext attemptContext;
    try {
        attemptContext = TaskAttemptContextFactory.get(conf, attemptId);
    } catch (Exception e) {
        throw new IOException(e);
    }

    RecordReader<ITuple, NullWritable> rReader = inputFormat.createRecordReader(inputSplit, attemptContext);
    rReader.initialize(inputSplit, attemptContext);

    if (!rReader.nextKeyValue()) {
        throw new IOException(
                "Can't read first record of first input split of the given path [" + input + "].");
    }

    // finally get the sample schema
    schema = rReader.getCurrentKey().getSchema();
    log.info("Sampled schema from [" + input + "] : " + schema);
    rReader.close();

    return schema;
}
From source file:com.splout.db.hadoop.TupleSampler.java
License:Apache License
public long sample(TablespaceSpec tablespace, Configuration hadoopConf, long sampleSize, Path outFile)
        throws TupleSamplerException {
    // 1 - Determine Input Splits
    // 2 - Launch sampling with the selected method
    // 3 - Recovering results
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat = new HashMap<InputSplit, InputFormat<ITuple, NullWritable>>();
    Map<InputSplit, RecordProcessor> recordProcessorPerSplit = new HashMap<InputSplit, RecordProcessor>();
    Map<InputSplit, Map<String, String>> specificHadoopConfMap = new HashMap<InputSplit, Map<String, String>>();
    Map<InputSplit, TableSpec> splitToTableSpec = new HashMap<InputSplit, TableSpec>();
    Map<InputSplit, JavascriptEngine> splitToJsEngine = new HashMap<InputSplit, JavascriptEngine>();

    try {
        for (Table table : tablespace.getPartitionedTables()) {
            // Initialize JavaScript engine if needed
            JavascriptEngine jsEngine = null;
            TableSpec tableSpec = table.getTableSpec();
            if (tableSpec.getPartitionByJavaScript() != null) {
                try {
                    jsEngine = new JavascriptEngine(tableSpec.getPartitionByJavaScript());
                } catch (Throwable e) {
                    throw new RuntimeException(e);
                }
            }
            for (TableInput tableFile : table.getFiles()) {
                @SuppressWarnings("deprecation")
                Job job = new Job(hadoopConf);
                FileInputFormat.setInputPaths(job, tableFile.getPaths());
                if (options.getMaxInputSplitSize() != null) {
                    logger.info("Using max input split size: " + options.getMaxInputSplitSize());
                    FileInputFormat.setMaxInputSplitSize(job, options.getMaxInputSplitSize());
                }
                job.setInputFormatClass(FileInputFormat.class);
                if (tableFile.getSpecificHadoopInputFormatContext() != null) {
                    for (Map.Entry<String, String> specificHadoopConf : tableFile
                            .getSpecificHadoopInputFormatContext().entrySet()) {
                        job.getConfiguration().set(specificHadoopConf.getKey(), specificHadoopConf.getValue());
                    }
                }
                for (InputSplit split : tableFile.getFormat().getSplits(job)) {
                    if (tableFile.getSpecificHadoopInputFormatContext() != null) {
                        specificHadoopConfMap.put(split, tableFile.getSpecificHadoopInputFormatContext());
                    }
                    splitToFormat.put(split, tableFile.getFormat());
                    recordProcessorPerSplit.put(split, tableFile.getRecordProcessor());
                    splitToTableSpec.put(split, tableSpec);
                    splitToJsEngine.put(split, jsEngine);
                    splits.add(split);
                }
            }
        }

        long retrievedSamples;
        if (samplingType.equals(SamplingType.RANDOM)) {
            try {
                RandomSamplingOptions defOptions = (RandomSamplingOptions) options;
                // Default sampling method
                retrievedSamples = randomSampling(sampleSize, hadoopConf, outFile, splits, splitToTableSpec,
                        splitToFormat, specificHadoopConfMap, recordProcessorPerSplit, splitToJsEngine,
                        defOptions.getMaxSplitsToVisit());
            } catch (ClassCastException ef) {
                throw new RuntimeException("Invalid options class: " + options.getClass() + " Expected:"
                        + RandomSamplingOptions.class);
            }
        } else {
            // Reservoir sampling over full data
            retrievedSamples = fullScanSampling(tablespace, sampleSize, hadoopConf, outFile, splits.size());
        }
        return retrievedSamples;
    } catch (IOException e) {
        throw new TupleSamplerException(e);
    } catch (InterruptedException e) {
        throw new TupleSamplerException(e);
    }
}
From source file:com.talis.hadoop.rdf.collation.QuadsCollater.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    FileOutputFormat.setCompressOutput(job, true);

    job.setInputFormatClass(NQuadsInputFormat.class);
    job.setMapperClass(CollationMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(QuadWritable.class);

    job.setReducerClass(CollationReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : 1;
}