Example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setOutputFormatClass.

Prototype

public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException 

Document

Set the OutputFormat for the job.
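
Before the per-project examples below, here is a minimal, self-contained sketch of the typical call pattern. The class name OutputFormatExample and the input/output paths are illustrative placeholders, not taken from any of the examples; the sketch simply writes the job output as text files via TextOutputFormat.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {

    public static Job createJob(Configuration conf) throws IOException {
        Job job = Job.getInstance(conf, "output-format-example");
        job.setJarByClass(OutputFormatExample.class);

        // With TextInputFormat and the default identity mapper/reducer,
        // keys are byte offsets (LongWritable) and values are the input lines.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Must be called before the job is submitted; afterwards it
        // throws IllegalStateException.
        job.setOutputFormatClass(TextOutputFormat.class);

        // Placeholder paths for illustration only.
        FileInputFormat.setInputPaths(job, new Path("/tmp/outputformat-example/in"));
        FileOutputFormat.setOutputPath(job, new Path("/tmp/outputformat-example/out"));
        return job;
    }
}

As several of the examples below show, jobs that only count rows or write their results elsewhere (for example directly to HBase) typically pass NullOutputFormat.class instead, which discards the job output entirely.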

Usage

From source file:com.rockstor.compact.GenGarbageIndexTool.java

License:Apache License

private Job createSubmittableJob(Configuration conf) throws IOException {
    Job job = new Job(conf, NAME);

    job.setJarByClass(GenGarbageIndexTool.class);
    Scan scan = new Scan();
    TableMapReduceUtil.initTableMapperJob(GarbageChunkDB.TAB_NAME, scan, GarbageChunkMapper.class,
            ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);

    TableMapReduceUtil.setScannerCaching(job, batchSize);
    job.setReducerClass(GarbageChunkReduce.class);
    job.setPartitionerClass(GarbageChunkPartition.class);
    job.setCombinerClass(GarbageChunkCombine.class);

    job.setNumReduceTasks(Compactor.getInstance().getReduceNum());
    job.setOutputFormatClass(NullOutputFormat.class);

    LOG.info("init job " + NAME + " finished!");
    return job;
}

From source file:com.rockstor.compact.RecoveryTool.java

License:Apache License

private Job createSubmittableJob(Configuration conf) throws IOException {
    Job job = new Job(conf, NAME);
    job.setJarByClass(RecoveryTool.class);

    job.setInputFormatClass(CompactDirInputFormat.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setMapOutputKeyClass(NullWritable.class);

    job.setMapperClass(RecoveryMapper.class);

    job.setNumReduceTasks(0);

    job.setOutputFormatClass(NullOutputFormat.class);
    LOG.info("init job " + NAME + " OK!");
    return job;
}

From source file:com.sa.npopa.samples.hbase.myMR.java

License:Apache License

public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    String tableName = args[0];

    Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
    job.setJarByClass(myMR.class);
    Scan scan = new Scan();
    scan.setCacheBlocks(false);
    scan.setBatch(10);

    //scan.setFilter(new FirstKeyOnlyFilter()); //need to find another filter like key only.
    scan.setFilter(new KeyOnlyFilter());

    job.setOutputFormatClass(NullOutputFormat.class);
    TableMapReduceUtil.initTableMapperJob(tableName, scan, RowCounterMapper.class, ImmutableBytesWritable.class,
            Result.class, job);
    job.setNumReduceTasks(0);
    return job;
}

From source file:com.sa.npopa.samples.hbase.RowCounter.java

License:Apache License

/**
 * Sets up the actual job.
 *
 * @param conf  The current configuration.
 * @param args  The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    String tableName = args[0];
    String startKey = null;
    String endKey = null;
    long startTime = 0;
    long endTime = 0;

    StringBuilder sb = new StringBuilder();

    final String rangeSwitch = "--range=";
    final String startTimeArgKey = "--starttime=";
    final String endTimeArgKey = "--endtime=";
    final String expectedCountArg = "--expected-count=";

    // First argument is table name, starting from second
    for (int i = 1; i < args.length; i++) {
        if (args[i].startsWith(rangeSwitch)) {
            String[] startEnd = args[i].substring(rangeSwitch.length()).split(",", 2);
            if (startEnd.length != 2 || startEnd[1].contains(",")) {
                printUsage("Please specify range in such format as \"--range=a,b\" "
                        + "or, with only one boundary, \"--range=,b\" or \"--range=a,\"");
                return null;
            }
            startKey = startEnd[0];
            endKey = startEnd[1];
            continue;
        }
        if (args[i].startsWith(startTimeArgKey)) {
            startTime = Long.parseLong(args[i].substring(startTimeArgKey.length()));
            continue;
        }
        if (args[i].startsWith(endTimeArgKey)) {
            endTime = Long.parseLong(args[i].substring(endTimeArgKey.length()));
            continue;
        }
        if (args[i].startsWith(expectedCountArg)) {
            conf.setLong(EXPECTED_COUNT_KEY, Long.parseLong(args[i].substring(expectedCountArg.length())));
            continue;
        }
        // if no switch, assume column names
        sb.append(args[i]);
        sb.append(" ");
    }
    if (endTime < startTime) {
        printUsage("--endtime=" + endTime + " needs to be greater than --starttime=" + startTime);
        return null;
    }

    Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
    job.setJarByClass(RowCounter.class);
    Scan scan = new Scan();
    scan.setCacheBlocks(false);
    if (startKey != null && !startKey.equals("")) {
        scan.setStartRow(Bytes.toBytes(startKey));
    }
    if (endKey != null && !endKey.equals("")) {
        scan.setStopRow(Bytes.toBytes(endKey));
    }
    if (sb.length() > 0) {
        for (String columnName : sb.toString().trim().split(" ")) {
            String family = StringUtils.substringBefore(columnName, ":");
            String qualifier = StringUtils.substringAfter(columnName, ":");

            if (StringUtils.isBlank(qualifier)) {
                scan.addFamily(Bytes.toBytes(family));
            } else {
                scan.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier));
            }
        }
    }
    scan.setFilter(new FirstKeyOnlyFilter());
    scan.setTimeRange(startTime, endTime == 0 ? HConstants.LATEST_TIMESTAMP : endTime);
    job.setOutputFormatClass(NullOutputFormat.class);
    TableMapReduceUtil.initTableMapperJob(tableName, scan, RowCounterMapper.class, ImmutableBytesWritable.class,
            Result.class, job);
    job.setNumReduceTasks(0);
    return job;
}

From source file:com.savy3.nonequijoin.MapOutputSampler.java

License:Apache License

/**
 * Driver for InputSampler MapReduce Job
 */
public static void runMap(Job job, Path sampleInputPath)
        throws IOException, IllegalStateException, ClassNotFoundException, InterruptedException {
    LOG.info("Running a MapReduce Job on Sample Input File" + sampleInputPath.toString());

    Configuration conf = new Configuration();
    conf.setBoolean("mapreduce.job.ubertask.enable", true);
    conf.set("numSamples", "" + (job.getNumReduceTasks() - 1));
    Job sampleJob = new Job(conf);
    sampleJob.setMapperClass(job.getMapperClass());
    sampleJob.setReducerClass(SampleKeyReducer.class);
    sampleJob.setJarByClass(job.getMapperClass());
    sampleJob.setMapOutputKeyClass(job.getMapOutputKeyClass());
    sampleJob.setMapOutputValueClass(job.getMapOutputValueClass());
    sampleJob.setOutputKeyClass(job.getMapOutputKeyClass());
    sampleJob.setOutputValueClass(NullWritable.class);
    sampleJob.setInputFormatClass(SequenceFileInputFormat.class);
    sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileInputFormat.addInputPath(sampleJob, sampleInputPath);
    FileSystem fs = FileSystem.get(conf);

    Path out = new Path(sampleInputPath.getParent(), "mapOut");
    fs.delete(out, true);

    SequenceFileOutputFormat.setOutputPath(sampleJob, out);

    sampleJob.waitForCompletion(true);

    LOG.info("Sample MapReduce Job Output File" + out.toString());

    Path partFile = new Path(out, "part-r-00000");
    Path tmpFile = new Path("/_tmp");
    fs.delete(tmpFile, true);
    fs.rename(partFile, tmpFile);
    fs.delete(sampleInputPath.getParent(), true);
    fs.rename(new Path("/_tmp"), sampleInputPath.getParent());

    LOG.info("Sample partitioning file cpied to location " + sampleInputPath.getParent().toString());
}

From source file:com.scaleoutsoftware.soss.hserver.examples.NamedMapWordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: wordcount <input map> <output map> <threshold>");
        System.exit(2);
    }

    final int threshold = new Integer(otherArgs[2]);

    NamedMap<IntWritable, Text> inputMap = NamedMapFactory.getMap(otherArgs[0],
            new WritableSerializer<IntWritable>(IntWritable.class), new WritableSerializer<Text>(Text.class));

    NamedMap<Text, IntWritable> outputMap = NamedMapFactory.getMap(otherArgs[1],
            new WritableSerializer<Text>(Text.class), new WritableSerializer<IntWritable>(IntWritable.class));

    //Create the invocation grid
    InvocationGrid grid = HServerJob.getInvocationGridBuilder("WordCountIG").addJar("wordcount.jar").load();

    //Create hServer job
    Job job = new HServerJob(conf, "word count", false, grid);
    job.setJarByClass(NamedMapWordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(NamedMapInputFormat.class);
    job.setOutputFormatClass(GridOutputFormat.class);

    //Set named maps as input and output
    NamedMapInputFormat.setNamedMap(job, inputMap);
    GridOutputFormat.setNamedMap(job, outputMap);

    //Execute job
    job.waitForCompletion(true);

    //Assign invocation grid to the map, so parallel operation can be performed
    outputMap.setInvocationGrid(grid);

    //Run query to find words that are used more than threshold frequency
    Iterable<Text> words = outputMap.executeParallelQuery(new UsageFrequencyCondition(threshold));

    //Unload the invocation grid
    grid.unload();

    //Output resulting words and their frequencies
    System.out.println("Following words were used more than " + threshold + " times:");
    for (Text word : words) {
        System.out.println("\"" + word.toString() + "\" was used " + outputMap.get(word) + " times.");
    }
}

From source file:com.sematext.hbase.hut.UpdatesProcessingMrJob.java

License:Apache License

/**
 * Use this before submitting a TableMap job. It will appropriately set up
 * the job.
 *
 * @param table  The table name.
 * @param scan  The scan with the columns to scan.
 * @param up update processor implementation
 * @param job  The job configuration.
 * @throws java.io.IOException When setting up the job fails.
 */
@SuppressWarnings("unchecked")
public static void initJob(String table, Scan scan, UpdateProcessor up, Job job) throws IOException {
    TableMapReduceUtil.initTableMapperJob(table, scan, UpdatesProcessingMapper.class, null, null, job);
    job.setJarByClass(UpdatesProcessingMrJob.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    job.getConfiguration().set(UpdatesProcessingMapper.HTABLE_NAME_ATTR, table);
    job.getConfiguration().set(UpdatesProcessingMapper.HUT_PROCESSOR_CLASS_ATTR, up.getClass().getName());
    job.getConfiguration().set(UpdatesProcessingMapper.HUT_PROCESSOR_DETAILS_ATTR,
            convertUpdateProcessorToString(up));

    job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false"); // TODO: explain
}

From source file:com.sematext.hbase.wd.RowKeyDistributorTestBase.java

License:Apache License

private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
        int seekIntervalMinValue, int seekIntervalMaxValue)
        throws IOException, InterruptedException, ClassNotFoundException {
    int valuesCountInSeekInterval = writeTestData(origKeyPrefix, numValues, startWithValue,
            seekIntervalMinValue, seekIntervalMaxValue);

    // Reading data
    Configuration conf = testingUtility.getConfiguration();
    Job job = new Job(conf, "testMapReduceInternal()-Job");
    job.setJarByClass(this.getClass());
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // Substituting standard TableInputFormat which was set in TableMapReduceUtil.initTableMapperJob(...)
    job.setInputFormatClass(WdTableInputFormat.class);
    keyDistributor.addInfo(job.getConfiguration());

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue(succeeded);

    long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
    Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);
}

From source file:com.sequenceiq.yarntest.mr.QuasiMonteCarlo.java

License:Apache License

/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static JobID submitPiEstimationMRApp(String jobName, int numMaps, long numPoints, Path tmpDir,
        Configuration conf) throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(conf);
    //setup job conf
    job.setJobName(jobName);
    job.setJarByClass(QuasiMonteCarlo.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(QmcMapper.class);

    job.setReducerClass(QmcReducer.class);
    job.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    job.setSpeculativeExecution(false);

    //setup input/output directories
    final Path inDir = new Path(tmpDir, "in");
    final Path outDir = new Path(tmpDir, "out");
    FileInputFormat.setInputPaths(job, inDir);
    FileOutputFormat.setOutputPath(job, outDir);

    final FileSystem fs = FileSystem.get(conf);
    if (fs.exists(tmpDir)) {
        fs.delete(tmpDir, true);
        //      throw new IOException("Tmp directory " + fs.makeQualified(tmpDir)
        //          + " already exists.  Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    //  try {
    //generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
        final Path file = new Path(inDir, "part" + i);
        final LongWritable offset = new LongWritable(i * numPoints);
        final LongWritable size = new LongWritable(numPoints);
        final SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, LongWritable.class,
                LongWritable.class, CompressionType.NONE);
        try {
            writer.append(offset, size);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i);
    }

    //start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    job.submit();
    //      final double duration = (System.currentTimeMillis() - startTime)/1000.0;
    //      System.out.println("Job Finished in " + duration + " seconds");
    return job.getJobID();

    //    } finally {
    //      fs.delete(tmpDir, true);
    //    }
}

From source file:com.shmsoft.dmass.main.MRFreeEedProcess.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    // inventory dir holds all package (zip) files resulting from stage
    String projectFileName = args[0];
    String outputPath = args[1];
    logger.info("Running Hadoop job");
    logger.info("Input project file = " + projectFileName);
    logger.info("Output path = " + outputPath);

    // Hadoop configuration class
    Configuration configuration = getConf();
    // No speculative execution! Do not process the same file twice
    configuration.set("mapred.reduce.tasks.speculative.execution", "false");
    // TODO even in local mode, the first argument should not be the inventory
    // but write a complete project file instead
    Project project = Project.getProject();
    if (project == null || project.isEmpty()) {
        // configure Hadoop input files
        System.out.println("Reading project file " + projectFileName);
        project = new Project().loadFromFile(new File(projectFileName));
        Project.setProject(project);
    }
    project.setProperty(ParameterProcessing.OUTPUT_DIR_HADOOP, outputPath);
    // send complete project information to all mappers and reducers
    configuration.set(ParameterProcessing.PROJECT, project.toString());

    Settings.load();
    configuration.set(ParameterProcessing.SETTINGS_STR, Settings.getSettings().toString());
    configuration.set(ParameterProcessing.METADATA_FILE,
            Files.toString(new File(ColumnMetadata.metadataNamesFile), Charset.defaultCharset()));
    Job job = new Job(configuration);
    job.setJarByClass(MRFreeEedProcess.class);
    job.setJobName("MRFreeEedProcess");

    // Hadoop processes key-value pairs
    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(MapWritable.class);

    // set map and reduce classes
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    // Hadoop TextInputFormat class
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    //        String delim = "\u0001";
    //        configuration.set("mapred.textoutputformat.separator", delim);
    //        configuration.set("mapreduce.output.textoutputformat.separator", delim);

    logger.debug("project.isEnvHadoop() = {} ", project.isEnvHadoop());
    String inputPath = projectFileName;
    if (project.isEnvHadoop() || Settings.getSettings().isHadoopDebug()) {
        inputPath = formInputPath(project);
    }

    logger.debug("Ready to run, inputPath = {}, outputPath = {}", inputPath, outputPath);
    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    SHMcloudLogging.init(false);

    if (Settings.getSettings().isHadoopDebug()) {
        if (new File(outputPath).exists()) {
            Util.deleteDirectory(new File(outputPath));
        }
    }

    SolrIndex.getInstance().init();

    boolean success = job.waitForCompletion(true);
    if (project.isEnvHadoop() && project.isFsS3()) {
        transferResultsToS3(outputPath);
    }

    SolrIndex.getInstance().destroy();

    return success ? 0 : 1;
}