Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
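
The call must be made before the job is submitted; per the prototype above it throws IllegalStateException once the job is in an illegal (already running) state. Below is a minimal, self-contained sketch, not taken from the usage examples that follow (class and argument names are illustrative), showing the common pattern of reusing a summing Reducer as the combiner. This is valid because the combiner's input and output key/value types both match the map output types.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class CombinerExample {

    // Emits (word, 1) for every token in the input line.
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "combiner example");
        job.setJarByClass(CombinerExample.class);
        job.setMapperClass(TokenizerMapper.class);

        // The combiner pre-aggregates map output locally before the shuffle;
        // Hadoop's bundled IntSumReducer consumes and emits the same
        // (Text, IntWritable) pairs, so it can serve as both combiner and reducer.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that the combiner is only an optimization hint: the framework may run it zero or more times per map task, so the supplied function must be commutative and associative over its inputs.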

Usage

From source file:nl.utwente.trafficanalyzer.SensorCountPerRoadPerDay.java

License:Apache License

public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(SensorCountPerRoadPerDay.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    if (FileSystem.get(conf).exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            FileSystem.get(conf).delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);
    // ---- Reducer
    // set the number of reducers to influence the number of output files
    job.setNumReduceTasks(1);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    job.waitForCompletion(true);
    return;
}

From source file:org.acacia.csr.java.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    /*
    String dir1 = "/user/miyuru/wcout";
     //We first delete the temporary directories if they exist on the HDFS
      FileSystem fs1 = FileSystem.get(new JobConf());
              
     if(fs1.exists(new Path(dir1))){
        fs1.delete(new Path(dir1), true);
     }
            
    JobConf conf = new JobConf();
    conf.setNumMapTasks(96);
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LongWritable.class);
            
    job.setSortComparatorClass(SortComparator.class);
    FileInputFormat.addInputPath(job, new Path("/user/miyuru/input"));
    FileOutputFormat.setOutputPath(job, new Path(dir1));
    job.waitForCompletion(true); 
    */

    String dir3 = "/user/miyuru/wcout";
    String dir5 = "/user/miyuru/input";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs3 = FileSystem.get(new JobConf());

    if (fs3.exists(new Path(dir3))) {
        fs3.delete(new Path(dir3), true);
    }

    JobConf conf3 = new JobConf();
    conf3.setNumMapTasks(96);
    FileInputFormat.addInputPath(conf3, new Path(dir5));
    FileOutputFormat.setOutputPath(conf3, new Path(dir3));
    Job job3 = new Job(conf3, "word count");
    job3.setJarByClass(WordCount.class);
    job3.setMapperClass(TokenizerMapper.class);
    job3.setCombinerClass(IntSumReducer.class);
    job3.setReducerClass(IntSumReducer.class);
    job3.setOutputKeyClass(LongWritable.class);
    job3.setOutputValueClass(LongWritable.class);

    job3.setSortComparatorClass(SortComparator.class);

    job3.waitForCompletion(true);

    PrintWriter writer;
    try {
        writer = new PrintWriter("/tmp/wfile", "UTF-8");
        writer.println("");
        writer.flush();
        writer.close();
    } catch (FileNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (UnsupportedEncodingException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    System.out.println("------Done Word Count---------------");

}

From source file:org.acacia.csr.java.ZeroVertexSearcher.java

License:Apache License

public static void main(String[] args) throws Exception {
    /*
    String dir1 = "/user/miyuru/wcout";
     //We first delete the temporary directories if they exist on the HDFS
      FileSystem fs1 = FileSystem.get(new JobConf());
              
     if(fs1.exists(new Path(dir1))){
        fs1.delete(new Path(dir1), true);
     }
            
    JobConf conf = new JobConf();
    conf.setNumMapTasks(96);
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LongWritable.class);
            
    job.setSortComparatorClass(SortComparator.class);
    FileInputFormat.addInputPath(job, new Path("/user/miyuru/input"));
    FileOutputFormat.setOutputPath(job, new Path(dir1));
    job.waitForCompletion(true); 
    */

    String dir3 = "/user/miyuru/zout";
    String dir5 = "/user/miyuru/input";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs3 = FileSystem.get(new JobConf());

    if (fs3.exists(new Path(dir3))) {
        fs3.delete(new Path(dir3), true);
    }

    JobConf conf3 = new JobConf();
    conf3.setNumMapTasks(96);
    FileInputFormat.addInputPath(conf3, new Path(dir5));
    FileOutputFormat.setOutputPath(conf3, new Path(dir3));
    conf3.set("mapred.map.max.attempts", "0");//If the job fails we assume that it happens because we found zero. Therfore we do not attempt again.
    Job job3 = new Job(conf3, "zero_vertex_search");
    job3.setJarByClass(ZeroVertexSearcher.class);
    job3.setMapperClass(TokenizerMapper.class);
    job3.setCombinerClass(IntSumReducer.class);
    job3.setReducerClass(IntSumReducer.class);
    job3.setOutputKeyClass(LongWritable.class);
    job3.setOutputValueClass(LongWritable.class);
    job3.setNumReduceTasks(0);

    job3.setSortComparatorClass(SortComparator.class);
    try {
        job3.waitForCompletion(true);
    } catch (org.acacia.csr.java.ZeroFoundException ex) {
        System.out.println("Found Zero vertex");
        job3.killJob();
    }
    System.out.println("------Done Zero Vertex search---------------");

}

From source file:org.apache.accumulo.examples.mapreduce.UniqueColumns.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Opts opts = new Opts();
    opts.parseArgs(UniqueColumns.class.getName(), args);

    String jobName = this.getClass().getSimpleName() + "_" + System.currentTimeMillis();

    Job job = Job.getInstance(getConf());
    job.setJobName(jobName);
    job.setJarByClass(this.getClass());

    String clone = opts.getTableName();
    Connector conn = null;

    opts.setAccumuloConfigs(job);

    if (opts.offline) {
        /*
         * this example clones the table and takes it offline. If you plan to run map reduce jobs over a table many times, it may be more efficient to compact the
         * table, clone it, and then keep using the same clone as input for map reduce.
         */

        conn = opts.getConnector();
        clone = opts.getTableName() + "_" + jobName;
        conn.tableOperations().clone(opts.getTableName(), clone, true, new HashMap<String, String>(),
                new HashSet<String>());
        conn.tableOperations().offline(clone);

        AccumuloInputFormat.setOfflineTableScan(job, true);
        AccumuloInputFormat.setInputTableName(job, clone);
    }

    job.setInputFormatClass(AccumuloInputFormat.class);

    job.setMapperClass(UMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(UReducer.class);
    job.setReducerClass(UReducer.class);

    job.setNumReduceTasks(opts.reducers);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(opts.output));

    job.waitForCompletion(true);

    if (opts.offline) {
        conn.tableOperations().delete(clone);
    }

    return job.isSuccessful() ? 0 : 1;
}

From source file:org.apache.accumulo.examples.simple.mapreduce.UniqueColumns.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Opts opts = new Opts();
    opts.parseArgs(UniqueColumns.class.getName(), args);

    String jobName = this.getClass().getSimpleName() + "_" + System.currentTimeMillis();

    Job job = JobUtil.getJob(getConf());
    job.setJobName(jobName);
    job.setJarByClass(this.getClass());

    String clone = opts.getTableName();
    Connector conn = null;

    opts.setAccumuloConfigs(job);

    if (opts.offline) {
        /*
         * this example clones the table and takes it offline. If you plan to run map reduce jobs over a table many times, it may be more efficient to compact the
         * table, clone it, and then keep using the same clone as input for map reduce.
         */

        conn = opts.getConnector();
        clone = opts.getTableName() + "_" + jobName;
        conn.tableOperations().clone(opts.getTableName(), clone, true, new HashMap<String, String>(),
                new HashSet<String>());
        conn.tableOperations().offline(clone);

        AccumuloInputFormat.setOfflineTableScan(job, true);
        AccumuloInputFormat.setInputTableName(job, clone);
    }

    job.setInputFormatClass(AccumuloInputFormat.class);

    job.setMapperClass(UMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(UReducer.class);
    job.setReducerClass(UReducer.class);

    job.setNumReduceTasks(opts.reducers);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(opts.output));

    job.waitForCompletion(true);

    if (opts.offline) {
        conn.tableOperations().delete(clone);
    }

    return job.isSuccessful() ? 0 : 1;
}

From source file:org.apache.airavata.gfac.hadoop.provider.impl.HadoopProvider.java

License:Apache License

public void execute(JobExecutionContext jobExecutionContext) throws GFacProviderException {
    HadoopApplicationDeploymentDescriptionType hadoopAppDesc = (HadoopApplicationDeploymentDescriptionType) jobExecutionContext
            .getApplicationContext().getApplicationDeploymentDescription().getType();
    MessageContext inMessageContext = jobExecutionContext.getInMessageContext();
    HadoopApplicationDeploymentDescriptionType.HadoopJobConfiguration jobConf = hadoopAppDesc
            .getHadoopJobConfiguration();

    try {
        // Preparing Hadoop configuration
        Configuration hadoopConf = HadoopUtils.createHadoopConfiguration(jobExecutionContext,
                isWhirrBasedDeployment, hadoopConfigDir);

        // Load jar containing map-reduce job implementation
        ArrayList<URL> mapRedJars = new ArrayList<URL>();
        mapRedJars.add(new File(jobConf.getJarLocation()).toURL());
        URLClassLoader childClassLoader = new URLClassLoader(mapRedJars.toArray(new URL[mapRedJars.size()]),
                this.getClass().getClassLoader());

        Job job = new Job(hadoopConf);

        job.setJobName(jobConf.getJobName());

        job.setOutputKeyClass(Class.forName(jobConf.getOutputKeyClass(), true, childClassLoader));
        job.setOutputValueClass(Class.forName(jobConf.getOutputValueClass(), true, childClassLoader));

        job.setMapperClass(
                (Class<? extends Mapper>) Class.forName(jobConf.getMapperClass(), true, childClassLoader));
        job.setCombinerClass(
                (Class<? extends Reducer>) Class.forName(jobConf.getCombinerClass(), true, childClassLoader));
        job.setReducerClass(
                (Class<? extends Reducer>) Class.forName(jobConf.getCombinerClass(), true, childClassLoader));

        job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(jobConf.getInputFormatClass(),
                true, childClassLoader));
        job.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(jobConf.getOutputFormatClass(),
                true, childClassLoader));

        FileInputFormat.setInputPaths(job, new Path(hadoopAppDesc.getInputDataDirectory()));
        FileOutputFormat.setOutputPath(job, new Path(hadoopAppDesc.getOutputDataDirectory()));

        job.waitForCompletion(true);
        System.out.println(job.getTrackingURL());
        if (jobExecutionContext.getOutMessageContext() == null) {
            jobExecutionContext.setOutMessageContext(new MessageContext());
        }

        OutputParameterType[] outputParametersArray = jobExecutionContext.getApplicationContext()
                .getServiceDescription().getType().getOutputParametersArray();
        for (OutputParameterType outparamType : outputParametersArray) {
            String paramName = outparamType.getParameterName();
            if (paramName.equals("test-hadoop")) {
                ActualParameter outParam = new ActualParameter();
                outParam.getType().changeType(StringParameterType.type);
                ((StringParameterType) outParam.getType()).setValue(job.getTrackingURL());
                jobExecutionContext.getOutMessageContext().addParameter("test-hadoop", outParam);
            }
        }
    } catch (Exception e) {
        String errMessage = "Error occurred during Map-Reduce job execution.";
        logger.error(errMessage, e);
        throw new GFacProviderException(errMessage, e);
    }
}

From source file:org.apache.cassandra.example.hadoop.WordCount.java

License:Apache License

public int run(String[] args) throws Exception {
    String outputReducerType = "filesystem";
    if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR)) {
        String[] s = args[0].split("=");
        if (s != null && s.length == 2)
            outputReducerType = s[1];
    }
    logger.info("output reducer type: " + outputReducerType);

    for (int i = 0; i < WordCountSetup.TEST_COUNT; i++) {
        String columnName = "text" + i;
        getConf().set(CONF_COLUMN_NAME, columnName);

        Job job = new Job(getConf(), "wordcount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);

        if (outputReducerType.equalsIgnoreCase("filesystem")) {
            job.setCombinerClass(ReducerToFilesystem.class);
            job.setReducerClass(ReducerToFilesystem.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i));
        } else {
            job.setReducerClass(ReducerToCassandra.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(ByteBuffer.class);
            job.setOutputValueClass(List.class);

            job.setOutputFormatClass(ColumnFamilyOutputFormat.class);

            ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
        }

        job.setInputFormatClass(ColumnFamilyInputFormat.class);

        ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
        ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost");
        ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
        SlicePredicate predicate = new SlicePredicate()
                .setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
        ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

        job.waitForCompletion(true);
    }
    return 0;
}

From source file:org.apache.crunch.impl.mr.plan.JobPrototype.java

License:Apache License

private CrunchControlledJob build(Class<?> jarClass, Configuration conf, Pipeline pipeline, int numOfJobs)
        throws IOException {
    Job job = new Job(conf);
    conf = job.getConfiguration();
    conf.set(PlanningParameters.CRUNCH_WORKING_DIRECTORY, workingPath.toString());
    job.setJarByClass(jarClass);

    Set<DoNode> outputNodes = Sets.newHashSet();
    Path outputPath = new Path(workingPath, "output");
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, outputPath, group == null);
    for (Target target : targetsToNodePaths.keySet()) {
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PType<?> ptype = nodePath.tail().getPType();
                node = DoNode.createOutputNode(target.toString(), target.getConverter(ptype), ptype);
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }

    Set<DoNode> mapSideNodes = Sets.newHashSet();
    if (mapSideNodePaths != null) {
        for (Target target : mapSideNodePaths.keySet()) {
            DoNode node = null;
            for (NodePath nodePath : mapSideNodePaths.get(target)) {
                if (node == null) {
                    PType<?> ptype = nodePath.tail().getPType();
                    node = DoNode.createOutputNode(target.toString(), target.getConverter(ptype), ptype);
                    outputHandler.configureNode(node, target);
                }
                mapSideNodes.add(walkPath(nodePath.descendingIterator(), node));
            }

        }
    }

    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    if (group != null) {
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        serialize(reduceNodes, conf, workingPath, NodeContext.REDUCE);
        reduceNode = reduceNodes.get(0);

        if (combineFnTable != null) {
            job.setCombinerClass(CrunchCombiner.class);
            DoNode combinerInputNode = group.createDoNode();
            DoNode combineNode = combineFnTable.createCombineNode();
            combineNode.addChild(group.getGroupingNode());
            combinerInputNode.addChild(combineNode);
            serialize(ImmutableList.of(combinerInputNode), conf, workingPath, NodeContext.COMBINE);
        }

        group.configureShuffle(job);

        DoNode mapOutputNode = group.getGroupingNode();
        Set<DoNode> mapNodes = Sets.newHashSet(mapSideNodes);
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl<?>> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
    } else { // No grouping
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
    }
    serialize(inputNodes, conf, workingPath, NodeContext.MAP);

    if (inputNodes.size() == 1) {
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        for (int i = 0; i < inputNodes.size(); i++) {
            inputNodes.get(i).getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(conf, pipeline.getName(), inputNodes, reduceNode, numOfJobs));

    return new CrunchControlledJob(jobID, job, new CrunchJobHooks.PrepareHook(job),
            new CrunchJobHooks.CompletionHook(job, outputPath, outputHandler.getMultiPaths(), group == null));
}

From source file:org.apache.druid.indexer.DeterminePartitionsJob.java

License:Apache License

@Override
public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */

        if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
            throw new ISE(
                    "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
                    config.getPartitionsSpec());
        }

        final SingleDimensionPartitionsSpec partitionsSpec = (SingleDimensionPartitionsSpec) config
                .getPartitionsSpec();

        if (!partitionsSpec.isAssumeGrouped()) {
            groupByJob = Job.getInstance(new Configuration(), StringUtils.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            JobHelper.injectSystemProperties(groupByJob);
            config.addJobProperties(groupByJob);

            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                    JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            // Store the jobId in the file
            if (groupByJob.getJobID() != null) {
                JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), groupByJob.getJobID().toString());
            }

            try {
                if (!groupByJob.waitForCompletion(true)) {
                    log.error("Job failed: %s", groupByJob.getJobID());
                    failureCause = Utils.getFailureMessage(groupByJob, config.JSON_MAPPER);
                    return false;
                }
            } catch (IOException ioe) {
                if (!Utils.checkAppSuccessForJobIOException(ioe, groupByJob,
                        config.isUseYarnRMJobStatusFallback())) {
                    throw ioe;
                }
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = Job.getInstance(new Configuration(), StringUtils.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        JobHelper.injectSystemProperties(dimSelectionJob);
        config.addJobProperties(dimSelectionJob);

        if (!partitionsSpec.isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob,
                DeterminePartitionsDimSelectionPartitioner.class);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), dimSelectionJob);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        // Store the jobId in the file
        if (dimSelectionJob.getJobID() != null) {
            JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), dimSelectionJob.getJobID().toString());
        }

        try {
            if (!dimSelectionJob.waitForCompletion(true)) {
                log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
                failureCause = Utils.getFailureMessage(dimSelectionJob, config.JSON_MAPPER);
                return false;
            }
        } catch (IOException ioe) {
            if (!Utils.checkAppSuccessForJobIOException(ioe, dimSelectionJob,
                    config.isUseYarnRMJobStatusFallback())) {
                throw ioe;
            }
        }

        /*
         * Load partitions determined by the previous job.
         */

        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
                List<ShardSpec> specs = config.JSON_MAPPER.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i,
                            actualSpecs.get(i));
                }

                shardSpecs.put(segmentGranularity.getStartMillis(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:org.apache.hadoop.examples.DBCountPageView.java

License:Apache License

@Override
//Usage DBCountPageView [driverClass dburl]
public int run(String[] args) throws Exception {

    String driverClassName = DRIVER_CLASS;
    String url = DB_URL;

    if (args.length > 1) {
        driverClassName = args[0];
        url = args[1];
    }

    initialize(driverClassName, url);
    Configuration conf = getConf();

    DBConfiguration.configureDB(conf, driverClassName, url);

    Job job = Job.getInstance(conf);

    job.setJobName("Count Pageviews of URLs");
    job.setJarByClass(DBCountPageView.class);
    job.setMapperClass(PageviewMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(PageviewReducer.class);

    DBInputFormat.setInput(job, AccessRecord.class, "HAccess", null, "url", AccessFieldNames);

    DBOutputFormat.setOutput(job, "Pageview", PageviewFieldNames);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(PageviewRecord.class);
    job.setOutputValueClass(NullWritable.class);
    int ret;
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
        boolean correct = verify();
        if (!correct) {
            throw new RuntimeException("Evaluation was not correct!");
        }
    } finally {
        shutdown();
    }
    return ret;
}