Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
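
The call must be made before the job is submitted; per the prototype above it throws IllegalStateException once the job is in an illegal (already running) state. Below is a minimal, self-contained sketch, not taken from the usage examples that follow (class and argument names are illustrative), showing the common pattern of reusing a summing Reducer as the combiner. This is valid because the combiner's input and output key/value types both match the map output types.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class CombinerExample {

    // Emits (word, 1) for every token in the input line.
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "combiner example");
        job.setJarByClass(CombinerExample.class);
        job.setMapperClass(TokenizerMapper.class);

        // The combiner pre-aggregates map output locally before the shuffle;
        // Hadoop's bundled IntSumReducer consumes and emits the same
        // (Text, IntWritable) pairs, so it can serve as both combiner and reducer.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that the combiner is only an optimization hint: the framework may run it zero or more times per map task, so the supplied function must be commutative and associative over its inputs.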

Usage

From source file:nl.utwente.trafficanalyzer.SensorCountPerRoadPerDay.java

License:Apache License

public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(SensorCountPerRoadPerDay.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    if (FileSystem.get(conf).exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            FileSystem.get(conf).delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);
    // ---- Reducer
    // set the number of reducers to influence the number of output files
    job.setNumReduceTasks(1);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    job.waitForCompletion(true);
    return;
}

From source file:org.acacia.csr.java.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    /*
    String dir1 = "/user/miyuru/wcout";
     //We first delete the temporary directories if they exist on the HDFS
      FileSystem fs1 = FileSystem.get(new JobConf());
              
     if(fs1.exists(new Path(dir1))){
        fs1.delete(new Path(dir1), true);
     }
            
    JobConf conf = new JobConf();
    conf.setNumMapTasks(96);
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LongWritable.class);
            
    job.setSortComparatorClass(SortComparator.class);
    FileInputFormat.addInputPath(job, new Path("/user/miyuru/input"));
    FileOutputFormat.setOutputPath(job, new Path(dir1));
    job.waitForCompletion(true); 
    */

    String dir3 = "/user/miyuru/wcout";
    String dir5 = "/user/miyuru/input";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs3 = FileSystem.get(new JobConf());

    if (fs3.exists(new Path(dir3))) {
        fs3.delete(new Path(dir3), true);
    }

    JobConf conf3 = new JobConf();
    conf3.setNumMapTasks(96);
    FileInputFormat.addInputPath(conf3, new Path(dir5));
    FileOutputFormat.setOutputPath(conf3, new Path(dir3));
    Job job3 = new Job(conf3, "word count");
    job3.setJarByClass(WordCount.class);
    job3.setMapperClass(TokenizerMapper.class);
    job3.setCombinerClass(IntSumReducer.class);
    job3.setReducerClass(IntSumReducer.class);
    job3.setOutputKeyClass(LongWritable.class);
    job3.setOutputValueClass(LongWritable.class);

    job3.setSortComparatorClass(SortComparator.class);

    job3.waitForCompletion(true);

    PrintWriter writer;
    try {
        writer = new PrintWriter("/tmp/wfile", "UTF-8");
        writer.println("");
        writer.flush();
        writer.close();
    } catch (FileNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (UnsupportedEncodingException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    System.out.println("------Done Word Count---------------");

}

From source file:org.acacia.csr.java.ZeroVertexSearcher.java

License:Apache License

public static void main(String[] args) throws Exception {
    /*
    String dir1 = "/user/miyuru/wcout";
     //We first delete the temporary directories if they exist on the HDFS
      FileSystem fs1 = FileSystem.get(new JobConf());
              
     if(fs1.exists(new Path(dir1))){
        fs1.delete(new Path(dir1), true);
     }
            
    JobConf conf = new JobConf();
    conf.setNumMapTasks(96);
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LongWritable.class);
            
    job.setSortComparatorClass(SortComparator.class);
    FileInputFormat.addInputPath(job, new Path("/user/miyuru/input"));
    FileOutputFormat.setOutputPath(job, new Path(dir1));
    job.waitForCompletion(true); 
    */

    String dir3 = "/user/miyuru/zout";
    String dir5 = "/user/miyuru/input";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs3 = FileSystem.get(new JobConf());

    if (fs3.exists(new Path(dir3))) {
        fs3.delete(new Path(dir3), true);
    }

    JobConf conf3 = new JobConf();
    conf3.setNumMapTasks(96);
    FileInputFormat.addInputPath(conf3, new Path(dir5));
    FileOutputFormat.setOutputPath(conf3, new Path(dir3));
    conf3.set("mapred.map.max.attempts", "0");//If the job fails we assume that it happens because we found zero. Therfore we do not attempt again.
    Job job3 = new Job(conf3, "zero_vertex_search");
    job3.setJarByClass(ZeroVertexSearcher.class);
    job3.setMapperClass(TokenizerMapper.class);
    job3.setCombinerClass(IntSumReducer.class);
    job3.setReducerClass(IntSumReducer.class);
    job3.setOutputKeyClass(LongWritable.class);
    job3.setOutputValueClass(LongWritable.class);
    job3.setNumReduceTasks(0);

    job3.setSortComparatorClass(SortComparator.class);
    try {
        job3.waitForCompletion(true);
    } catch (org.acacia.csr.java.ZeroFoundException ex) {
        System.out.println("Found Zero vertex");
        job3.killJob();
    }
    System.out.println("------Done Zero Vertex search---------------");

}

From source file:org.apache.accumulo.examples.mapreduce.UniqueColumns.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Opts opts = new Opts();
    opts.parseArgs(UniqueColumns.class.getName(), args);

    String jobName = this.getClass().getSimpleName() + "_" + System.currentTimeMillis();

    Job job = Job.getInstance(getConf());
    job.setJobName(jobName);
    job.setJarByClass(this.getClass());

    String clone = opts.getTableName();
    Connector conn = null;

    opts.setAccumuloConfigs(job);

    if (opts.offline) {
        /*
         * this example clones the table and takes it offline. If you plan to run map reduce jobs over a table many times, it may be more efficient to compact the
         * table, clone it, and then keep using the same clone as input for map reduce.
         */

        conn = opts.getConnector();
        clone = opts.getTableName() + "_" + jobName;
        conn.tableOperations().clone(opts.getTableName(), clone, true, new HashMap<String, String>(),
                new HashSet<String>());
        conn.tableOperations().offline(clone);

        AccumuloInputFormat.setOfflineTableScan(job, true);
        AccumuloInputFormat.setInputTableName(job, clone);
    }

    job.setInputFormatClass(AccumuloInputFormat.class);

    job.setMapperClass(UMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(UReducer.class);
    job.setReducerClass(UReducer.class);

    job.setNumReduceTasks(opts.reducers);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(opts.output));

    job.waitForCompletion(true);

    if (opts.offline) {
        conn.tableOperations().delete(clone);
    }

    return job.isSuccessful() ? 0 : 1;
}

From source file:org.apache.accumulo.examples.simple.mapreduce.UniqueColumns.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Opts opts = new Opts();
    opts.parseArgs(UniqueColumns.class.getName(), args);

    String jobName = this.getClass().getSimpleName() + "_" + System.currentTimeMillis();

    Job job = JobUtil.getJob(getConf());
    job.setJobName(jobName);
    job.setJarByClass(this.getClass());

    String clone = opts.getTableName();
    Connector conn = null;

    opts.setAccumuloConfigs(job);

    if (opts.offline) {
        /*
         * this example clones the table and takes it offline. If you plan to run map reduce jobs over a table many times, it may be more efficient to compact the
         * table, clone it, and then keep using the same clone as input for map reduce.
         */

        conn = opts.getConnector();
        clone = opts.getTableName() + "_" + jobName;
        conn.tableOperations().clone(opts.getTableName(), clone, true, new HashMap<String, String>(),
                new HashSet<String>());
        conn.tableOperations().offline(clone);

        AccumuloInputFormat.setOfflineTableScan(job, true);
        AccumuloInputFormat.setInputTableName(job, clone);
    }

    job.setInputFormatClass(AccumuloInputFormat.class);

    job.setMapperClass(UMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(UReducer.class);
    job.setReducerClass(UReducer.class);

    job.setNumReduceTasks(opts.reducers);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(opts.output));

    job.waitForCompletion(true);

    if (opts.offline) {
        conn.tableOperations().delete(clone);
    }

    return job.isSuccessful() ? 0 : 1;
}

From source file:org.apache.airavata.gfac.hadoop.provider.impl.HadoopProvider.java

License:Apache License

public void execute(JobExecutionContext jobExecutionContext) throws GFacProviderException {
    HadoopApplicationDeploymentDescriptionType hadoopAppDesc = (HadoopApplicationDeploymentDescriptionType) jobExecutionContext
            .getApplicationContext().getApplicationDeploymentDescription().getType();
    MessageContext inMessageContext = jobExecutionContext.getInMessageContext();
    HadoopApplicationDeploymentDescriptionType.HadoopJobConfiguration jobConf = hadoopAppDesc
            .getHadoopJobConfiguration();

    try {
        // Preparing Hadoop configuration
        Configuration hadoopConf = HadoopUtils.createHadoopConfiguration(jobExecutionContext,
                isWhirrBasedDeployment, hadoopConfigDir);

        // Load jar containing map-reduce job implementation
        ArrayList<URL> mapRedJars = new ArrayList<URL>();
        mapRedJars.add(new File(jobConf.getJarLocation()).toURL());
        URLClassLoader childClassLoader = new URLClassLoader(mapRedJars.toArray(new URL[mapRedJars.size()]),
                this.getClass().getClassLoader());

        Job job = new Job(hadoopConf);

        job.setJobName(jobConf.getJobName());

        job.setOutputKeyClass(Class.forName(jobConf.getOutputKeyClass(), true, childClassLoader));
        job.setOutputValueClass(Class.forName(jobConf.getOutputValueClass(), true, childClassLoader));

        job.setMapperClass(
                (Class<? extends Mapper>) Class.forName(jobConf.getMapperClass(), true, childClassLoader));
        job.setCombinerClass(
                (Class<? extends Reducer>) Class.forName(jobConf.getCombinerClass(), true, childClassLoader));
        job.setReducerClass(
                (Class<? extends Reducer>) Class.forName(jobConf.getCombinerClass(), true, childClassLoader));

        job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(jobConf.getInputFormatClass(),
                true, childClassLoader));
        job.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(jobConf.getOutputFormatClass(),
                true, childClassLoader));

        FileInputFormat.setInputPaths(job, new Path(hadoopAppDesc.getInputDataDirectory()));
        FileOutputFormat.setOutputPath(job, new Path(hadoopAppDesc.getOutputDataDirectory()));

        job.waitForCompletion(true);
        System.out.println(job.getTrackingURL());
        if (jobExecutionContext.getOutMessageContext() == null) {
            jobExecutionContext.setOutMessageContext(new MessageContext());
        }

        OutputParameterType[] outputParametersArray = jobExecutionContext.getApplicationContext()
                .getServiceDescription().getType().getOutputParametersArray();
        for (OutputParameterType outparamType : outputParametersArray) {
            String paramName = outparamType.getParameterName();
            if (paramName.equals("test-hadoop")) {
                ActualParameter outParam = new ActualParameter();
                outParam.getType().changeType(StringParameterType.type);
                ((StringParameterType) outParam.getType()).setValue(job.getTrackingURL());
                jobExecutionContext.getOutMessageContext().addParameter("test-hadoop", outParam);
            }
        }
    } catch (Exception e) {
        String errMessage = "Error occurred during Map-Reduce job execution.";
        logger.error(errMessage, e);
        throw new GFacProviderException(errMessage, e);
    }
}

From source file:org.apache.cassandra.example.hadoop.WordCount.java

License:Apache License

public int run(String[] args) throws Exception {
    String outputReducerType = "filesystem";
    if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR)) {
        String[] s = args[0].split("=");
        if (s != null && s.length == 2)
            outputReducerType = s[1];
    }
    logger.info("output reducer type: " + outputReducerType);

    for (int i = 0; i < WordCountSetup.TEST_COUNT; i++) {
        String columnName = "text" + i;
        getConf().set(CONF_COLUMN_NAME, columnName);

        Job job = new Job(getConf(), "wordcount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);

        if (outputReducerType.equalsIgnoreCase("filesystem")) {
            job.setCombinerClass(ReducerToFilesystem.class);
            job.setReducerClass(ReducerToFilesystem.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i));
        } else {
            job.setReducerClass(ReducerToCassandra.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(ByteBuffer.class);
            job.setOutputValueClass(List.class);

            job.setOutputFormatClass(ColumnFamilyOutputFormat.class);

            ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
        }

        job.setInputFormatClass(ColumnFamilyInputFormat.class);

        ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
        ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost");
        ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
        SlicePredicate predicate = new SlicePredicate()
                .setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
        ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

        job.waitForCompletion(true);
    }
    return 0;
}

From source file:org.apache.crunch.impl.mr.plan.JobPrototype.java

License:Apache License

private CrunchControlledJob build(Class<?> jarClass, Configuration conf, Pipeline pipeline, int numOfJobs)
        throws IOException {
    Job job = new Job(conf);
    conf = job.getConfiguration();
    conf.set(PlanningParameters.CRUNCH_WORKING_DIRECTORY, workingPath.toString());
    job.setJarByClass(jarClass);

    Set<DoNode> outputNodes = Sets.newHashSet();
    Path outputPath = new Path(workingPath, "output");
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, outputPath, group == null);
    for (Target target : targetsToNodePaths.keySet()) {
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PType<?> ptype = nodePath.tail().getPType();
                node = DoNode.createOutputNode(target.toString(), target.getConverter(ptype), ptype);
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }

    Set<DoNode> mapSideNodes = Sets.newHashSet();
    if (mapSideNodePaths != null) {
        for (Target target : mapSideNodePaths.keySet()) {
            DoNode node = null;
            for (NodePath nodePath : mapSideNodePaths.get(target)) {
                if (node == null) {
                    PType<?> ptype = nodePath.tail().getPType();
                    node = DoNode.createOutputNode(target.toString(), target.getConverter(ptype), ptype);
                    outputHandler.configureNode(node, target);
                }
                mapSideNodes.add(walkPath(nodePath.descendingIterator(), node));
            }

        }
    }

    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    if (group != null) {
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        serialize(reduceNodes, conf, workingPath, NodeContext.REDUCE);
        reduceNode = reduceNodes.get(0);

        if (combineFnTable != null) {
            job.setCombinerClass(CrunchCombiner.class);
            DoNode combinerInputNode = group.createDoNode();
            DoNode combineNode = combineFnTable.createCombineNode();
            combineNode.addChild(group.getGroupingNode());
            combinerInputNode.addChild(combineNode);
            serialize(ImmutableList.of(combinerInputNode), conf, workingPath, NodeContext.COMBINE);
        }

        group.configureShuffle(job);

        DoNode mapOutputNode = group.getGroupingNode();
        Set<DoNode> mapNodes = Sets.newHashSet(mapSideNodes);
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl<?>> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
    } else { // No grouping
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
    }
    serialize(inputNodes, conf, workingPath, NodeContext.MAP);

    if (inputNodes.size() == 1) {
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        for (int i = 0; i < inputNodes.size(); i++) {
            inputNodes.get(i).getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(conf, pipeline.getName(), inputNodes, reduceNode, numOfJobs));

    return new CrunchControlledJob(jobID, job, new CrunchJobHooks.PrepareHook(job),
            new CrunchJobHooks.CompletionHook(job, outputPath, outputHandler.getMultiPaths(), group == null));
}

From source file:org.apache.druid.indexer.DeterminePartitionsJob.java

License:Apache License

@Override
public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */

        if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
            throw new ISE(
                    "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
                    config.getPartitionsSpec());
        }

        final SingleDimensionPartitionsSpec partitionsSpec = (SingleDimensionPartitionsSpec) config
                .getPartitionsSpec();

        if (!partitionsSpec.isAssumeGrouped()) {
            groupByJob = Job.getInstance(new Configuration(), StringUtils.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            JobHelper.injectSystemProperties(groupByJob);
            config.addJobProperties(groupByJob);

            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                    JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            // Store the jobId in the file
            if (groupByJob.getJobID() != null) {
                JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), groupByJob.getJobID().toString());
            }

            try {
                if (!groupByJob.waitForCompletion(true)) {
                    log.error("Job failed: %s", groupByJob.getJobID());
                    failureCause = Utils.getFailureMessage(groupByJob, config.JSON_MAPPER);
                    return false;
                }
            } catch (IOException ioe) {
                if (!Utils.checkAppSuccessForJobIOException(ioe, groupByJob,
                        config.isUseYarnRMJobStatusFallback())) {
                    throw ioe;
                }
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = Job.getInstance(new Configuration(), StringUtils.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        JobHelper.injectSystemProperties(dimSelectionJob);
        config.addJobProperties(dimSelectionJob);

        if (!partitionsSpec.isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob,
                DeterminePartitionsDimSelectionPartitioner.class);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()),
                JobHelper.distributedClassPath(config.makeIntermediatePath()), dimSelectionJob);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        // Store the jobId in the file
        if (dimSelectionJob.getJobID() != null) {
            JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), dimSelectionJob.getJobID().toString());
        }

        try {
            if (!dimSelectionJob.waitForCompletion(true)) {
                log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
                failureCause = Utils.getFailureMessage(dimSelectionJob, config.JSON_MAPPER);
                return false;
            }
        } catch (IOException ioe) {
            if (!Utils.checkAppSuccessForJobIOException(ioe, dimSelectionJob,
                    config.isUseYarnRMJobStatusFallback())) {
                throw ioe;
            }
        }

        /*
         * Load partitions determined by the previous job.
         */

        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
                List<ShardSpec> specs = config.JSON_MAPPER.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i,
                            actualSpecs.get(i));
                }

                shardSpecs.put(segmentGranularity.getStartMillis(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:org.apache.hadoop.examples.DBCountPageView.java

License:Apache License

@Override
//Usage DBCountPageView [driverClass dburl]
public int run(String[] args) throws Exception {

    String driverClassName = DRIVER_CLASS;
    String url = DB_URL;

    if (args.length > 1) {
        driverClassName = args[0];
        url = args[1];
    }

    initialize(driverClassName, url);
    Configuration conf = getConf();

    DBConfiguration.configureDB(conf, driverClassName, url);

    Job job = Job.getInstance(conf);

    job.setJobName("Count Pageviews of URLs");
    job.setJarByClass(DBCountPageView.class);
    job.setMapperClass(PageviewMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(PageviewReducer.class);

    DBInputFormat.setInput(job, AccessRecord.class, "HAccess", null, "url", AccessFieldNames);

    DBOutputFormat.setOutput(job, "Pageview", PageviewFieldNames);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(PageviewRecord.class);
    job.setOutputValueClass(NullWritable.class);
    int ret;
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
        boolean correct = verify();
        if (!correct) {
            throw new RuntimeException("Evaluation was not correct!");
        }
    } finally {
        shutdown();
    }
    return ret;
}