Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
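
Before the longer, real-world examples below, here is a minimal sketch of how setPartitionerClass is typically wired together with setNumReduceTasks: the partitioner decides which reduce task receives each map output key. The WordLengthPartitioner class and the surrounding word-count wiring are illustrative assumptions, not code from any of the projects quoted below. Note that the call throws IllegalStateException if the job is in a state that no longer allows changing the partitioner (for example, after it has been submitted).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class PartitionerExample {

    // Illustrative partitioner: routes each word to a reducer based on its length,
    // so words of similar length land in the same output partition.
    public static class WordLengthPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            // getLength() is the byte length of the UTF-8 encoded key; it is never negative.
            return key.getLength() % numPartitions;
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "partitioner example");

        job.setJarByClass(PartitionerExample.class);

        // Stock word-count mapper/reducer from the Hadoop libraries keep the sketch small.
        job.setMapperClass(TokenCounterMapper.class);
        job.setReducerClass(IntSumReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // A custom partitioner only has an effect with more than one reduce task.
        job.setNumReduceTasks(4);
        job.setPartitionerClass(WordLengthPartitioner.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}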

Usage

From source file:org.mrgeo.data.accumulo.output.image.AccumuloMrsImagePyramidOutputFormatProvider.java

License:Apache License

@Override
public void setupJob(final Job job) throws DataProviderException {
    try {
        //TODO: there is an assumption here that the output is going to accumulo directly - not bulk
        super.setupJob(job);

        job.getConfiguration().addResource(AccumuloConnector.getAccumuloPropertiesLocation());

        // zoom level - output zoom level
        zoomLevel = context.getZoomlevel();
        //      zoomLevel = job.getConfiguration().getInt("zoomlevel", 0);
        if (zoomLevel != 0) {
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOMLEVEL,
                    Integer.toString(zoomLevel));
        }

        //job.getConfiguration().set("zoomLevel", Integer.toString(zoomLevel));
        if (doBulk) {
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_JOBTYPE,
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_BULK);
            job.getConfiguration().set(
                    MrGeoAccumuloConstants.MRGEO_ACC_KEY_PREFIX + Integer.toString(zoomLevel),
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_BULK);
        } else {
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_JOBTYPE,
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_DIRECT);
            job.getConfiguration().set(
                    MrGeoAccumuloConstants.MRGEO_ACC_KEY_PREFIX + Integer.toString(zoomLevel),
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_DIRECT);

        }
        Properties props = AccumuloConnector.getAccumuloProperties();
        if (props != null) {

            // this used to be the variable "name" in TiledOutputFormatContext, but was always "".
            String enc = AccumuloConnector.encodeAccumuloProperties("");
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_RESOURCE, enc);

            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE,
                    props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE));
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS,
                    props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS));

            if (props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE) == null) {
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE, this.table);
            } else {
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE,
                        props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE));
            }

            // username and password
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER,
                    props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER));

            // make sure the password is set with Base64Encoding
            String pw = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD);
            String isEnc = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64, "false");

            if (isEnc.equalsIgnoreCase("true")) {
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD,
                        props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD));
            } else {
                byte[] p = Base64.encodeBase64(
                        props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD).getBytes());

                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD, new String(p));
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64,
                        new String("true"));
            }

            if (job.getConfiguration().get(MrGeoConstants.MRGEO_PROTECTION_LEVEL) != null) {
                cv = new ColumnVisibility(job.getConfiguration().get(MrGeoConstants.MRGEO_PROTECTION_LEVEL));
            }
            if (cv == null) {

                if (props.containsKey(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ)) {

                    job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ,
                            props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ));

                    cv = new ColumnVisibility(props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ));

                }

            } else {
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ,
                        new String(cv.getExpression()));
            }

        }

        if (doBulk) {

            LongRectangle outTileBounds = tileBounds.toLongRectangle();

            // setup the output for the job
            if (props.containsKey(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR)) {
                workDir = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR);
                if (workDir != null) {
                    workDir += File.separator;
                }
            } else {
                workDir = "";
            }
            workDir += AccumuloMrsImagePyramidFileOutputFormat.class.getSimpleName() + File.separator
                    + this.table + File.separator;// +
            //            System.currentTimeMillis() +
            //            File.separator;

            // delete the work dir if possible
            //        Path wd = new Path(workDir);
            //        FileSystem fs = HadoopFileUtils.getFileSystem(wd);        
            //        if (fs.exists(wd))
            //        {
            //          fs.delete(wd, false);
            //        }

            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR, workDir);

            // determine the starting points for the splits
            ArrayList<Pair<Long, Long>> splitPoints = new ArrayList<Pair<Long, Long>>();

            // think about the multiple levels and creating other splits!!!

            long step = bulkThreshold / outTileBounds.getWidth();
            long rem = bulkThreshold % outTileBounds.getWidth();
            if (rem > 0) {
                step++;
            }
            for (long y = outTileBounds.getMinY(); y <= outTileBounds.getMaxY(); y += step) {
                Pair<Long, Long> cur = new Pair<Long, Long>(outTileBounds.getMinX(), y);
                splitPoints.add(cur);
            }

            // we now have our list of split points
            // now build the splits file!!!
            FileSystem fs = null;
            //FileSystem.get(job.getConfiguration());
            PrintStream out = null;

            try {
                Path wd = new Path(workDir);
                fs = FileSystem.get(job.getConfiguration());
                if (fs.exists(wd)) {
                    fs.delete(wd, true);
                }

                out = new PrintStream(new BufferedOutputStream(fs.create(new Path(workDir + "splits.txt"))));

                for (Pair<Long, Long> p : splitPoints) {
                    long split = TMSUtils.tileid(p.getFirst(), p.getSecond(), zoomLevel);
                    //TileIdWritable t = new TileIdWritable(split);
                    Text t = new Text(longToBytes(split));
                    out.println(new String(Base64.encodeBase64(TextUtil.getBytes(t))));
                    log.debug("Point: " + p.getFirst() + "\t" + p.getSecond() + "\t" + split + "\t"
                            + t.getLength());
                }

                job.setNumReduceTasks(splitPoints.size() + 1);
                out.close();

                job.setPartitionerClass(AccumuloMrGeoRangePartitioner.class);
                AccumuloMrGeoRangePartitioner.setSplitFile(job, workDir + "splits.txt");

            } catch (IOException ioe) {
                ioe.printStackTrace();
                throw new DataProviderException(
                        "Problem creating output splits.txt for bulk ingest directory.");
            }

            job.setOutputFormatClass(AccumuloMrsImagePyramidFileOutputFormat.class);

            AccumuloMrsImagePyramidFileOutputFormat.setOutputPath(job, new Path(workDir + "files"));
            //AccumuloMrsImagePyramidFileOutputFormat.setZoomLevel(zoomLevel);

        } else {

            log.info("Setting the output format of: "
                    + AccumuloMrsImagePyramidOutputFormat.class.getCanonicalName());

            job.setOutputFormatClass(AccumuloMrsImagePyramidOutputFormat.class);
            AccumuloMrsImagePyramidOutputFormat.setJob(job);

            log.info("Setting zoom level to " + zoomLevel);
            log.info("Visibility is " + cv.toString());
            log.info("Setting the number of reducers to " + MrGeoAccumuloConstants.MRGEO_DEFAULT_NUM_REDUCERS);
            job.setNumReduceTasks(MrGeoAccumuloConstants.MRGEO_DEFAULT_NUM_REDUCERS);
        }

        job.setOutputKeyClass(TileIdWritable.class);
        job.setOutputValueClass(RasterWritable.class);

    } catch (IOException ioe) {
        throw new DataProviderException("Error running job setup", ioe);
    }

}

From source file:org.mrgeo.data.accumulo.output.image.AccumuloMrsPyramidOutputFormatProvider.java

License:Apache License

@SuppressWarnings("squid:S2095") // hadoop FileSystem cannot be closed, or else subsequent uses will fail
private void setupConfig(final Configuration conf, final Job job) throws DataProviderException {
    try {
        // zoom level - output zoom level
        zoomLevel = context.getZoomLevel();
        //      zoomLevel = conf.getInt("zoomlevel", 0);
        if (zoomLevel != 0) {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOMLEVEL, Integer.toString(zoomLevel));
        }

        //conf.set("zoomLevel", Integer.toString(zoomLevel));
        if (doBulk || forceBulk) {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_JOBTYPE,
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_BULK);
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PREFIX + Integer.toString(zoomLevel),
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_BULK);
        } else {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_JOBTYPE,
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_DIRECT);
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PREFIX + Integer.toString(zoomLevel),
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_DIRECT);

        }
        Properties props = AccumuloConnector.getAccumuloProperties();

        // this used to be the variable "name" in ImageOutputFormatContext, but was always "".
        String enc = AccumuloConnector.encodeAccumuloProperties("");
        conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_RESOURCE, enc);

        //        conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE,
        //                 props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE));
        //        conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS,
        //                 props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS));

        if (props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE) == null) {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE, this.table);
        } else {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE,
                    props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE));
        }

        //        // username and password
        //        conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER,
        //                 props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER));
        //
        //        // make sure the password is set with Base64Encoding
        //        String pw = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD);
        //        String isEnc = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64, "false");
        //
        //        if(isEnc.equalsIgnoreCase("true")){
        //          conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD,
        //                   props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD));
        //        } else {
        //          byte[] p = Base64.encodeBase64(props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD).getBytes());
        //
        //          conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD,
        //                   new String(p));
        //          conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64,
        //                   new String("true"));
        //        }

        if (conf.get(MrGeoConstants.MRGEO_PROTECTION_LEVEL) != null) {
            cv = new ColumnVisibility(conf.get(MrGeoConstants.MRGEO_PROTECTION_LEVEL));
        }
        if (cv == null) {

            if (props.containsKey(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ)) {

                conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ,
                        props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ));

                cv = new ColumnVisibility(props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ));

            }

        } else {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ, new String(cv.getExpression()));
        }

        if (doBulk || forceBulk) {

            LongRectangle outTileBounds = tileBounds.toLongRectangle();

            // setup the output for the job
            if (props.containsKey(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR)) {
                workDir = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR);
                if (workDir != null) {
                    workDir += File.separator;
                }
            } else {
                workDir = "";
            }
            workDir += AccumuloMrsPyramidFileOutputFormat.class.getSimpleName() + File.separator + this.table
                    + File.separator;// +
            //            System.currentTimeMillis() +
            //            File.separator;

            // delete the work dir if possible
            Path wd = new Path(workDir);
            FileSystem fs = FileSystem.get(conf);
            if (fs.exists(wd)) {
                fs.delete(wd, true);
            }

            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR, workDir);

            if (job != null) {
                // determine the starting points for the splits
                ArrayList<Pair<Long, Long>> splitPoints = new ArrayList<Pair<Long, Long>>();

                // think about the multiple levels and creating other splits!!!

                long step = bulkThreshold / outTileBounds.getWidth();
                long rem = bulkThreshold % outTileBounds.getWidth();
                if (rem > 0) {
                    step++;
                }
                for (long y = outTileBounds.getMinY(); y <= outTileBounds.getMaxY(); y += step) {
                    Pair<Long, Long> cur = new Pair<Long, Long>(outTileBounds.getMinX(), y);
                    splitPoints.add(cur);
                }

                // we now have our list of split points
                // now build the splits file!!!
                try (BufferedOutputStream bos = new BufferedOutputStream(
                        fs.create(new Path(workDir + "splits.txt")))) {
                    try (PrintStream out = new PrintStream(bos)) {
                        for (Pair<Long, Long> p : splitPoints) {
                            long split = TMSUtils.tileid(p.getFirst(), p.getSecond(), zoomLevel);
                            //TileIdWritable t = new TileIdWritable(split);
                            Text t = new Text(longToBytes(split));
                            out.println(Base64Utils.encodeObject(t.toString()));
                            log.debug("Point: " + p.getFirst() + "\t" + p.getSecond() + "\t" + split + "\t"
                                    + t.getLength());
                        }

                        job.setNumReduceTasks(splitPoints.size() + 1);
                        out.close();

                        job.setPartitionerClass(AccumuloMrGeoRangePartitioner.class);
                        AccumuloMrGeoRangePartitioner.setSplitFile(job, workDir + "splits.txt");

                    }
                } catch (IOException ioe) {
                    throw new DataProviderException(
                            "Problem creating output splits.txt for bulk ingest directory.", ioe);
                }

                job.setOutputFormatClass(AccumuloMrsPyramidFileOutputFormat.class);
            }
            Path workFilesPath = new Path(workDir + "files");
            if (job != null) {
                AccumuloMrsPyramidFileOutputFormat.setOutputPath(job, workFilesPath);
                //AccumuloMrsPyramidFileOutputFormat.setZoomLevel(zoomLevel);
            } else {
                Path outputDir = workFilesPath.getFileSystem(conf).makeQualified(workFilesPath);
                //          conf.set(AccumuloMrsPyramidFileOutputFormat.OUTDIR, outputDir.toString());
                conf.set("mapred.output.dir", outputDir.toString());
                conf.set("mapreduce.output.fileoutputformat.outputdir", outputDir.toString());
            }

        } else {
            if (job != null) {
                log.info("Setting the output format of: "
                        + AccumuloMrsPyramidOutputFormat.class.getCanonicalName());

                job.setOutputFormatClass(AccumuloMrsPyramidOutputFormat.class);
                AccumuloMrsPyramidOutputFormat.setJob(job);

                log.info("Setting zoom level to " + zoomLevel);
                log.info("Visibility is " + cv.toString());
                log.info("Setting the number of reducers to "
                        + MrGeoAccumuloConstants.MRGEO_DEFAULT_NUM_REDUCERS);
                job.setNumReduceTasks(MrGeoAccumuloConstants.MRGEO_DEFAULT_NUM_REDUCERS);
            }
        }

        if (job != null) {
            job.setOutputKeyClass(TileIdWritable.class);
            job.setOutputValueClass(RasterWritable.class);
        }

    } catch (IOException ioe) {
        throw new DataProviderException("Error running job setup", ioe);
    }

}

From source file:org.mrgeo.hdfs.partitioners.TileIdPartitioner.java

License:Apache License

public static Path setup(final Job job, final SplitGenerator splitGenerator) throws IOException {
    // don't set up a partitioner in local mode
    if (HadoopUtils.isLocal(job.getConfiguration())) {
        // make sure we have at least 1 reducer...
        if (job.getNumReduceTasks() < 1) {
            job.setNumReduceTasks(1);
        }
        return null;
    }

    PartitionerSplit splits = new PartitionerSplit();

    splits.generateSplits(splitGenerator);

    // create a split file in the hadoop tmp directory
    // this is copied into the job's output directory upon job completion
    final int uniquePrefixLen = 5;
    Path splitFile = new Path(HadoopFileUtils.getTempDir(job.getConfiguration()),
            HadoopUtils.createRandomString(uniquePrefixLen) + "_" + PartitionerSplit.SPLIT_FILE);

    splits.writeSplits(splitFile);

    job.setNumReduceTasks(splits.length());
    job.setPartitionerClass(TileIdPartitioner.class);

    setSplitFile(splitFile.toString(), job);

    return splitFile;
}

From source file:org.myorg.KSorter.java

License:Open Source License

public static void main(String[] args) throws Exception {

    Configuration conf = new Configuration();
    Job job = new Job(conf, "parallelsort");

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setPartitionerClass(ScorePartitioner.class);

    job.setJarByClass(KSorter.class);
    job.setMapperClass(SortMapper.class);
    job.setReducerClass(SortReducer.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}

From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java

License:Open Source License

protected boolean runDictionaryJob()
        throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
    boolean jobOK;
    Job job = null;
    BufferedWriter bufferedWriter;

    // if output path exists...
    if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
        if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
        } else { // ... and option not provided, fail
            System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
            System.out.println("Select other path or use option -dd to overwrite");
            System.exit(-1);
        }
    }

    // Sample the SequenceInputFormat to do TotalSort and create final output
    job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2");

    job.setJarByClass(HDTBuilderDriver.class);

    System.out.println("samples = " + this.conf.getDictionarySamplesPath());
    System.out.println("output = " + this.conf.getDictionaryOutputPath());

    FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    // Identity Mapper
    // job.setMapperClass(Mapper.class);
    job.setCombinerClass(DictionaryCombiner.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setReducerClass(DictionaryReducer.class);

    job.setNumReduceTasks(this.conf.getDictionaryReducers());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
    this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
    this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
    this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();

    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));

    bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");

    bufferedWriter.close();

    return jobOK;
}

From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java

License:Open Source License

protected boolean runTriplesJob()
        throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
    Job job = null;
    boolean jobOK;

    // if triples output path exists...
    if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
        if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively
            this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
        } else { // ... and option not provided, fail
            System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath());
            System.out.println("Select other path or use option -dt to overwrite");
            System.exit(-1);
        }
    }

    job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 2");

    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getTriplesSamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setSortComparatorClass(TripleSPOComparator.class);
    job.setGroupingComparatorClass(TripleSPOComparator.class);

    job.setPartitionerClass(TotalOrderPartitioner.class);

    job.setOutputKeyClass(TripleSPOWritable.class);
    job.setOutputValueClass(NullWritable.class);

    job.setNumReduceTasks(this.conf.getTriplesReducers());

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    return jobOK;
}

From source file:org.seqdoop.hadoop_bam.cli.plugins.FixMate.java

License:Open Source License

@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("fixmate :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("fixmate :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    final ValidationStringency stringency = Utils.toStringency(
            parser.getOptionValue(stringencyOpt, ValidationStringency.DEFAULT_STRINGENCY.toString()),
            "fixmate");
    if (stringency == null)
        return 3;

    Path wrkDir = new Path(args.get(0));

    final List<String> strInputs = args.subList(1, args.size());
    final List<Path> inputs = new ArrayList<Path>(strInputs.size());
    for (final String in : strInputs)
        inputs.add(new Path(in));

    final Configuration conf = getConf();

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    if (stringency != null)
        conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString());

    final boolean globalSort = parser.getBoolean(sortOpt);
    if (globalSort)
        Utils.setHeaderMergerSortOrder(conf, SAMFileHeader.SortOrder.queryname);

    conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        if (globalSort)
            Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(FixMate.class);
        job.setMapperClass(FixMateMapper.class);
        job.setReducerClass(FixMateReducer.class);

        if (!parser.getBoolean(noCombinerOpt))
            job.setCombinerClass(FixMateReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(SAMRecordWritable.class);

        job.setInputFormatClass(AnySAMInputFormat.class);
        job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

        for (final Path in : inputs)
            FileInputFormat.addInputPath(job, in);

        FileOutputFormat.setOutputPath(job, wrkDir);

        if (globalSort) {
            job.setPartitionerClass(TotalOrderPartitioner.class);

            System.out.println("fixmate :: Sampling...");
            t.start();

            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job,
                    new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000,
                            Math.max(100, reduceTasks)));

            System.out.printf("fixmate :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());
        }

        job.submit();

        System.out.println("fixmate :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("fixmate :: Job failed.");
            return 4;
        }

        System.out.printf("fixmate :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("fixmate :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "fixmate");
        } catch (IOException e) {
            System.err.printf("fixmate :: Output merging failed: %s\n", e);
            return 5;
        }
    return 0;
}

From source file:org.seqdoop.hadoop_bam.cli.plugins.Sort.java

License:Open Source License

@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("sort :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("sort :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    final ValidationStringency stringency = Utils.toStringency(
            parser.getOptionValue(stringencyOpt, ValidationStringency.DEFAULT_STRINGENCY.toString()), "sort");
    if (stringency == null)
        return 3;

    Path wrkDir = new Path(args.get(0));

    final List<String> strInputs = args.subList(1, args.size());
    final List<Path> inputs = new ArrayList<Path>(strInputs.size());
    for (final String in : strInputs)
        inputs.add(new Path(in));

    final Configuration conf = getConf();

    Utils.setHeaderMergerSortOrder(conf, SortOrder.coordinate);
    conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));

    if (stringency != null)
        conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString());

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(Sort.class);
        job.setMapperClass(Mapper.class);
        job.setReducerClass(SortReducer.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(SAMRecordWritable.class);

        job.setInputFormatClass(SortInputFormat.class);
        job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

        for (final Path in : inputs)
            FileInputFormat.addInputPath(job, in);

        FileOutputFormat.setOutputPath(job, wrkDir);

        job.setPartitionerClass(TotalOrderPartitioner.class);

        System.out.println("sort :: Sampling...");
        t.start();

        InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000,
                        Math.max(100, reduceTasks)));

        System.out.printf("sort :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());

        job.submit();

        System.out.println("sort :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("sort :: Job failed.");
            return 4;
        }

        System.out.printf("sort :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("sort :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "sort");
        } catch (IOException e) {
            System.err.printf("sort :: Output merging failed: %s\n", e);
            return 5;
        }
    return 0;
}

From source file:org.sifarish.common.AttributeBasedDiversifier.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Attribute based diversifer for ranked and  recommended items  MR";
    job.setJobName(jobName);//  w  w  w  . j  ava2 s.c om

    job.setJarByClass(AttributeBasedDiversifier.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(AttributeBasedDiversifier.AttributeDiversifierMapper.class);
    job.setReducerClass(AttributeBasedDiversifier.AttributeDiversifierReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());
    int numReducer = job.getConfiguration().getInt("abd.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.sifarish.common.BusinessGoalInjector.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Business goal injector MR";
    job.setJobName(jobName);

    job.setJarByClass(BusinessGoalInjector.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(BusinessGoalInjector.BusinessGoalMapper.class);
    job.setReducerClass(BusinessGoalInjector.BusinessGoalReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());
    int numReducer = job.getConfiguration().getInt("bgi.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}