Usage examples for org.apache.hadoop.mapreduce.Job.setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
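Before the project-specific examples below, here is a minimal, self-contained sketch of the call. The class and field names (FirstCharPartitionerExample, FirstCharPartitioner) are illustrative only and do not come from any of the sources listed on this page; the point is simply that the partitioner is registered on the Job before submission, since setPartitionerClass throws IllegalStateException once the job has been submitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FirstCharPartitionerExample {

    // Illustrative partitioner: routes each record by the first character of its Text key.
    public static class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            if (key.getLength() == 0) {
                return 0;
            }
            // Mask the sign bit so the partition index is never negative.
            return (key.charAt(0) & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "first-char partition example");
        job.setJarByClass(FirstCharPartitionerExample.class);

        // Mapper/Reducer classes omitted; the partitioner only needs the map output key type.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Must be called before the job is submitted, otherwise IllegalStateException is thrown.
        job.setPartitionerClass(FirstCharPartitioner.class);
        job.setNumReduceTasks(4);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}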
From source file:org.mrgeo.data.accumulo.output.image.AccumuloMrsImagePyramidOutputFormatProvider.java
License:Apache License
@Override
public void setupJob(final Job job) throws DataProviderException {
    try {
        // TODO: there is an assumption here that the output is going to accumulo directly - not bulk
        super.setupJob(job);

        job.getConfiguration().addResource(AccumuloConnector.getAccumuloPropertiesLocation());

        // zoom level - output zoom level
        zoomLevel = context.getZoomlevel();
        // zoomLevel = job.getConfiguration().getInt("zoomlevel", 0);
        if (zoomLevel != 0) {
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOMLEVEL,
                    Integer.toString(zoomLevel));
        }
        //job.getConfiguration().set("zoomLevel", Integer.toString(zoomLevel));

        if (doBulk) {
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_JOBTYPE,
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_BULK);
            job.getConfiguration().set(
                    MrGeoAccumuloConstants.MRGEO_ACC_KEY_PREFIX + Integer.toString(zoomLevel),
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_BULK);
        } else {
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_JOBTYPE,
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_DIRECT);
            job.getConfiguration().set(
                    MrGeoAccumuloConstants.MRGEO_ACC_KEY_PREFIX + Integer.toString(zoomLevel),
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_DIRECT);
        }

        Properties props = AccumuloConnector.getAccumuloProperties();
        if (props != null) {

            // this used to be the variable "name" in TiledOutputFormatContext, but was always "".
            String enc = AccumuloConnector.encodeAccumuloProperties("");
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_RESOURCE, enc);

            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE,
                    props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE));
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS,
                    props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS));

            if (props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE) == null) {
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE, this.table);
            } else {
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE,
                        props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE));
            }

            // username and password
            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER,
                    props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER));

            // make sure the password is set with Base64Encoding
            String pw = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD);
            String isEnc = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64, "false");

            if (isEnc.equalsIgnoreCase("true")) {
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD,
                        props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD));
            } else {
                byte[] p = Base64.encodeBase64(
                        props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD).getBytes());
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD, new String(p));
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64, new String("true"));
            }

            if (job.getConfiguration().get(MrGeoConstants.MRGEO_PROTECTION_LEVEL) != null) {
                cv = new ColumnVisibility(job.getConfiguration().get(MrGeoConstants.MRGEO_PROTECTION_LEVEL));
            }
            if (cv == null) {
                if (props.containsKey(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ)) {
                    job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ,
                            props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ));
                    cv = new ColumnVisibility(props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ));
                }
            } else {
                job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ,
                        new String(cv.getExpression()));
            }
        }

        if (doBulk) {
            LongRectangle outTileBounds = tileBounds.toLongRectangle();

            // setup the output for the job
            if (props.containsKey(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR)) {
                workDir = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR);
                if (workDir != null) {
                    workDir += File.separator;
                }
            } else {
                workDir = "";
            }
            workDir += AccumuloMrsImagePyramidFileOutputFormat.class.getSimpleName() + File.separator
                    + this.table + File.separator; // + System.currentTimeMillis() + File.separator;

            // delete the work dir if possible
            // Path wd = new Path(workDir);
            // FileSystem fs = HadoopFileUtils.getFileSystem(wd);
            // if (fs.exists(wd))
            // {
            //     fs.delete(wd, false);
            // }

            job.getConfiguration().set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR, workDir);

            // determine the starting points for the splits
            ArrayList<Pair<Long, Long>> splitPoints = new ArrayList<Pair<Long, Long>>();

            // think about the multiple levels and creating other splits!!!
            long step = bulkThreshold / outTileBounds.getWidth();
            long rem = bulkThreshold % outTileBounds.getWidth();
            if (rem > 0) {
                step++;
            }
            for (long y = outTileBounds.getMinY(); y <= outTileBounds.getMaxY(); y += step) {
                Pair<Long, Long> cur = new Pair<Long, Long>(outTileBounds.getMinX(), y);
                splitPoints.add(cur);
            }

            // we now have our list of split points
            // now build the splits file!!!
            FileSystem fs = null; // FileSystem.get(job.getConfiguration());
            PrintStream out = null;
            try {
                Path wd = new Path(workDir);
                fs = FileSystem.get(job.getConfiguration());
                if (fs.exists(wd)) {
                    fs.delete(wd, true);
                }

                out = new PrintStream(new BufferedOutputStream(fs.create(new Path(workDir + "splits.txt"))));

                for (Pair<Long, Long> p : splitPoints) {
                    long split = TMSUtils.tileid(p.getFirst(), p.getSecond(), zoomLevel);
                    //TileIdWritable t = new TileIdWritable(split);
                    Text t = new Text(longToBytes(split));
                    out.println(new String(Base64.encodeBase64(TextUtil.getBytes(t))));
                    log.debug("Point: " + p.getFirst() + "\t" + p.getSecond() + "\t" + split + "\t"
                            + t.getLength());
                }

                job.setNumReduceTasks(splitPoints.size() + 1);
                out.close();

                job.setPartitionerClass(AccumuloMrGeoRangePartitioner.class);
                AccumuloMrGeoRangePartitioner.setSplitFile(job, workDir + "splits.txt");

            } catch (IOException ioe) {
                ioe.printStackTrace();
                throw new DataProviderException(
                        "Problem creating output splits.txt for bulk ingest directory.");
            }

            job.setOutputFormatClass(AccumuloMrsImagePyramidFileOutputFormat.class);
            AccumuloMrsImagePyramidFileOutputFormat.setOutputPath(job, new Path(workDir + "files"));
            //AccumuloMrsImagePyramidFileOutputFormat.setZoomLevel(zoomLevel);
        } else {
            log.info("Setting the output format of: "
                    + AccumuloMrsImagePyramidOutputFormat.class.getCanonicalName());
            job.setOutputFormatClass(AccumuloMrsImagePyramidOutputFormat.class);
            AccumuloMrsImagePyramidOutputFormat.setJob(job);

            log.info("Setting zoom level to " + zoomLevel);
            log.info("Visibility is " + cv.toString());
            log.info("Setting the number of reducers to " + MrGeoAccumuloConstants.MRGEO_DEFAULT_NUM_REDUCERS);
            job.setNumReduceTasks(MrGeoAccumuloConstants.MRGEO_DEFAULT_NUM_REDUCERS);
        }

        job.setOutputKeyClass(TileIdWritable.class);
        job.setOutputValueClass(RasterWritable.class);

    } catch (IOException ioe) {
        throw new DataProviderException("Error running job setup", ioe);
    }
}
From source file:org.mrgeo.data.accumulo.output.image.AccumuloMrsPyramidOutputFormatProvider.java
License:Apache License
@SuppressWarnings("squid:S2095") // hadoop FileSystem cannot be closed, or else subsequent uses will fail
private void setupConfig(final Configuration conf, final Job job) throws DataProviderException {
    try {
        // zoom level - output zoom level
        zoomLevel = context.getZoomLevel();
        // zoomLevel = conf.getInt("zoomlevel", 0);
        if (zoomLevel != 0) {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOMLEVEL, Integer.toString(zoomLevel));
        }
        //conf.set("zoomLevel", Integer.toString(zoomLevel));

        if (doBulk || forceBulk) {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_JOBTYPE,
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_BULK);
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PREFIX + Integer.toString(zoomLevel),
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_BULK);
        } else {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_JOBTYPE,
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_DIRECT);
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PREFIX + Integer.toString(zoomLevel),
                    MrGeoAccumuloConstants.MRGEO_ACC_VALUE_JOB_DIRECT);
        }

        Properties props = AccumuloConnector.getAccumuloProperties();

        // this used to be the variable "name" in ImageOutputFormatContext, but was always "".
        String enc = AccumuloConnector.encodeAccumuloProperties("");
        conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_RESOURCE, enc);

        // conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE,
        //     props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE));
        // conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS,
        //     props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS));

        if (props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE) == null) {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE, this.table);
        } else {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE,
                    props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE));
        }

        // // username and password
        // conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER,
        //     props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER));
        //
        // // make sure the password is set with Base64Encoding
        // String pw = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD);
        // String isEnc = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64, "false");
        //
        // if(isEnc.equalsIgnoreCase("true")){
        //     conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD,
        //         props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD));
        // } else {
        //     byte[] p = Base64.encodeBase64(props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD).getBytes());
        //     conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD, new String(p));
        //     conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64, new String("true"));
        // }

        if (conf.get(MrGeoConstants.MRGEO_PROTECTION_LEVEL) != null) {
            cv = new ColumnVisibility(conf.get(MrGeoConstants.MRGEO_PROTECTION_LEVEL));
        }
        if (cv == null) {
            if (props.containsKey(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ)) {
                conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ,
                        props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ));
                cv = new ColumnVisibility(props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ));
            }
        } else {
            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ, new String(cv.getExpression()));
        }

        if (doBulk || forceBulk) {
            LongRectangle outTileBounds = tileBounds.toLongRectangle();

            // setup the output for the job
            if (props.containsKey(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR)) {
                workDir = props.getProperty(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR);
                if (workDir != null) {
                    workDir += File.separator;
                }
            } else {
                workDir = "";
            }
            workDir += AccumuloMrsPyramidFileOutputFormat.class.getSimpleName() + File.separator
                    + this.table + File.separator; // + System.currentTimeMillis() + File.separator;

            // delete the work dir if possible
            Path wd = new Path(workDir);
            FileSystem fs = FileSystem.get(conf);
            if (fs.exists(wd)) {
                fs.delete(wd, true);
            }

            conf.set(MrGeoAccumuloConstants.MRGEO_ACC_KEY_WORKDIR, workDir);

            if (job != null) {
                // determine the starting points for the splits
                ArrayList<Pair<Long, Long>> splitPoints = new ArrayList<Pair<Long, Long>>();

                // think about the multiple levels and creating other splits!!!
                long step = bulkThreshold / outTileBounds.getWidth();
                long rem = bulkThreshold % outTileBounds.getWidth();
                if (rem > 0) {
                    step++;
                }
                for (long y = outTileBounds.getMinY(); y <= outTileBounds.getMaxY(); y += step) {
                    Pair<Long, Long> cur = new Pair<Long, Long>(outTileBounds.getMinX(), y);
                    splitPoints.add(cur);
                }

                // we now have our list of split points
                // now build the splits file!!!
                try (BufferedOutputStream bos = new BufferedOutputStream(
                        fs.create(new Path(workDir + "splits.txt")))) {
                    try (PrintStream out = new PrintStream(bos)) {
                        for (Pair<Long, Long> p : splitPoints) {
                            long split = TMSUtils.tileid(p.getFirst(), p.getSecond(), zoomLevel);
                            //TileIdWritable t = new TileIdWritable(split);
                            Text t = new Text(longToBytes(split));
                            out.println(Base64Utils.encodeObject(t.toString()));
                            log.debug("Point: " + p.getFirst() + "\t" + p.getSecond() + "\t" + split + "\t"
                                    + t.getLength());
                        }
                        job.setNumReduceTasks(splitPoints.size() + 1);
                        out.close();

                        job.setPartitionerClass(AccumuloMrGeoRangePartitioner.class);
                        AccumuloMrGeoRangePartitioner.setSplitFile(job, workDir + "splits.txt");
                    }
                } catch (IOException ioe) {
                    throw new DataProviderException(
                            "Problem creating output splits.txt for bulk ingest directory.", ioe);
                }

                job.setOutputFormatClass(AccumuloMrsPyramidFileOutputFormat.class);
            }

            Path workFilesPath = new Path(workDir + "files");
            if (job != null) {
                AccumuloMrsPyramidFileOutputFormat.setOutputPath(job, workFilesPath);
                //AccumuloMrsPyramidFileOutputFormat.setZoomLevel(zoomLevel);
            } else {
                Path outputDir = workFilesPath.getFileSystem(conf).makeQualified(workFilesPath);
                // conf.set(AccumuloMrsPyramidFileOutputFormat.OUTDIR, outputDir.toString());
                conf.set("mapred.output.dir", outputDir.toString());
                conf.set("mapreduce.output.fileoutputformat.outputdir", outputDir.toString());
            }
        } else {
            if (job != null) {
                log.info("Setting the output format of: "
                        + AccumuloMrsPyramidOutputFormat.class.getCanonicalName());
                job.setOutputFormatClass(AccumuloMrsPyramidOutputFormat.class);
                AccumuloMrsPyramidOutputFormat.setJob(job);

                log.info("Setting zoom level to " + zoomLevel);
                log.info("Visibility is " + cv.toString());
                log.info("Setting the number of reducers to "
                        + MrGeoAccumuloConstants.MRGEO_DEFAULT_NUM_REDUCERS);
                job.setNumReduceTasks(MrGeoAccumuloConstants.MRGEO_DEFAULT_NUM_REDUCERS);
            }
        }

        if (job != null) {
            job.setOutputKeyClass(TileIdWritable.class);
            job.setOutputValueClass(RasterWritable.class);
        }
    } catch (IOException ioe) {
        throw new DataProviderException("Error running job setup", ioe);
    }
}
From source file:org.mrgeo.hdfs.partitioners.TileIdPartitioner.java
License:Apache License
public static Path setup(final Job job, final SplitGenerator splitGenerator) throws IOException {
    // don't set up a partitioner in local mode
    if (HadoopUtils.isLocal(job.getConfiguration())) {
        // make sure we have at least 1 reducer...
        if (job.getNumReduceTasks() < 1) {
            job.setNumReduceTasks(1);
        }
        return null;
    }

    PartitionerSplit splits = new PartitionerSplit();

    splits.generateSplits(splitGenerator);

    // create a split file in the hadoop tmp directory
    // this is copied into the job's output directory upon job completion
    final int uniquePrefixLen = 5;
    Path splitFile = new Path(HadoopFileUtils.getTempDir(job.getConfiguration()),
            HadoopUtils.createRandomString(uniquePrefixLen) + "_" + PartitionerSplit.SPLIT_FILE);

    splits.writeSplits(splitFile);

    job.setNumReduceTasks(splits.length());
    job.setPartitionerClass(TileIdPartitioner.class);

    setSplitFile(splitFile.toString(), job);

    return splitFile;
}
From source file:org.myorg.KSorter.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "parallelsort");

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setPartitionerClass(ScorePartitioner.class);

    job.setJarByClass(KSorter.class);
    job.setMapperClass(SortMapper.class);
    job.setReducerClass(SortReducer.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}
From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License:Open Source License
protected boolean runDictionaryJob()
        throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
    boolean jobOK;
    Job job = null;
    BufferedWriter bufferedWriter;

    // if output path exists...
    if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
        if (this.conf.getDeleteDictionaryOutputPath()) {
            // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
        } else {
            // ... and option not provided, fail
            System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
            System.out.println("Select other path or use option -dd to overwrite");
            System.exit(-1);
        }
    }

    // Sample the SequenceInputFormat to do TotalSort and create final output
    job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2");

    job.setJarByClass(HDTBuilderDriver.class);

    System.out.println("samples = " + this.conf.getDictionarySamplesPath());
    System.out.println("output = " + this.conf.getDictionaryOutputPath());

    FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    // Identity Mapper
    // job.setMapperClass(Mapper.class);
    job.setCombinerClass(DictionaryCombiner.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setReducerClass(DictionaryReducer.class);

    job.setNumReduceTasks(this.conf.getDictionaryReducers());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
    this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
    this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
    this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();

    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));

    bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");

    bufferedWriter.close();

    return jobOK;
}
From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License:Open Source License
protected boolean runTriplesJob()
        throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
    Job job = null;
    boolean jobOK;

    // if triples output path exists...
    if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
        if (this.conf.getDeleteTriplesOutputPath()) {
            // ... and option provided, delete recursively
            this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
        } else {
            // ... and option not provided, fail
            System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath());
            System.out.println("Select other path or use option -dt to overwrite");
            System.exit(-1);
        }
    }

    job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 2");

    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getTriplesSamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setSortComparatorClass(TripleSPOComparator.class);
    job.setGroupingComparatorClass(TripleSPOComparator.class);

    job.setPartitionerClass(TotalOrderPartitioner.class);

    job.setOutputKeyClass(TripleSPOWritable.class);
    job.setOutputValueClass(NullWritable.class);

    job.setNumReduceTasks(this.conf.getTriplesReducers());

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    return jobOK;
}
From source file:org.seqdoop.hadoop_bam.cli.plugins.FixMate.java
License:Open Source License
@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("fixmate :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("fixmate :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    final ValidationStringency stringency = Utils.toStringency(
            parser.getOptionValue(stringencyOpt, ValidationStringency.DEFAULT_STRINGENCY.toString()),
            "fixmate");
    if (stringency == null)
        return 3;

    Path wrkDir = new Path(args.get(0));

    final List<String> strInputs = args.subList(1, args.size());
    final List<Path> inputs = new ArrayList<Path>(strInputs.size());
    for (final String in : strInputs)
        inputs.add(new Path(in));

    final Configuration conf = getConf();

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    if (stringency != null)
        conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString());

    final boolean globalSort = parser.getBoolean(sortOpt);
    if (globalSort)
        Utils.setHeaderMergerSortOrder(conf, SAMFileHeader.SortOrder.queryname);

    conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        if (globalSort)
            Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(FixMate.class);
        job.setMapperClass(FixMateMapper.class);
        job.setReducerClass(FixMateReducer.class);

        if (!parser.getBoolean(noCombinerOpt))
            job.setCombinerClass(FixMateReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(SAMRecordWritable.class);

        job.setInputFormatClass(AnySAMInputFormat.class);
        job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

        for (final Path in : inputs)
            FileInputFormat.addInputPath(job, in);

        FileOutputFormat.setOutputPath(job, wrkDir);

        if (globalSort) {
            job.setPartitionerClass(TotalOrderPartitioner.class);

            System.out.println("fixmate :: Sampling...");
            t.start();

            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job,
                    new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000,
                            Math.max(100, reduceTasks)));

            System.out.printf("fixmate :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());
        }

        job.submit();

        System.out.println("fixmate :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("fixmate :: Job failed.");
            return 4;
        }

        System.out.printf("fixmate :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("fixmate :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "fixmate");
        } catch (IOException e) {
            System.err.printf("fixmate :: Output merging failed: %s\n", e);
            return 5;
        }

    return 0;
}
From source file:org.seqdoop.hadoop_bam.cli.plugins.Sort.java
License:Open Source License
@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("sort :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("sort :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    final ValidationStringency stringency = Utils.toStringency(
            parser.getOptionValue(stringencyOpt, ValidationStringency.DEFAULT_STRINGENCY.toString()), "sort");
    if (stringency == null)
        return 3;

    Path wrkDir = new Path(args.get(0));

    final List<String> strInputs = args.subList(1, args.size());
    final List<Path> inputs = new ArrayList<Path>(strInputs.size());
    for (final String in : strInputs)
        inputs.add(new Path(in));

    final Configuration conf = getConf();

    Utils.setHeaderMergerSortOrder(conf, SortOrder.coordinate);
    conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));

    if (stringency != null)
        conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString());

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(Sort.class);
        job.setMapperClass(Mapper.class);
        job.setReducerClass(SortReducer.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(SAMRecordWritable.class);

        job.setInputFormatClass(SortInputFormat.class);
        job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

        for (final Path in : inputs)
            FileInputFormat.addInputPath(job, in);

        FileOutputFormat.setOutputPath(job, wrkDir);

        job.setPartitionerClass(TotalOrderPartitioner.class);

        System.out.println("sort :: Sampling...");
        t.start();

        InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000,
                        Math.max(100, reduceTasks)));

        System.out.printf("sort :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());

        job.submit();

        System.out.println("sort :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("sort :: Job failed.");
            return 4;
        }

        System.out.printf("sort :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("sort :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "sort");
        } catch (IOException e) {
            System.err.printf("sort :: Output merging failed: %s\n", e);
            return 5;
        }

    return 0;
}
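The HDTBuilderDriver, FixMate, and Sort examples above all pair setPartitionerClass with Hadoop's TotalOrderPartitioner: sample the input, write a partition file of key cut points, then ship that file to every task so reducer i only receives keys below those of reducer i+1. Below is a minimal sketch of that pattern in isolation, assuming SequenceFile input whose keys and values are Text; the class name, paths, reducer count, and sampling parameters are illustrative only, and it uses Job.addCacheFile (the newer replacement for the DistributedCache calls shown above).

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSortSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "total order sort sketch");
        job.setJarByClass(TotalOrderSortSketch.class);

        // Identity mapper and reducer: the shuffle does the sorting once keys are range-partitioned.
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(4); // must be set before sampling, so the right number of cut points is written

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Range-partition map output keys across the reducers.
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Sample the input and write the cut points that TotalOrderPartitioner will read.
        Path partitionFile = new Path(args[1] + "_partitions");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        InputSampler.writePartitionFile(job, new InputSampler.RandomSampler<Text, Text>(0.01, 10000));

        // Ship the partition file to every task via the distributed cache.
        job.addCacheFile(new URI(partitionFile.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}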
From source file:org.sifarish.common.AttributeBasedDiversifier.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Attribute based diversifer for ranked and recommended items MR";
    job.setJobName(jobName);

    job.setJarByClass(AttributeBasedDiversifier.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(AttributeBasedDiversifier.AttributeDiversifierMapper.class);
    job.setReducerClass(AttributeBasedDiversifier.AttributeDiversifierReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    int numReducer = job.getConfiguration().getInt("abd.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}
From source file:org.sifarish.common.BusinessGoalInjector.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Business goal injector MR";
    job.setJobName(jobName);

    job.setJarByClass(BusinessGoalInjector.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(BusinessGoalInjector.BusinessGoalMapper.class);
    job.setReducerClass(BusinessGoalInjector.BusinessGoalReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    int numReducer = job.getConfiguration().getInt("bgi.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}