Usage examples for org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.setOutputCompressionType
public static void setOutputCompressionType(Job job, CompressionType style)
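The entries below are real call sites collected from open-source projects. For orientation, here is a minimal, self-contained driver sketching the usual pattern: select SequenceFileOutputFormat, enable compression, choose the compression granularity, and pick a codec. The class name and the argument-based paths are placeholders for illustration, not taken from any of the sources below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

// Hypothetical driver class; the paths in args are placeholders.
public class CompressedSeqFileDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "compressed-seqfile-example");
    job.setJarByClass(CompressedSeqFileDriver.class);

    // Write the job output as a SequenceFile.
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // Enable compression, then choose how it is applied:
    // RECORD compresses each value individually; BLOCK batches
    // many records before compressing.
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);

    // With the default TextInputFormat and identity mapper/reducer,
    // the output records are (LongWritable, Text) pairs.
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));   // placeholder input
    FileOutputFormat.setOutputPath(job, new Path(args[1]));  // placeholder output
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

RECORD compresses each value on its own, while BLOCK compresses groups of records together, which typically yields a better compression ratio at the cost of coarser random access; the examples below use both.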
From source file:ivory.core.preprocess.BuildIntDocVectors.java
License:Apache License
public int runTool() throws Exception {
  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  String indexPath = conf.get(Constants.IndexPath);
  RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
  String collectionName = env.readCollectionName();

  LOG.info("PowerTool: " + BuildIntDocVectors.class.getCanonicalName());
  LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
  LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));

  String termsFile = env.getIndexTermsData();
  String termIDsFile = env.getIndexTermIdsData();
  String idToTermFile = env.getIndexTermIdMappingData();

  Path termsFilePath = new Path(termsFile);
  Path termIDsFilePath = new Path(termIDsFile);
  if (!fs.exists(termsFilePath) || !fs.exists(termIDsFilePath)) {
    LOG.error("Error, terms files don't exist!");
    return 0;
  }

  Path outputPath = new Path(env.getIntDocVectorsDirectory());
  if (fs.exists(outputPath)) {
    LOG.info("IntDocVectors already exist: skipping!");
    return 0;
  }

  DistributedCache.addCacheFile(new URI(termsFile), conf);
  DistributedCache.addCacheFile(new URI(termIDsFile), conf);
  DistributedCache.addCacheFile(new URI(idToTermFile), conf);

  conf.set("mapred.child.java.opts", "-Xmx2048m");

  Job job = new Job(conf, BuildIntDocVectors.class.getSimpleName() + ":" + collectionName);
  job.setJarByClass(BuildIntDocVectors.class);
  job.setNumReduceTasks(0);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.setInputPaths(job, env.getTermDocVectorsDirectory());
  FileOutputFormat.setOutputPath(job, outputPath);
  SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);

  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(LazyIntDocVector.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(LazyIntDocVector.class);

  job.setMapperClass(MyMapper.class);

  long startTime = System.currentTimeMillis();
  job.waitForCompletion(true);
  LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  return 0;
}
From source file:ivory.core.preprocess.BuildTermDocVectors.java
License:Apache License
@SuppressWarnings("unchecked") public int runTool() throws Exception { Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); String collectionName = conf.get(Constants.CollectionName); String collectionPath = conf.get(Constants.CollectionPath); String inputFormat = conf.get(Constants.InputFormat); String tokenizer = conf.get(Constants.Tokenizer); String mappingClass = conf.get(Constants.DocnoMappingClass); int docnoOffset = conf.getInt(Constants.DocnoOffset, 0); int numReducers = conf.getInt(Constants.TermDocVectorSegments, 0); LOG.info("PowerTool: " + BuildTermDocVectors.class.getCanonicalName()); LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath)); LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName)); LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath)); LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat)); LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer)); LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass)); LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset)); LOG.info(String.format(" - %s: %s", Constants.TermDocVectorSegments, numReducers)); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { LOG.error("Error, docno mapping data file " + mappingFile + " doesn't exist!"); return 0; }/*from w w w .ja v a 2s. c o m*/ DistributedCache.addCacheFile(mappingFile.toUri(), conf); Path outputPath = new Path(env.getTermDocVectorsDirectory()); if (fs.exists(outputPath)) { LOG.info("TermDocVectors already exist: Skipping!"); return 0; } env.writeCollectionName(collectionName); env.writeCollectionPath(collectionPath); env.writeInputFormat(inputFormat); env.writeDocnoMappingClass(mappingClass); env.writeTokenizerClass(tokenizer); env.writeDocnoOffset(docnoOffset); conf.set("mapred.child.java.opts", "-Xmx2048m"); Job job1 = new Job(conf, BuildTermDocVectors.class.getSimpleName() + ":" + collectionName); job1.setJarByClass(BuildTermDocVectors.class); job1.setNumReduceTasks(numReducers); FileInputFormat.addInputPaths(job1, collectionPath); FileOutputFormat.setOutputPath(job1, outputPath); SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD); job1.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat)); job1.setOutputFormatClass(SequenceFileOutputFormat.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(LazyTermDocVector.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(LazyTermDocVector.class); job1.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); job1.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // Write out number of postings. 
int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue(); env.writeCollectionDocumentCount(collectionDocCount); Path dlFile = env.getDoclengthsData(); if (fs.exists(dlFile)) { LOG.info("DocLength data exists: Skipping!"); return 0; } conf.setInt(Constants.CollectionDocumentCount, collectionDocCount); conf.set(InputPath, env.getDoclengthsDirectory().toString()); conf.set(DocLengthDataFile, dlFile.toString()); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); LOG.info("Writing doc length data to " + dlFile + "..."); Job job2 = new Job(conf, "DocLengthTable:" + collectionName); job2.setJarByClass(BuildTermDocVectors.class); job2.setNumReduceTasks(0); job2.setInputFormatClass(NullInputFormat.class); job2.setOutputFormatClass(NullOutputFormat.class); job2.setMapperClass(DocLengthDataWriterMapper.class); startTime = System.currentTimeMillis(); job2.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue(); env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount); return 0; }
From source file:ivory.preprocess.BuildIntDocVectors2.java
License:Apache License
public int runTool() throws Exception {
  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  String indexPath = conf.get(Constants.IndexPath);
  RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
  String collectionName = env.readCollectionName();

  LOG.info("PowerTool: BuildIntDocVectors2");
  LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
  LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));

  String termsFile = env.getIndexTermsData();
  String termIDsFile = env.getIndexTermIdsData();
  String idToTermFile = env.getIndexTermIdMappingData();

  Path termsFilePath = new Path(termsFile);
  Path termIDsFilePath = new Path(termIDsFile);
  if (!fs.exists(termsFilePath) || !fs.exists(termIDsFilePath)) {
    LOG.error("Error, terms files don't exist!");
    return 0;
  }

  Path outputPath = new Path(env.getIntDocVectorsDirectory());
  if (fs.exists(outputPath)) {
    LOG.info("IntDocVectors already exist: skipping!");
    return 0;
  }

  DistributedCache.addCacheFile(new URI(termsFile), conf);
  DistributedCache.addCacheFile(new URI(termIDsFile), conf);
  DistributedCache.addCacheFile(new URI(idToTermFile), conf);

  conf.set("mapred.child.java.opts", "-Xmx2048m");
  //conf.set("mapreduce.map.java.opts", "-Xmx2048m");

  Job job = new Job(conf, "BuildIntDocVectors2:" + collectionName);
  job.setJarByClass(BuildIntDocVectors2.class);
  job.setNumReduceTasks(0);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.setInputPaths(job, env.getTermDocVectorsDirectory());
  FileOutputFormat.setOutputPath(job, outputPath);
  SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);

  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(LazyIntDocVector.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(LazyIntDocVector.class);

  job.setMapperClass(MyMapper.class);

  long startTime = System.currentTimeMillis();
  job.waitForCompletion(true);
  LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  return 0;
}
From source file:ivory.preprocess.BuildTermDocVectors2.java
License:Apache License
@SuppressWarnings("unchecked") public int runTool() throws Exception { Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); String collectionName = conf.get(Constants.CollectionName); String collectionPath = conf.get(Constants.CollectionPath); String inputFormat = conf.get(Constants.InputFormat); String tokenizer = conf.get(Constants.Tokenizer); String mappingClass = conf.get(Constants.DocnoMappingClass); int docnoOffset = conf.getInt(Constants.DocnoOffset, 0); LOG.info("PowerTool: BuildTermDocVectors2"); LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath)); LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName)); LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath)); LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat)); LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer)); LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass)); LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset)); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { LOG.error("Error, docno mapping data file " + mappingFile + "doesn't exist!"); return 0; }//ww w . ja v a 2s. c om DistributedCache.addCacheFile(mappingFile.toUri(), conf); Path outputPath = new Path(env.getTermDocVectorsDirectory()); if (fs.exists(outputPath)) { LOG.info("TermDocVectors already exist: Skipping!"); return 0; } env.writeCollectionName(collectionName); env.writeCollectionPath(collectionPath); env.writeInputFormat(inputFormat); env.writeDocnoMappingClass(mappingClass); env.writeTokenizerClass(tokenizer); env.writeDocnoOffset(docnoOffset); Job job1 = new Job(conf, "BuildTermDocVectors2:" + collectionName); job1.setJarByClass(BuildTermDocVectors2.class); job1.setNumReduceTasks(0); FileInputFormat.addInputPaths(job1, collectionPath); FileOutputFormat.setOutputPath(job1, outputPath); SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD); job1.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(inputFormat)); job1.setOutputFormatClass(SequenceFileOutputFormat.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(LazyTermDocVector.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(LazyTermDocVector.class); job1.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); job1.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // write out number of postings int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue(); env.writeCollectionDocumentCount(collectionDocCount); Path dlFile = env.getDoclengthsData(); if (fs.exists(dlFile)) { LOG.info("DocLength data exists: Skipping!"); return 0; } conf.setInt(Constants.CollectionDocumentCount, collectionDocCount); conf.set(InputPath, env.getDoclengthsDirectory().toString()); conf.set(DocLengthDataFile, dlFile.toString()); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); LOG.info("Writing doc length data to " + dlFile + "..."); Job job2 = new Job(conf, "DocLengthTable2:" + collectionName); job2.setJarByClass(BuildTermDocVectors2.class); job2.setNumReduceTasks(0); job2.setInputFormatClass(NullInputFormat.class); job2.setOutputFormatClass(NullOutputFormat.class); job2.setMapperClass(DocLengthDataWriterMapper.class); startTime = System.currentTimeMillis(); job2.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue(); env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount); return 0; }
From source file:nthu.scopelab.tsqr.ssvd.VJob.java
License:Apache License
public void start(Configuration conf, Path inputPathBt, Path inputUHatPath, Path inputSigmaPath,
    Path outputPath, int k, int numReduceTasks, int subRowSize, boolean vHalfSigma, int mis)
    throws ClassNotFoundException, InterruptedException, IOException {
  job = new Job(conf);
  job.setJobName("V-job");
  job.setJarByClass(VJob.class);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.setInputPaths(job, inputPathBt);

  FileSystem fs = FileSystem.get(job.getConfiguration());
  fileGather fgather = new fileGather(inputPathBt, "", fs);
  mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
  FileInputFormat.setMaxInputSplitSize(job, mis * 1024 * 1024);

  FileOutputFormat.setOutputPath(job, outputPath);

  // Warn: tight hadoop integration here:
  job.getConfiguration().set("mapreduce.output.basename", OUTPUT_V);

  FileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
  SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(LMatrixWritable.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(LMatrixWritable.class);

  job.setMapperClass(VMapper.class);

  job.getConfiguration().set(PROP_UHAT_PATH, inputUHatPath.toString());
  job.getConfiguration().set(PROP_SIGMA_PATH, inputSigmaPath.toString());
  if (vHalfSigma) {
    job.getConfiguration().set(PROP_V_HALFSIGMA, "y");
  }
  job.getConfiguration().setInt(QJob.PROP_K, k);
  job.getConfiguration().setInt(SUB_ROW_SIZE, subRowSize);

  job.setNumReduceTasks(0);
  job.submit();
  //job.waitForCompletion(true);
}
From source file:org.apache.accumulo.examples.wikisearch.ingest.WikipediaPartitionedIngester.java
License:Apache License
private int runPartitionerJob() throws Exception {
  Job partitionerJob = new Job(getConf(), "Partition Wikipedia");
  Configuration partitionerConf = partitionerJob.getConfiguration();
  partitionerConf.set("mapred.map.tasks.speculative.execution", "false");

  configurePartitionerJob(partitionerJob);

  List<Path> inputPaths = new ArrayList<>();
  SortedSet<String> languages = new TreeSet<>();
  FileSystem fs = FileSystem.get(partitionerConf);
  Path parent = new Path(partitionerConf.get("wikipedia.input"));
  listFiles(parent, fs, inputPaths, languages);

  System.out.println("Input files in " + parent + ":" + inputPaths.size());
  Path[] inputPathsArray = new Path[inputPaths.size()];
  inputPaths.toArray(inputPathsArray);

  System.out.println("Languages:" + languages.size());

  // setup input format
  WikipediaInputFormat.setInputPaths(partitionerJob, inputPathsArray);

  partitionerJob.setMapperClass(WikipediaPartitioner.class);
  partitionerJob.setNumReduceTasks(0);

  // setup output format
  partitionerJob.setMapOutputKeyClass(Text.class);
  partitionerJob.setMapOutputValueClass(Article.class);
  partitionerJob.setOutputKeyClass(Text.class);
  partitionerJob.setOutputValueClass(Article.class);
  partitionerJob.setOutputFormatClass(SequenceFileOutputFormat.class);
  Path outputDir = WikipediaConfiguration.getPartitionedArticlesPath(partitionerConf);
  SequenceFileOutputFormat.setOutputPath(partitionerJob, outputDir);
  SequenceFileOutputFormat.setCompressOutput(partitionerJob, true);
  SequenceFileOutputFormat.setOutputCompressionType(partitionerJob, CompressionType.RECORD);

  return partitionerJob.waitForCompletion(true) ? 0 : 1;
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.examples.HadoopWordCount2.java
License:Apache License
/**
 * Sets task classes with related info if needed into configuration object.
 *
 * @param job Configuration to change.
 * @param setMapper Option to set mapper and input format classes.
 * @param setCombiner Option to set combiner class.
 * @param setReducer Option to set reducer and output format classes.
 * @param outputCompression Option to enable compressed SequenceFile output.
 */
public static void setTasksClasses(Job job, boolean setMapper, boolean setCombiner, boolean setReducer,
    boolean outputCompression) {
  if (setMapper) {
    job.setMapperClass(HadoopWordCount2Mapper.class);
    job.setInputFormatClass(TextInputFormat.class);
  }

  if (setCombiner)
    job.setCombinerClass(HadoopWordCount2Combiner.class);

  if (setReducer) {
    job.setReducerClass(HadoopWordCount2Reducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
  }

  if (outputCompression) {
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    SequenceFileOutputFormat.setCompressOutput(job, true);

    job.getConfiguration().set(FileOutputFormat.COMPRESS_CODEC, SnappyCodec.class.getName());
  }
}
From source file:org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java
License:Apache License
/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF triples
 *
 * @param config Configuration
 * @param inputPaths Input paths
 * @param intermediateOutputPath Intermediate output path
 * @param outputPath Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getTripleCharacteristicSetJobs(Configuration config, String[] inputPaths,
    String intermediateOutputPath, String outputPath) throws IOException {
  Job[] jobs = new Job[2];

  Job job = Job.getInstance(config);
  job.setJarByClass(JobFactory.class);
  job.setJobName("RDF Triples Characteristic Set (Generation)");

  // Map/Reduce classes
  job.setMapperClass(TripleGroupBySubjectMapper.class);
  job.setMapOutputKeyClass(NodeWritable.class);
  job.setMapOutputValueClass(TripleWritable.class);
  job.setReducerClass(TripleCharacteristicSetGeneratingReducer.class);
  job.setOutputKeyClass(CharacteristicSetWritable.class);
  job.setOutputValueClass(NullWritable.class);

  // Input and Output
  job.setInputFormatClass(TriplesInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
  FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
  SequenceFileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
  SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  jobs[0] = job;

  job = Job.getInstance(config);
  job.setJarByClass(JobFactory.class);
  job.setJobName("RDF Triples Characteristic Set (Reduction)");

  // Map/Reduce classes
  job.setMapperClass(KeyMapper.class);
  job.setMapOutputKeyClass(CharacteristicSetWritable.class);
  job.setMapOutputValueClass(CharacteristicSetWritable.class);
  job.setReducerClass(CharacteristicSetReducer.class);
  job.setOutputKeyClass(CharacteristicSetWritable.class);
  job.setOutputValueClass(CharacteristicSetWritable.class);

  // Input and Output
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  FileInputFormat.setInputPaths(job, intermediateOutputPath);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  jobs[1] = job;
  return jobs;
}
From source file:org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java
License:Apache License
/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF quads
 *
 * @param config Configuration
 * @param inputPaths Input paths
 * @param intermediateOutputPath Intermediate output path
 * @param outputPath Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getQuadCharacteristicSetJobs(Configuration config, String[] inputPaths,
    String intermediateOutputPath, String outputPath) throws IOException {
  Job[] jobs = new Job[2];

  Job job = Job.getInstance(config);
  job.setJarByClass(JobFactory.class);
  job.setJobName("RDF Quads Characteristic Set (Generation)");

  // Map/Reduce classes
  job.setMapperClass(QuadGroupBySubjectMapper.class);
  job.setMapOutputKeyClass(NodeWritable.class);
  job.setMapOutputValueClass(QuadWritable.class);
  job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
  job.setOutputKeyClass(CharacteristicSetWritable.class);
  job.setOutputValueClass(NullWritable.class);

  // Input and Output
  job.setInputFormatClass(QuadsInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
  FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
  SequenceFileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
  SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  jobs[0] = job;

  job = Job.getInstance(config);
  job.setJarByClass(JobFactory.class);
  job.setJobName("RDF Quads Characteristic Set (Reduction)");

  // Map/Reduce classes
  job.setMapperClass(KeyMapper.class);
  job.setMapOutputKeyClass(CharacteristicSetWritable.class);
  job.setMapOutputValueClass(CharacteristicSetWritable.class);
  job.setReducerClass(CharacteristicSetReducer.class);
  job.setOutputKeyClass(CharacteristicSetWritable.class);
  job.setOutputValueClass(CharacteristicSetWritable.class);

  // Input and Output
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  FileInputFormat.setInputPaths(job, intermediateOutputPath);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  jobs[1] = job;
  return jobs;
}
From source file:org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java
License:Apache License
/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF triple and/or quad inputs
 *
 * @param config Configuration
 * @param inputPaths Input paths
 * @param intermediateOutputPath Intermediate output path
 * @param outputPath Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getCharacteristicSetJobs(Configuration config, String[] inputPaths,
    String intermediateOutputPath, String outputPath) throws IOException {
  Job[] jobs = new Job[2];

  Job job = Job.getInstance(config);
  job.setJarByClass(JobFactory.class);
  job.setJobName("RDF Characteristic Set (Generation)");

  // Map/Reduce classes
  job.setMapperClass(QuadGroupBySubjectMapper.class);
  job.setMapOutputKeyClass(NodeWritable.class);
  job.setMapOutputValueClass(QuadWritable.class);
  job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
  job.setOutputKeyClass(CharacteristicSetWritable.class);
  job.setOutputValueClass(NullWritable.class);

  // Input and Output
  job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
  FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
  SequenceFileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
  SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  jobs[0] = job;

  job = Job.getInstance(config);
  job.setJarByClass(JobFactory.class);
  job.setJobName("RDF Characteristic Set (Reduction)");

  // Map/Reduce classes
  job.setMapperClass(KeyMapper.class);
  job.setMapOutputKeyClass(CharacteristicSetWritable.class);
  job.setMapOutputValueClass(CharacteristicSetWritable.class);
  job.setReducerClass(CharacteristicSetReducer.class);
  job.setOutputKeyClass(CharacteristicSetWritable.class);
  job.setOutputValueClass(CharacteristicSetWritable.class);

  // Input and Output
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  FileInputFormat.setInputPaths(job, intermediateOutputPath);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  jobs[1] = job;
  return jobs;
}