Usage examples for org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.setOutputCompressionType
public static void setOutputCompressionType(Job job, CompressionType style)
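The entries below are real call sites collected from open-source projects. For orientation, here is a minimal, self-contained driver sketching the usual pattern: select SequenceFileOutputFormat, enable compression, choose the compression granularity, and pick a codec. The class name and the argument-based paths are placeholders for illustration, not taken from any of the sources below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

// Hypothetical driver class; the paths in args are placeholders.
public class CompressedSeqFileDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "compressed-seqfile-example");
    job.setJarByClass(CompressedSeqFileDriver.class);

    // Write the job output as a SequenceFile.
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // Enable compression, then choose how it is applied:
    // RECORD compresses each value individually; BLOCK batches
    // many records before compressing.
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);

    // With the default TextInputFormat and identity mapper/reducer,
    // the output records are (LongWritable, Text) pairs.
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));   // placeholder input
    FileOutputFormat.setOutputPath(job, new Path(args[1]));  // placeholder output
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

RECORD compresses each value on its own, while BLOCK compresses groups of records together, which typically yields a better compression ratio at the cost of coarser random access; the examples below use both.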
From source file:ivory.core.preprocess.BuildIntDocVectors.java
License:Apache License
public int runTool() throws Exception {
  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  String indexPath = conf.get(Constants.IndexPath);
  RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
  String collectionName = env.readCollectionName();

  LOG.info("PowerTool: " + BuildIntDocVectors.class.getCanonicalName());
  LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
  LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));

  String termsFile = env.getIndexTermsData();
  String termIDsFile = env.getIndexTermIdsData();
  String idToTermFile = env.getIndexTermIdMappingData();

  Path termsFilePath = new Path(termsFile);
  Path termIDsFilePath = new Path(termIDsFile);
  if (!fs.exists(termsFilePath) || !fs.exists(termIDsFilePath)) {
    LOG.error("Error, terms files don't exist!");
    return 0;
  }

  Path outputPath = new Path(env.getIntDocVectorsDirectory());
  if (fs.exists(outputPath)) {
    LOG.info("IntDocVectors already exist: skipping!");
    return 0;
  }

  DistributedCache.addCacheFile(new URI(termsFile), conf);
  DistributedCache.addCacheFile(new URI(termIDsFile), conf);
  DistributedCache.addCacheFile(new URI(idToTermFile), conf);

  conf.set("mapred.child.java.opts", "-Xmx2048m");

  Job job = new Job(conf, BuildIntDocVectors.class.getSimpleName() + ":" + collectionName);
  job.setJarByClass(BuildIntDocVectors.class);
  job.setNumReduceTasks(0);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.setInputPaths(job, env.getTermDocVectorsDirectory());
  FileOutputFormat.setOutputPath(job, outputPath);
  SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);

  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(LazyIntDocVector.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(LazyIntDocVector.class);

  job.setMapperClass(MyMapper.class);

  long startTime = System.currentTimeMillis();
  job.waitForCompletion(true);
  LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  return 0;
}
From source file:ivory.core.preprocess.BuildTermDocVectors.java
License:Apache License
@SuppressWarnings("unchecked") public int runTool() throws Exception { Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); String collectionName = conf.get(Constants.CollectionName); String collectionPath = conf.get(Constants.CollectionPath); String inputFormat = conf.get(Constants.InputFormat); String tokenizer = conf.get(Constants.Tokenizer); String mappingClass = conf.get(Constants.DocnoMappingClass); int docnoOffset = conf.getInt(Constants.DocnoOffset, 0); int numReducers = conf.getInt(Constants.TermDocVectorSegments, 0); LOG.info("PowerTool: " + BuildTermDocVectors.class.getCanonicalName()); LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath)); LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName)); LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath)); LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat)); LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer)); LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass)); LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset)); LOG.info(String.format(" - %s: %s", Constants.TermDocVectorSegments, numReducers)); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { LOG.error("Error, docno mapping data file " + mappingFile + " doesn't exist!"); return 0; }/*from w w w .ja v a 2s. c o m*/ DistributedCache.addCacheFile(mappingFile.toUri(), conf); Path outputPath = new Path(env.getTermDocVectorsDirectory()); if (fs.exists(outputPath)) { LOG.info("TermDocVectors already exist: Skipping!"); return 0; } env.writeCollectionName(collectionName); env.writeCollectionPath(collectionPath); env.writeInputFormat(inputFormat); env.writeDocnoMappingClass(mappingClass); env.writeTokenizerClass(tokenizer); env.writeDocnoOffset(docnoOffset); conf.set("mapred.child.java.opts", "-Xmx2048m"); Job job1 = new Job(conf, BuildTermDocVectors.class.getSimpleName() + ":" + collectionName); job1.setJarByClass(BuildTermDocVectors.class); job1.setNumReduceTasks(numReducers); FileInputFormat.addInputPaths(job1, collectionPath); FileOutputFormat.setOutputPath(job1, outputPath); SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD); job1.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat)); job1.setOutputFormatClass(SequenceFileOutputFormat.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(LazyTermDocVector.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(LazyTermDocVector.class); job1.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); job1.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // Write out number of postings. 
int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue(); env.writeCollectionDocumentCount(collectionDocCount); Path dlFile = env.getDoclengthsData(); if (fs.exists(dlFile)) { LOG.info("DocLength data exists: Skipping!"); return 0; } conf.setInt(Constants.CollectionDocumentCount, collectionDocCount); conf.set(InputPath, env.getDoclengthsDirectory().toString()); conf.set(DocLengthDataFile, dlFile.toString()); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); LOG.info("Writing doc length data to " + dlFile + "..."); Job job2 = new Job(conf, "DocLengthTable:" + collectionName); job2.setJarByClass(BuildTermDocVectors.class); job2.setNumReduceTasks(0); job2.setInputFormatClass(NullInputFormat.class); job2.setOutputFormatClass(NullOutputFormat.class); job2.setMapperClass(DocLengthDataWriterMapper.class); startTime = System.currentTimeMillis(); job2.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue(); env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount); return 0; }
From source file:ivory.preprocess.BuildIntDocVectors2.java
License:Apache License
public int runTool() throws Exception {
  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  String indexPath = conf.get(Constants.IndexPath);
  RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
  String collectionName = env.readCollectionName();

  LOG.info("PowerTool: BuildIntDocVectors2");
  LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
  LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));

  String termsFile = env.getIndexTermsData();
  String termIDsFile = env.getIndexTermIdsData();
  String idToTermFile = env.getIndexTermIdMappingData();

  Path termsFilePath = new Path(termsFile);
  Path termIDsFilePath = new Path(termIDsFile);
  if (!fs.exists(termsFilePath) || !fs.exists(termIDsFilePath)) {
    LOG.error("Error, terms files don't exist!");
    return 0;
  }

  Path outputPath = new Path(env.getIntDocVectorsDirectory());
  if (fs.exists(outputPath)) {
    LOG.info("IntDocVectors already exist: skipping!");
    return 0;
  }

  DistributedCache.addCacheFile(new URI(termsFile), conf);
  DistributedCache.addCacheFile(new URI(termIDsFile), conf);
  DistributedCache.addCacheFile(new URI(idToTermFile), conf);

  conf.set("mapred.child.java.opts", "-Xmx2048m");
  //conf.set("mapreduce.map.java.opts", "-Xmx2048m");

  Job job = new Job(conf, "BuildIntDocVectors2:" + collectionName);
  job.setJarByClass(BuildIntDocVectors2.class);
  job.setNumReduceTasks(0);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.setInputPaths(job, env.getTermDocVectorsDirectory());
  FileOutputFormat.setOutputPath(job, outputPath);
  SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);

  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(LazyIntDocVector.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(LazyIntDocVector.class);

  job.setMapperClass(MyMapper.class);

  long startTime = System.currentTimeMillis();
  job.waitForCompletion(true);
  LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  return 0;
}
From source file:ivory.preprocess.BuildTermDocVectors2.java
License:Apache License
@SuppressWarnings("unchecked") public int runTool() throws Exception { Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); String collectionName = conf.get(Constants.CollectionName); String collectionPath = conf.get(Constants.CollectionPath); String inputFormat = conf.get(Constants.InputFormat); String tokenizer = conf.get(Constants.Tokenizer); String mappingClass = conf.get(Constants.DocnoMappingClass); int docnoOffset = conf.getInt(Constants.DocnoOffset, 0); LOG.info("PowerTool: BuildTermDocVectors2"); LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath)); LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName)); LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath)); LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat)); LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer)); LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass)); LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset)); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { LOG.error("Error, docno mapping data file " + mappingFile + "doesn't exist!"); return 0; }//ww w . ja v a 2s. c om DistributedCache.addCacheFile(mappingFile.toUri(), conf); Path outputPath = new Path(env.getTermDocVectorsDirectory()); if (fs.exists(outputPath)) { LOG.info("TermDocVectors already exist: Skipping!"); return 0; } env.writeCollectionName(collectionName); env.writeCollectionPath(collectionPath); env.writeInputFormat(inputFormat); env.writeDocnoMappingClass(mappingClass); env.writeTokenizerClass(tokenizer); env.writeDocnoOffset(docnoOffset); Job job1 = new Job(conf, "BuildTermDocVectors2:" + collectionName); job1.setJarByClass(BuildTermDocVectors2.class); job1.setNumReduceTasks(0); FileInputFormat.addInputPaths(job1, collectionPath); FileOutputFormat.setOutputPath(job1, outputPath); SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD); job1.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(inputFormat)); job1.setOutputFormatClass(SequenceFileOutputFormat.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(LazyTermDocVector.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(LazyTermDocVector.class); job1.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); job1.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); // write out number of postings int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue(); env.writeCollectionDocumentCount(collectionDocCount); Path dlFile = env.getDoclengthsData(); if (fs.exists(dlFile)) { LOG.info("DocLength data exists: Skipping!"); return 0; } conf.setInt(Constants.CollectionDocumentCount, collectionDocCount); conf.set(InputPath, env.getDoclengthsDirectory().toString()); conf.set(DocLengthDataFile, dlFile.toString()); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); LOG.info("Writing doc length data to " + dlFile + "..."); Job job2 = new Job(conf, "DocLengthTable2:" + collectionName); job2.setJarByClass(BuildTermDocVectors2.class); job2.setNumReduceTasks(0); job2.setInputFormatClass(NullInputFormat.class); job2.setOutputFormatClass(NullOutputFormat.class); job2.setMapperClass(DocLengthDataWriterMapper.class); startTime = System.currentTimeMillis(); job2.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue(); env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount); return 0; }
From source file:nthu.scopelab.tsqr.ssvd.VJob.java
License:Apache License
public void start(Configuration conf, Path inputPathBt, Path inputUHatPath, Path inputSigmaPath,
    Path outputPath, int k, int numReduceTasks, int subRowSize, boolean vHalfSigma, int mis)
    throws ClassNotFoundException, InterruptedException, IOException {
  job = new Job(conf);
  job.setJobName("V-job");
  job.setJarByClass(VJob.class);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  FileInputFormat.setInputPaths(job, inputPathBt);

  FileSystem fs = FileSystem.get(job.getConfiguration());
  fileGather fgather = new fileGather(inputPathBt, "", fs);
  mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
  FileInputFormat.setMaxInputSplitSize(job, mis * 1024 * 1024);

  FileOutputFormat.setOutputPath(job, outputPath);

  // Warn: tight hadoop integration here:
  job.getConfiguration().set("mapreduce.output.basename", OUTPUT_V);

  FileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
  SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(LMatrixWritable.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(LMatrixWritable.class);

  job.setMapperClass(VMapper.class);

  job.getConfiguration().set(PROP_UHAT_PATH, inputUHatPath.toString());
  job.getConfiguration().set(PROP_SIGMA_PATH, inputSigmaPath.toString());
  if (vHalfSigma) {
    job.getConfiguration().set(PROP_V_HALFSIGMA, "y");
  }
  job.getConfiguration().setInt(QJob.PROP_K, k);
  job.getConfiguration().setInt(SUB_ROW_SIZE, subRowSize);

  job.setNumReduceTasks(0);
  job.submit();
  //job.waitForCompletion(true);
}
From source file:org.apache.accumulo.examples.wikisearch.ingest.WikipediaPartitionedIngester.java
License:Apache License
private int runPartitionerJob() throws Exception {
  Job partitionerJob = new Job(getConf(), "Partition Wikipedia");
  Configuration partitionerConf = partitionerJob.getConfiguration();
  partitionerConf.set("mapred.map.tasks.speculative.execution", "false");

  configurePartitionerJob(partitionerJob);

  List<Path> inputPaths = new ArrayList<>();
  SortedSet<String> languages = new TreeSet<>();
  FileSystem fs = FileSystem.get(partitionerConf);
  Path parent = new Path(partitionerConf.get("wikipedia.input"));
  listFiles(parent, fs, inputPaths, languages);

  System.out.println("Input files in " + parent + ":" + inputPaths.size());
  Path[] inputPathsArray = new Path[inputPaths.size()];
  inputPaths.toArray(inputPathsArray);

  System.out.println("Languages:" + languages.size());

  // setup input format
  WikipediaInputFormat.setInputPaths(partitionerJob, inputPathsArray);

  partitionerJob.setMapperClass(WikipediaPartitioner.class);
  partitionerJob.setNumReduceTasks(0);

  // setup output format
  partitionerJob.setMapOutputKeyClass(Text.class);
  partitionerJob.setMapOutputValueClass(Article.class);
  partitionerJob.setOutputKeyClass(Text.class);
  partitionerJob.setOutputValueClass(Article.class);
  partitionerJob.setOutputFormatClass(SequenceFileOutputFormat.class);
  Path outputDir = WikipediaConfiguration.getPartitionedArticlesPath(partitionerConf);
  SequenceFileOutputFormat.setOutputPath(partitionerJob, outputDir);
  SequenceFileOutputFormat.setCompressOutput(partitionerJob, true);
  SequenceFileOutputFormat.setOutputCompressionType(partitionerJob, CompressionType.RECORD);

  return partitionerJob.waitForCompletion(true) ? 0 : 1;
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.examples.HadoopWordCount2.java
License:Apache License
/**
 * Sets task classes with related info if needed into configuration object.
 *
 * @param job Configuration to change.
 * @param setMapper Option to set mapper and input format classes.
 * @param setCombiner Option to set combiner class.
 * @param setReducer Option to set reducer and output format classes.
 * @param outputCompression Option to enable compressed SequenceFile output.
 */
public static void setTasksClasses(Job job, boolean setMapper, boolean setCombiner, boolean setReducer,
    boolean outputCompression) {
  if (setMapper) {
    job.setMapperClass(HadoopWordCount2Mapper.class);
    job.setInputFormatClass(TextInputFormat.class);
  }

  if (setCombiner)
    job.setCombinerClass(HadoopWordCount2Combiner.class);

  if (setReducer) {
    job.setReducerClass(HadoopWordCount2Reducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
  }

  if (outputCompression) {
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    SequenceFileOutputFormat.setCompressOutput(job, true);

    job.getConfiguration().set(FileOutputFormat.COMPRESS_CODEC, SnappyCodec.class.getName());
  }
}
From source file:org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java
License:Apache License
/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF triples
 *
 * @param config Configuration
 * @param inputPaths Input paths
 * @param intermediateOutputPath Intermediate output path
 * @param outputPath Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getTripleCharacteristicSetJobs(Configuration config, String[] inputPaths,
    String intermediateOutputPath, String outputPath) throws IOException {
  Job[] jobs = new Job[2];

  Job job = Job.getInstance(config);
  job.setJarByClass(JobFactory.class);
  job.setJobName("RDF Triples Characteristic Set (Generation)");

  // Map/Reduce classes
  job.setMapperClass(TripleGroupBySubjectMapper.class);
  job.setMapOutputKeyClass(NodeWritable.class);
  job.setMapOutputValueClass(TripleWritable.class);
  job.setReducerClass(TripleCharacteristicSetGeneratingReducer.class);
  job.setOutputKeyClass(CharacteristicSetWritable.class);
  job.setOutputValueClass(NullWritable.class);

  // Input and Output
  job.setInputFormatClass(TriplesInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
  FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
  SequenceFileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
  SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  jobs[0] = job;

  job = Job.getInstance(config);
  job.setJarByClass(JobFactory.class);
  job.setJobName("RDF Triples Characteristic Set (Reduction)");

  // Map/Reduce classes
  job.setMapperClass(KeyMapper.class);
  job.setMapOutputKeyClass(CharacteristicSetWritable.class);
  job.setMapOutputValueClass(CharacteristicSetWritable.class);
  job.setReducerClass(CharacteristicSetReducer.class);
  job.setOutputKeyClass(CharacteristicSetWritable.class);
  job.setOutputValueClass(CharacteristicSetWritable.class);

  // Input and Output
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  FileInputFormat.setInputPaths(job, intermediateOutputPath);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  jobs[1] = job;
  return jobs;
}
From source file:org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java
License:Apache License
/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF quads
 *
 * @param config Configuration
 * @param inputPaths Input paths
 * @param intermediateOutputPath Intermediate output path
 * @param outputPath Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getQuadCharacteristicSetJobs(Configuration config, String[] inputPaths,
    String intermediateOutputPath, String outputPath) throws IOException {
  Job[] jobs = new Job[2];

  Job job = Job.getInstance(config);
  job.setJarByClass(JobFactory.class);
  job.setJobName("RDF Quads Characteristic Set (Generation)");

  // Map/Reduce classes
  job.setMapperClass(QuadGroupBySubjectMapper.class);
  job.setMapOutputKeyClass(NodeWritable.class);
  job.setMapOutputValueClass(QuadWritable.class);
  job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
  job.setOutputKeyClass(CharacteristicSetWritable.class);
  job.setOutputValueClass(NullWritable.class);

  // Input and Output
  job.setInputFormatClass(QuadsInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
  FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
  SequenceFileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
  SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  jobs[0] = job;

  job = Job.getInstance(config);
  job.setJarByClass(JobFactory.class);
  job.setJobName("RDF Quads Characteristic Set (Reduction)");

  // Map/Reduce classes
  job.setMapperClass(KeyMapper.class);
  job.setMapOutputKeyClass(CharacteristicSetWritable.class);
  job.setMapOutputValueClass(CharacteristicSetWritable.class);
  job.setReducerClass(CharacteristicSetReducer.class);
  job.setOutputKeyClass(CharacteristicSetWritable.class);
  job.setOutputValueClass(CharacteristicSetWritable.class);

  // Input and Output
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  FileInputFormat.setInputPaths(job, intermediateOutputPath);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  jobs[1] = job;
  return jobs;
}
From source file:org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java
License:Apache License
/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF triple and/or quad inputs
 *
 * @param config Configuration
 * @param inputPaths Input paths
 * @param intermediateOutputPath Intermediate output path
 * @param outputPath Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getCharacteristicSetJobs(Configuration config, String[] inputPaths,
    String intermediateOutputPath, String outputPath) throws IOException {
  Job[] jobs = new Job[2];

  Job job = Job.getInstance(config);
  job.setJarByClass(JobFactory.class);
  job.setJobName("RDF Characteristic Set (Generation)");

  // Map/Reduce classes
  job.setMapperClass(QuadGroupBySubjectMapper.class);
  job.setMapOutputKeyClass(NodeWritable.class);
  job.setMapOutputValueClass(QuadWritable.class);
  job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
  job.setOutputKeyClass(CharacteristicSetWritable.class);
  job.setOutputValueClass(NullWritable.class);

  // Input and Output
  job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
  FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
  SequenceFileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
  SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  jobs[0] = job;

  job = Job.getInstance(config);
  job.setJarByClass(JobFactory.class);
  job.setJobName("RDF Characteristic Set (Reduction)");

  // Map/Reduce classes
  job.setMapperClass(KeyMapper.class);
  job.setMapOutputKeyClass(CharacteristicSetWritable.class);
  job.setMapOutputValueClass(CharacteristicSetWritable.class);
  job.setReducerClass(CharacteristicSetReducer.class);
  job.setOutputKeyClass(CharacteristicSetWritable.class);
  job.setOutputValueClass(CharacteristicSetWritable.class);

  // Input and Output
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  FileInputFormat.setInputPaths(job, intermediateOutputPath);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  jobs[1] = job;
  return jobs;
}