List of usage examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat#setOutputCompressorClass
public static void setOutputCompressorClass(Job job, Class<? extends CompressionCodec> codecClass)
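Before the project examples below, here is a minimal, self-contained sketch of the typical call pattern: point the job at input and output paths, then enable compression on the final output and pick the codec. The class name and path arguments are illustrative placeholders, not taken from any project below; the FileOutputFormat, TextOutputFormat, and GzipCodec calls are the standard Hadoop 2.x API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class CompressedOutputExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "compressed-output-example");
        job.setJarByClass(CompressedOutputExample.class);

        // No mapper/reducer set: Hadoop's identity defaults pass
        // (offset, line) pairs through to the output unchanged.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Compress the final output with gzip.
        FileOutputFormat.setCompressOutput(job, true); // explicit for clarity
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that setOutputCompressorClass itself calls setCompressOutput(job, true), so the explicit call is redundant but common for readability. Swapping in BZip2Codec or DefaultCodec, as several examples below do, changes only the codec argument.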
From source file:org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java
License:Apache License
/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF quads
 *
 * @param config
 *            Configuration
 * @param inputPaths
 *            Input paths
 * @param intermediateOutputPath
 *            Intermediate output path
 * @param outputPath
 *            Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getQuadCharacteristicSetJobs(Configuration config, String[] inputPaths,
        String intermediateOutputPath, String outputPath) throws IOException {
    Job[] jobs = new Job[2];

    Job job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Quads Characteristic Set (Generation)");

    // Map/Reduce classes
    job.setMapperClass(QuadGroupBySubjectMapper.class);
    job.setMapOutputKeyClass(NodeWritable.class);
    job.setMapOutputValueClass(QuadWritable.class);
    job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // Input and Output
    job.setInputFormatClass(QuadsInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
    FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
    SequenceFileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    jobs[0] = job;

    job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Quads Characteristic Set (Reduction)");

    // Map/Reduce classes
    job.setMapperClass(KeyMapper.class);
    job.setMapOutputKeyClass(CharacteristicSetWritable.class);
    job.setMapOutputValueClass(CharacteristicSetWritable.class);
    job.setReducerClass(CharacteristicSetReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(CharacteristicSetWritable.class);

    // Input and Output
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, intermediateOutputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    jobs[1] = job;
    return jobs;
}
From source file:org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java
License:Apache License
/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF triple and/or quad inputs
 *
 * @param config
 *            Configuration
 * @param inputPaths
 *            Input paths
 * @param intermediateOutputPath
 *            Intermediate output path
 * @param outputPath
 *            Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getCharacteristicSetJobs(Configuration config, String[] inputPaths,
        String intermediateOutputPath, String outputPath) throws IOException {
    Job[] jobs = new Job[2];

    Job job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Characteristic Set (Generation)");

    // Map/Reduce classes
    job.setMapperClass(QuadGroupBySubjectMapper.class);
    job.setMapOutputKeyClass(NodeWritable.class);
    job.setMapOutputValueClass(QuadWritable.class);
    job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // Input and Output
    job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
    FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
    SequenceFileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    jobs[0] = job;

    job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Characteristic Set (Reduction)");

    // Map/Reduce classes
    job.setMapperClass(KeyMapper.class);
    job.setMapOutputKeyClass(CharacteristicSetWritable.class);
    job.setMapOutputValueClass(CharacteristicSetWritable.class);
    job.setReducerClass(CharacteristicSetReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(CharacteristicSetWritable.class);

    // Input and Output
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, intermediateOutputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    jobs[1] = job;
    return jobs;
}
From source file:org.apache.mahout.cf.taste.hadoop.pseudo.RecommenderJob.java
License:Apache License
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption("recommenderClassName", "r", "Name of recommender class to instantiate");
    addOption("numRecommendations", "n", "Number of recommendations per user", "10");
    addOption("usersFile", "u", "File of users to compute recommendations for", null);
    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path inputFile = getInputPath();
    Path outputPath = getOutputPath();
    Path usersFile = parsedArgs.get("--usersFile") == null ? inputFile
            : new Path(parsedArgs.get("--usersFile"));

    String recommendClassName = parsedArgs.get("--recommenderClassName");
    int recommendationsPerUser = Integer.parseInt(parsedArgs.get("--numRecommendations"));

    Job job = prepareJob(usersFile, outputPath, TextInputFormat.class, UserIDsMapper.class,
            VarLongWritable.class, NullWritable.class, RecommenderReducer.class,
            VarLongWritable.class, RecommendedItemsWritable.class, TextOutputFormat.class);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    Configuration jobConf = job.getConfiguration();
    jobConf.set(RecommenderReducer.RECOMMENDER_CLASS_NAME, recommendClassName);
    jobConf.setInt(RecommenderReducer.RECOMMENDATIONS_PER_USER, recommendationsPerUser);
    jobConf.set(RecommenderReducer.DATA_MODEL_FILE, inputFile.toString());

    job.waitForCompletion(true);
    return 0;
}
From source file:org.apache.mahout.cf.taste.hadoop.slopeone.SlopeOneAverageDiffsJob.java
License:Apache License
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path prefsFile = getInputPath();
    Path outputPath = getOutputPath();
    Path averagesOutputPath = new Path(parsedArgs.get("--tempDir"));

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job prefsToDiffsJob = prepareJob(prefsFile, averagesOutputPath, TextInputFormat.class,
                ToItemPrefsMapper.class, VarLongWritable.class, EntityPrefWritable.class,
                SlopeOnePrefsToDiffsReducer.class, EntityEntityWritable.class, FloatWritable.class,
                SequenceFileOutputFormat.class);
        prefsToDiffsJob.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job diffsToAveragesJob = prepareJob(averagesOutputPath, outputPath,
                SequenceFileInputFormat.class, Mapper.class, EntityEntityWritable.class,
                FloatWritable.class, SlopeOneDiffsToAveragesReducer.class,
                EntityEntityWritable.class, FloatWritable.class, TextOutputFormat.class);
        FileOutputFormat.setOutputCompressorClass(diffsToAveragesJob, GzipCodec.class);
        diffsToAveragesJob.waitForCompletion(true);
    }
    return 0;
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.BBtJob.java
License:Apache License
public static void run(Configuration conf, Path btPath, Path outputPath, int numReduceTasks)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(conf);
    job.setJobName("BBt-job");
    job.setJarByClass(BBtJob.class);

    // input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, btPath);

    // map
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setMapperClass(BBtMapper.class);
    job.setReducerClass(BBtReducer.class);

    // combiner and reducer
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    // output
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BBT);

    // run
    job.submit();
    job.waitForCompletion(false);
    if (!job.isSuccessful()) {
        throw new IOException("BBt job failed.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.BtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPathA, Path inputPathQJob, Path xiPath,
        Path outputPath, int minSplitSize, int k, int p, int btBlockHeight, int numReduceTasks,
        boolean broadcast, Class<? extends Writable> labelClass, boolean outputBBtProducts)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);
    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_Q,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, labelClass, VectorWritable.class);

    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_BBT,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
        /*
         * MAHOUT-1067: if we are asked to output BBT products then named vector
         * names should be propagated to Q too so that UJob could pick them up
         * from there.
         */
        oldApiJob.setBoolean(PROP_NV, true);
    }
    if (xiPath != null) {
        // compute PCA-related stuff as well
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SQ,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SB,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class,
                VectorWritable.class);
    }

    /*
     * HACK: we use old api multiple outputs since they are not available in the
     * new api of either 0.20.2 or 0.20.203 but wrap it into a new api job so we
     * can use new api interfaces.
     */
    Job job = new Job(oldApiJob);
    job.setJobName("Bt-job");
    job.setJarByClass(BtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathA);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);

    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    job.getConfiguration().setInt(QJob.PROP_K, k);
    job.getConfiguration().setInt(QJob.PROP_P, p);
    job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
    job.getConfiguration().setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);

    job.setNumReduceTasks(numReduceTasks);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
    }

    /*
     * We can broadcast Rhat files since all of them are required by each job,
     * but not Q files which correspond to splits of A (so each split of A will
     * require only a particular Q file, each time a different one).
     */
    if (broadcast) {
        job.getConfiguration().set(PROP_RHAT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputPathQJob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(new Path(inputPathQJob, QJob.OUTPUT_RHAT + "-*"));
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * new api is not enabled yet in our dependencies at this time,
                 * still using deprecated one
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Bt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.QJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPaths, Path sbPath, Path outputPath,
        int aBlockRows, int minSplitSize, int k, int p, long seed, int numReduceTasks)
        throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);
    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_QHAT,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, SplitPartitionedWritable.class,
            DenseBlockWritable.class);
    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_RHAT,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, SplitPartitionedWritable.class,
            VectorWritable.class);

    Job job = new Job(oldApiJob);
    job.setJobName("Q-job");
    job.setJarByClass(QJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(QMapper.class);

    job.getConfiguration().setInt(PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setLong(PROP_OMEGA_SEED, seed);
    job.getConfiguration().setInt(PROP_K, k);
    job.getConfiguration().setInt(PROP_P, p);
    if (sbPath != null) {
        job.getConfiguration().set(PROP_SB_PATH, sbPath.toString());
    }

    /*
     * number of reduce tasks doesn't matter. we don't actually send anything to
     * reducers.
     */
    job.setNumReduceTasks(0 /* numReduceTasks */);

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Q job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.UJob.java
License:Apache License
public void run(Configuration conf, Path inputPathQ, Path inputUHatPath, Path sigmaPath,
        Path outputPath, int k, int numReduceTasks, Class<? extends Writable> labelClass,
        SSVDSolver.OutputScalingEnum outputScaling)
        throws ClassNotFoundException, InterruptedException, IOException {

    job = new Job(conf);
    job.setJobName("U-job");
    job.setJarByClass(UJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathQ);
    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_U);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapperClass(UMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(labelClass);
    job.setOutputValueClass(VectorWritable.class);

    job.getConfiguration().set(PROP_UHAT_PATH, inputUHatPath.toString());
    job.getConfiguration().set(PROP_SIGMA_PATH, sigmaPath.toString());
    job.getConfiguration().set(PROP_OUTPUT_SCALING, outputScaling.name());
    job.getConfiguration().setInt(PROP_K, k);
    job.setNumReduceTasks(0);
    job.submit();
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.VJob.java
License:Apache License
/**
 * @param conf
 * @param inputPathBt
 * @param xiPath
 *          PCA row mean (MAHOUT-817, to fix B')
 * @param sqPath
 *          sq (MAHOUT-817, to fix B')
 * @param inputUHatPath
 * @param inputSigmaPath
 * @param outputPath
 * @param k
 * @param numReduceTasks
 * @param outputScaling
 *          output scaling: apply Sigma, or Sigma^0.5, or none
 * @throws ClassNotFoundException
 * @throws InterruptedException
 * @throws IOException
 */
public void run(Configuration conf, Path inputPathBt, Path xiPath, Path sqPath, Path inputUHatPath,
        Path inputSigmaPath, Path outputPath, int k, int numReduceTasks,
        SSVDSolver.OutputScalingEnum outputScaling)
        throws ClassNotFoundException, InterruptedException, IOException {

    job = new Job(conf);
    job.setJobName("V-job");
    job.setJarByClass(VJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathBt);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Warn: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_V);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(VMapper.class);

    job.getConfiguration().set(PROP_UHAT_PATH, inputUHatPath.toString());
    job.getConfiguration().set(PROP_SIGMA_PATH, inputSigmaPath.toString());
    job.getConfiguration().set(PROP_OUTPUT_SCALING, outputScaling.name());
    job.getConfiguration().setInt(PROP_K, k);
    job.setNumReduceTasks(0);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
        job.getConfiguration().set(PROP_SQ_PATH, sqPath.toString());
    }

    job.submit();
}
From source file:org.apache.pig.builtin.PigStorage.java
License:Apache License
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set(MRConfiguration.TEXTOUTPUTFORMAT_SEPARATOR, "");
    FileOutputFormat.setOutputPath(job, new Path(location));

    if ("true".equals(job.getConfiguration().get("output.compression.enabled"))) {
        FileOutputFormat.setCompressOutput(job, true);
        String codec = job.getConfiguration().get("output.compression.codec");
        try {
            FileOutputFormat.setOutputCompressorClass(job,
                    (Class<? extends CompressionCodec>) Class.forName(codec));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Class not found: " + codec);
        }
    } else {
        // This makes it so that storing to a directory ending with ".gz" or ".bz2" works.
        setCompression(new Path(location), job);
    }
}