List of usage examples for org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput
public static void setCompressOutput(JobConf conf, boolean compress)
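Before the examples, here is a minimal sketch of the common pattern they all share: pair setCompressOutput with setOutputPath and, optionally, setOutputCompressorClass. The driver class MyJob below is hypothetical and not taken from any of the source files that follow; the old mapred API falls back to identity mapper/reducer when none are set.

// Minimal sketch, assuming a hypothetical MyJob driver class.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class MyJob {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(MyJob.class);
        conf.setJobName("myjob");
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        // enable compression of the job's output files
        FileOutputFormat.setCompressOutput(conf, true);
        // optionally pick the codec; gzip ships with Hadoop
        FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
        JobClient.runJob(conf);
    }
}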
From source file:org.apache.avro.mapred.TestWordCount.java
License:Apache License
@SuppressWarnings("deprecation") public void testJob() throws Exception { JobConf job = new JobConf(); String dir = System.getProperty("test.dir", ".") + "/mapred"; Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesFile();/*from w ww.j a v a 2 s . co m*/ job.setJobName("wordcount"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, new Path(dir + "/in")); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, true); WordCountUtil.setMeta(job); JobClient.runJob(job); WordCountUtil.validateCountsFile(); }
From source file:org.apache.avro.mapred.TestWordCountGeneric.java
License:Apache License
@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");
    JobConf job = new JobConf();
    try {
        WordCountUtil.writeLinesFile();

        job.setJobName("wordcount");

        AvroJob.setInputGeneric(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputGeneric(job, WordCount.SCHEMA$);

        job.setMapperClass(MapImpl.class);
        job.setCombinerClass(ReduceImpl.class);
        job.setReducerClass(ReduceImpl.class);

        FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
        FileOutputFormat.setOutputPath(job, outputPath);
        FileOutputFormat.setCompressOutput(job, true);

        JobClient.runJob(job);

        WordCountUtil.validateCountsFile();
    } finally {
        outputPath.getFileSystem(job).delete(outputPath);
    }
}
From source file:org.apache.avro.mapred.TestWordCountSpecific.java
License:Apache License
@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    JobConf job = new JobConf();
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");
    try {
        WordCountUtil.writeLinesFile();

        job.setJobName("wordcount");

        AvroJob.setInputSpecific(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputSpecific(job, WordCount.SCHEMA$);

        job.setMapperClass(MapImpl.class);
        job.setCombinerClass(ReduceImpl.class);
        job.setReducerClass(ReduceImpl.class);

        FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
        FileOutputFormat.setOutputPath(job, outputPath);
        FileOutputFormat.setCompressOutput(job, true);

        JobClient.runJob(job);

        WordCountUtil.validateCountsFile();
    } finally {
        outputPath.getFileSystem(job).delete(outputPath);
    }
}
From source file:org.apache.avro.mapred.tether.TetherOutputFormat.java
License:Apache License
/** Enable output compression using the deflate codec and specify its level. */
public static void setDeflateLevel(JobConf job, int level) {
    FileOutputFormat.setCompressOutput(job, true);
    job.setInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, level);
}
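A hypothetical call site for this helper; the surrounding job setup is assumed and not part of the Avro source. Deflate levels range from 1 (fastest) to 9 (best compression).

// Hypothetical driver snippet: enable deflate-compressed tethered output.
// setDeflateLevel calls FileOutputFormat.setCompressOutput(job, true) internally.
JobConf job = new JobConf();
TetherOutputFormat.setDeflateLevel(job, 6);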
From source file:org.apache.trevni.avro.TestWordCount.java
License:Apache License
public void testOutputFormat() throws Exception {
    JobConf job = new JobConf();

    WordCountUtil wordCountUtil = new WordCountUtil("trevniMapredTest");

    wordCountUtil.writeLinesFile();

    AvroJob.setInputSchema(job, STRING);
    AvroJob.setOutputSchema(job, Pair.getPairSchema(STRING, LONG));

    AvroJob.setMapperClass(job, MapImpl.class);
    AvroJob.setCombinerClass(job, ReduceImpl.class);
    AvroJob.setReducerClass(job, ReduceImpl.class);

    FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/in"));
    FileOutputFormat.setOutputPath(job, new Path(wordCountUtil.getDir().toString() + "/out"));
    FileOutputFormat.setCompressOutput(job, true);

    job.setOutputFormat(AvroTrevniOutputFormat.class);

    JobClient.runJob(job);

    wordCountUtil.validateCountsFile();
}
From source file:org.archive.wayback.hadoop.CDXSort.java
License:Apache License
/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    boolean compressOutput = false;
    boolean dereferenceInputs = false;
    boolean canonicalize = false;
    boolean funkyInput = false;

    JobConf jobConf = new JobConf(getConf(), CDXSort.class);
    jobConf.setJobName("cdxsort");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("--compress-output".equals(args[i])) {
                compressOutput = true;
            } else if ("--funky-input".equals(args[i])) {
                funkyInput = true;
            } else if ("--dereference-inputs".equals(args[i])) {
                dereferenceInputs = true;
            } else if ("--canonicalize".equals(args[i])) {
                canonicalize = true;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Make sure there are exactly 3 parameters left: split input output
    if (otherArgs.size() != 3) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 3.");
        return printUsage();
    }
    String splitPath = otherArgs.get(0);
    String inputPath = otherArgs.get(1);
    String outputPath = otherArgs.get(2);

    // load the split file, find and set the number of reduces
    AlphaPartitioner partitioner = new AlphaPartitioner();
    File localSplitFile = new File(splitPath);
    FileInputStream fis = new FileInputStream(localSplitFile);
    InputStreamReader isr = new InputStreamReader(fis, ByteOp.UTF8);
    BufferedReader bis = new BufferedReader(isr);
    // try {
    //     partitioner.loadBoundaries(bis);
    // } catch (IOException except) {
    //     System.err.println("ERROR: Problem loading file " + splitPath);
    //     return printUsage(); // exits
    // }
    // jobConf.setNumReduceTasks(partitioner.getNumPartitions());
    //
    // // copy the split file into the FS, add to the DistributedCache:
    // // AlphaPartitioner.setPartitionFile(jobConf, localSplitFile);
    // AlphaPartitioner.setSplitCache(jobConf, localSplitFile);
    // System.err.println("uploaded split file to FS and DistributedCache");
    //
    // // Set job configs:
    // jobConf.setInputFormat(TextInputFormat.class);
    // jobConf.setOutputFormat(TextOutputFormat.class);
    // if (canonicalize) {
    //     jobConf.setMapperClass(CDXCanonicalizerMapClass.class);
    // } else {
    //     jobConf.setMapperClass(CDXMapClass.class);
    // }
    // jobConf.setOutputKeyClass(Text.class);
    // jobConf.setOutputValueClass(Text.class);
    // jobConf.set("mapred.textoutputformat.separator", " ");
    // jobConf.setPartitionerClass(AlphaPartitioner.class);

    int inputCount = 0;
    // Set job input:
    if (dereferenceInputs) {
        // SO SLOW... can't add one at a time...
        // FileReader is2 = new FileReader(new File(inputPath));
        // BufferedReader bis2 = new BufferedReader(is2);
        // while (true) {
        //     String line = bis2.readLine();
        //     if (line == null) {
        //         break;
        //     }
        //     FileInputFormat.addInputPath(jobConf, new Path(line));
        //     inputCount++;
        //     System.err.println("Added path(" + inputCount + "): " + line);
        // }

        // PASS 2:
        // FileReader is2 = new FileReader(new File(inputPath));
        // BufferedReader bis2 = new BufferedReader(is2);
        // ArrayList<String> list = new ArrayList<String>();
        // while (true) {
        //     String line = bis2.readLine();
        //     if (line == null) {
        //         break;
        //     }
        //     list.add(line);
        //     inputCount++;
        // }
        // Path arr[] = new Path[list.size()];
        // for (int i = 0; i < list.size(); i++) {
        //     arr[i] = new Path(list.get(i));
        // }
        // FileInputFormat.setInputPaths(jobConf, arr);

        // PASS 3:
        if (funkyInput) {
            jobConf.setMapperClass(FunkyDeReffingCDXCanonicalizerMapClass.class);
        } else {
            jobConf.setMapperClass(DeReffingCDXCanonicalizerMapClass.class);
        }
        FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
        inputCount = 1;
    } else {
        FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
        inputCount = 1;
    }

    // Set job output:
    FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));
    if (compressOutput) {
        FileOutputFormat.setCompressOutput(jobConf, true);
        FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class);
    }

    // System.out.println("Running on " + cluster.getTaskTrackers()
    //     + " nodes, processing " + inputCount + " files/directories"
    //     + " into " + outputPath + " with "
    //     + partitioner.getNumPartitions() + " reduces.");

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file:org.terrier.utility.io.HadoopUtility.java
License:Mozilla Public License
/**
 * Utility method to set JobOutputCompression if possible.
 * In general, I find that JobOutputCompression fails for
 * local job trackers, so this code checks the job tracker
 * location first.
 * @param conf JobConf of job.
 * @return true if JobOutputCompression was set.
 */
public static boolean setJobOutputCompression(JobConf conf) {
    if (!conf.get("mapred.job.tracker").equals("local")) {
        FileOutputFormat.setCompressOutput(conf, true);
        FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
        return true;
    }
    return false;
}
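A hypothetical caller for this utility, showing how its boolean return value might be used; the surrounding job setup is assumed and not part of the Terrier source.

// Hypothetical usage: compression is only enabled when the job is not
// running under the local job tracker, so log which path was taken.
JobConf conf = new JobConf();
if (HadoopUtility.setJobOutputCompression(conf)) {
    System.out.println("Gzip output compression enabled");
} else {
    System.out.println("Local job tracker detected; output compression skipped");
}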
From source file:ronchy.BigramCount.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: BigramCount");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(BigramCount.class);
    conf.setJobName("BigramCount");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    // Note that these must match the Class arguments given in the mapper
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(outputDir.toUri(), conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:sa.edu.kaust.indexing.DemoCountTrecDocuments.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String mappingFile = args[2];

    sLogger.info("input: " + inputPath);
    sLogger.info("output dir: " + outputPath);
    sLogger.info("docno mapping file: " + mappingFile);

    JobConf conf = new JobConf(DemoCountTrecDocuments.class);
    conf.setJobName("DemoCountTrecDocuments");

    conf.setNumReduceTasks(0);

    // Pass in the class name as a String; this makes the mapper general
    // in being able to load any collection of Indexable objects that has
    // docid/docno mapping specified by a DocnoMapping object
    conf.set("DocnoMappingClass", "edu.umd.cloud9.collection.trec.TrecDocnoMapping");

    // put the mapping file in the distributed cache so each map worker will have it
    // DistributedCache.addCacheFile(new URI(mappingFile), conf);
    if (conf.get("mapred.job.tracker").equals("local")) {
        conf.set("DocnoMappingFile", mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), conf);
    }

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(TrecDocumentInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}
From source file:sa.edu.kaust.twitter.preprocess.spam.RemoveTweetsOfSpamUsers.java
License:Apache License
public static int removeTweetsOfSpamUsers(String inputPath, String outputPath, int numReducers,
        String spamUserListFile, long startID, long endID, String nTweetsFile, Boolean spam) throws Exception {
    sLogger.info("input: " + inputPath);
    sLogger.info("output dir: " + outputPath);
    sLogger.info("spam user list file: " + spamUserListFile);

    JobConf conf = new JobConf(RemoveTweetsOfSpamUsers.class);
    FileSystem fs = FileSystem.get(conf);
    conf.setJobName("RemoveSpamUserTweets");
    conf.setLong("startID", startID);
    conf.setLong("endID", endID);

    conf.setNumReduceTasks(numReducers);
    conf.setBoolean("spam", spam);

    // put the spam user list file in the distributed cache so each map worker will have it
    // DistributedCache.addCacheFile(new URI(mappingFile), conf);
    if (conf.get("mapred.job.tracker").equals("local")) {
        conf.set("SpamUserListFile", spamUserListFile);
    } else {
        DistributedCache.addCacheFile(new URI(spamUserListFile), conf);
    }

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(TweetWritable.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // skip the job if the output directory exists already
    // FileSystem.get(conf).delete(new Path(outputPath), true);
    if (fs.exists(new Path(outputPath))) {
        sLogger.info("Output already exists: skipping!");
        return FSProperty.readInt(fs, nTweetsFile);
    }

    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int nonSpamTweets = (int) counters.findCounter(Statistics.NON_SPAM_TWEETS).getCounter();
    FSProperty.writeInt(fs, nTweetsFile, nonSpamTweets);
    sLogger.info("num of non-spam tweets: " + nonSpamTweets);
    return nonSpamTweets;
}