List of usage examples for org.apache.hadoop.mapred.lib.MultipleOutputs.addMultiNamedOutput
public static void addMultiNamedOutput(JobConf conf, String namedOutput, Class<? extends OutputFormat> outputFormatClass, Class<?> keyClass, Class<?> valueClass)
From source file:org.acacia.partitioner.java.EdgeDistributor.java
License:Apache License
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { String dir1 = "/user/miyuru/input"; String dir2 = "/user/miyuru/edgedistributed-out"; // //We first delete the temporary directories if they exist on the HDFS FileSystem fs1 = FileSystem.get(new JobConf()); if (fs1.exists(new Path(dir2))) { fs1.delete(new Path(dir2), true); }/*from w w w .j a v a2 s .c om*/ //First job scans through the edge list and splits the edges in to separate files based on the partitioned vertex files. JobConf conf = new JobConf(EdgeDistributor.class); conf.set("org.acacia.partitioner.hbase.zookeeper.quorum", args[0]); conf.set("org.acacia.partitioner.hbase.table", args[1]); conf.set("org.acacia.partitioner.index.contacthost", args[2]); conf.set("vert-count", args[3]); conf.set("initpartition-id", args[4]); conf.set("zero-flag", args[5]); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(FileMapper.class); conf.setReducerClass(FileReducer.class); //conf.setInputFormat(TextInputFormat.class); conf.setInputFormat(NLinesInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setNumReduceTasks(96); //Need to specify the number of reduce tasks explicitly. Otherwise it creates only one reduce task. FileInputFormat.setInputPaths(conf, new Path(dir1)); FileOutputFormat.setOutputPath(conf, new Path(dir2)); MultipleOutputs.addMultiNamedOutput(conf, "partition", TextOutputFormat.class, NullWritable.class, Text.class); Job job = new Job(conf, "EdgeDistributor"); job.waitForCompletion(true); System.out.println("Done job EdgeDistribution"); }
From source file:org.acacia.partitioner.java.EdgelistPartitioner.java
License:Apache License
@SuppressWarnings("unused") public static void main(String[] args) throws IOException { JobConf conf = new JobConf(EdgelistPartitioner.class); if (conf == null) { return;//from w w w . j a v a 2s.c om } String dir1 = "/user/miyuru/merged"; String dir2 = "/user/miyuru/merged-out"; //We first delete the temporary directories if they exist on the HDFS FileSystem fs1 = FileSystem.get(new JobConf()); //only delete dir2 because dir1 is uploaded externally. if (fs1.exists(new Path(dir2))) { fs1.delete(new Path(dir2), true); } conf.setInputFormat(WholeFileInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); WholeFileInputFormat.setInputPaths(conf, new Path(dir1)); SequenceFileOutputFormat.setOutputPath(conf, new Path(dir2)); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(SequenceFileMapper.class); conf.setReducerClass(MultipleOutputsInvertedReducer.class); conf.setOutputFormat(NullOutputFormat.class); conf.setJobName("EdgelistPartitioner"); MultipleOutputs.addMultiNamedOutput(conf, "partition", TextOutputFormat.class, NullWritable.class, Text.class); JobClient.runJob(conf); }
From source file:org.woodley.sentiment.SentimentJob.java
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); conf.set("mapred.textoutputformat.separator", ";"); JobConf job = new JobConf(conf, SentimentJob.class); job.setJobName("SentimentJob"); job.setInputFormat(KeyValueTextInputFormat.class); // MultipleOutputs.addNamedOutput(job, "text", TextOutputFormat.class, // Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(job, "sent", TextOutputFormat.class, Text.class, Text.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(SentimentMapper.class); job.setReducerClass(SentimentReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); JobClient.runJob(job);/*from w ww .j a v a 2 s . com*/ return 0; }
From source file:tap.core.ReducerBridge.java
License:Apache License
@Override public void configure(JobConf conf) { super.configure(conf); isTextOutput = conf.getOutputFormat() instanceof TextOutputFormat; isProtoOutput = conf.getOutputFormat() instanceof TapfileOutputFormat; if (isProtoOutput) { try {/*from www . j av a 2 s .c om*/ mapOutClass = Class.forName(conf.get(Phase.MAP_OUT_CLASS)); reduceOutClass = Class.forName(conf.get(Phase.REDUCE_OUT_CLASS)); if (mapOutClass != reduceOutClass) { reduceOutKeyChanges = true; String groupBy = conf.get(Phase.GROUP_BY); String sortBy = conf.get(Phase.SORT_BY); reduceOutSchema = ReflectUtils.getSchema(ObjectFactory.newInstance(reduceOutClass)); extractor = ReflectionKeyExtractor.getReflectionKeyExtractorForReduceOutKey(reduceOutSchema, groupBy, sortBy); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } multiOutputPrefix = conf.get(Phase.MULTIPLE_OUTPUT_PREFIX); if (multiOutputPrefix == null) multiOutputPrefix = "out"; MultipleOutputs.addMultiNamedOutput(conf, multiOutputPrefix, conf.getOutputFormat().getClass(), conf.getOutputKeyClass(), conf.getOutputValueClass()); this.multiOutput = new MultipleOutputs(conf); }
From source file:uk.bl.wa.hadoop.datasets.WARCDatasetGenerator.java
License:Open Source License
/** * /* ww w .ja va 2 s. c o m*/ * @param args * @return * @throws IOException * @throws ParseException * @throws InterruptedException * @throws KeeperException */ protected void createJobConf(JobConf conf, String[] args) throws IOException, ParseException, KeeperException, InterruptedException { // Parse the command-line parameters. this.setup(args, conf); // Store application properties where the mappers/reducers can access // them Config index_conf; if (this.configPath != null) { index_conf = ConfigFactory.parseFile(new File(this.configPath)); } else { index_conf = ConfigFactory.load(); } if (this.dumpConfig) { ConfigPrinter.print(index_conf); System.exit(0); } // Decide whether to apply annotations: // Store the properties: conf.set(CONFIG_PROPERTIES, index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise())); LOG.info("Loaded warc config."); LOG.info(index_conf.getString("warc.title")); // Reducer count int numReducers = 1; try { numReducers = index_conf.getInt("warc.hadoop.num_reducers"); } catch (NumberFormatException n) { numReducers = 10; } // Add input paths: LOG.info("Reading input files..."); String line = null; BufferedReader br = new BufferedReader(new FileReader(this.inputPath)); while ((line = br.readLine()) != null) { FileInputFormat.addInputPath(conf, new Path(line)); } br.close(); LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files."); FileOutputFormat.setOutputPath(conf, new Path(this.outputPath)); conf.setJobName(this.inputPath + "_" + System.currentTimeMillis()); conf.setInputFormat(ArchiveFileInputFormat.class); conf.setMapperClass(WARCDatasetMapper.class); conf.setReducerClass(FrequencyCountingReducer.class); // This can be optionally use to suppress keys: // conf.setOutputFormat(KeylessTextOutputFormat.class); // conf.set( "map.output.key.field.separator", "" ); // Compress the output from the maps, to cut down temp space // requirements between map and reduce. 
conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax // for 0.20.x ? conf.set("mapred.compress.map.output", "true"); // conf.set("mapred.map.output.compression.codec", // "org.apache.hadoop.io.compress.GzipCodec"); // Ensure the JARs we provide take precedence over ones from Hadoop: conf.setBoolean("mapreduce.task.classpath.user.precedence", true); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setNumReduceTasks(numReducers); MultipleOutputs.addMultiNamedOutput(conf, FORMATS_SUMMARY_NAME, TextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, FORMATS_FFB_NAME, TextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, HOSTS_NAME, TextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, HOST_LINKS_NAME, TextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, GEO_SUMMARY_NAME, TextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, FACES_NAME, TextOutputFormat.class, Text.class, Text.class); }
From source file:uk.bl.wa.hadoop.indexer.mdx.MDXSeqSampleGenerator.java
License:Open Source License
/** * //from w w w .ja v a2s .c o m * @param args * @return * @throws IOException * @throws ParseException * @throws InterruptedException * @throws KeeperException */ protected void createJobConf(JobConf conf, String[] args) throws IOException, ParseException, KeeperException, InterruptedException { // Parse the command-line parameters. this.setup(args, conf); // Add input paths: LOG.info("Reading input files..."); String line = null; BufferedReader br = new BufferedReader(new FileReader(this.inputPath)); while ((line = br.readLine()) != null) { FileInputFormat.addInputPath(conf, new Path(line)); } br.close(); LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files."); FileOutputFormat.setOutputPath(conf, new Path(this.outputPath)); conf.setJobName(this.inputPath + "_" + System.currentTimeMillis()); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapperClass(MDXSeqSampleMapper.class); conf.setReducerClass(ReservoirSamplingReducer.class); conf.setOutputFormat(KeylessTextOutputFormat.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setNumReduceTasks(numReducers); MultipleOutputs.addMultiNamedOutput(conf, GEO_NAME, KeylessTextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, FORMATS_FFB_SAMPLE_NAME, KeylessTextOutputFormat.class, Text.class, Text.class); KeylessTextOutputFormat.setCompressOutput(conf, true); KeylessTextOutputFormat.setOutputCompressorClass(conf, GzipCodec.class); }
From source file:uk.bl.wa.hadoop.indexer.mdx.MDXSeqStatsGenerator.java
License:Open Source License
/** * //from ww w . ja v a 2 s .co m * @param args * @return * @throws IOException * @throws ParseException * @throws InterruptedException * @throws KeeperException */ protected void createJobConf(JobConf conf, String[] args) throws IOException, ParseException, KeeperException, InterruptedException { // Parse the command-line parameters. this.setup(args, conf); // Add input paths: LOG.info("Reading input files..."); String line = null; BufferedReader br = new BufferedReader(new FileReader(this.inputPath)); while ((line = br.readLine()) != null) { FileInputFormat.addInputPath(conf, new Path(line)); } br.close(); LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files."); FileOutputFormat.setOutputPath(conf, new Path(this.outputPath)); conf.setJobName(this.inputPath + "_" + System.currentTimeMillis()); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapperClass(MDXSeqStatsMapper.class); conf.setReducerClass(FrequencyCountingReducer.class); conf.setOutputFormat(KeylessTextOutputFormat.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setNumReduceTasks(numReducers); MultipleOutputs.addMultiNamedOutput(conf, FORMATS_SUMMARY_NAME, KeylessTextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, FORMATS_FFB_NAME, KeylessTextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, HOST_LINKS_NAME, KeylessTextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, GEO_SUMMARY_NAME, KeylessTextOutputFormat.class, Text.class, Text.class); KeylessTextOutputFormat.setCompressOutput(conf, true); KeylessTextOutputFormat.setOutputCompressorClass(conf, GzipCodec.class); }