List of usage examples for org.apache.hadoop.mapreduce.Job#setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
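setMapOutputValueClass declares the value type the mapper emits when it differs from the job's final output value type; if it is never called, Hadoop assumes the map output value class is the one set via setOutputValueClass. A minimal sketch of that situation (TokenizerMapper, AverageReducer, and the input/output paths are hypothetical names, not taken from the examples below; types come from org.apache.hadoop.io):

    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf(), "map output types example");
        job.setJarByClass(getClass());

        // hypothetical mapper: <LongWritable, Text> -> <Text, IntWritable>
        job.setMapperClass(TokenizerMapper.class);
        // hypothetical reducer: <Text, IntWritable> -> <Text, DoubleWritable>
        job.setReducerClass(AverageReducer.class);

        // intermediate (map output) types differ from the final output types,
        // so they must be declared explicitly
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // final (reducer) output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }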
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.ConfigurationHelper.java
License:Apache License
/**
 * Job configurator
 *
 * @param job job instance
 * @param jarByClass class of the jar
 * @param mapperClass mapper
 * @param reducerClass reducer
 * @param commaSeparatedInputFiles input paths
 * @param outputPath output
 * @throws IOException I/O exception
 */
public static void configureJob(Job job, Class<?> jarByClass, Class<? extends Mapper> mapperClass,
        Class<? extends Reducer> reducerClass, String commaSeparatedInputFiles, String outputPath)
        throws IOException {
    job.setJarByClass(jarByClass);
    job.setJobName(jarByClass.getName());

    // mapper
    job.setMapperClass(mapperClass);

    // reducer
    job.setReducerClass(reducerClass);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // prevent producing empty files
    LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class);

    // intermediate data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // output data
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase1FullJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    // set from the command line
    job.setJarByClass(Phase1FullJob.class);
    job.setJobName(Phase1FullJob.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);

    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration().setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reducer
    job.setReducerClass(SimpleWarcWriterReducer.class);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);

    // mapper output data
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase2ExactMatchDeDuplication.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    // set from the command line
    job.setJarByClass(Phase2ExactMatchDeDuplication.class);
    job.setJobName(Phase2ExactMatchDeDuplication.class.getName());

    // mapper
    job.setMapperClass(ExactMatchDetectionMapper.class);

    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration().setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reducer
    job.setReducerClass(UniqueWarcWriterReducer.class);
    // no combiner, as the output classes in mapper and reducer are different!

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);

    // mapper output data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step1ExtractNearDupInfo.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step1ExtractNearDupInfo.class);
    job.setJobName(Phase3Step1ExtractNearDupInfo.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DocumentInfo.class);

    // reducer
    job.setReducerClass(DeDuplicationTextOutputReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(List.class);

    job.setInputFormatClass(WARCInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, DocumentInfoOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step2DistinctDataJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step2DistinctDataJob.class);
    job.setJobName(Phase3Step2DistinctDataJob.class.getName());

    // mapper
    job.setMapperClass(RemoveRedundantDataMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);

    // reducer
    job.setReducerClass(RemoveRedundantDataReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // i/o paths
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step3NearDupTuplesCreation.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step3NearDupTuplesCreation.class);
    job.setJobName(Phase3Step3NearDupTuplesCreation.class.getName());

    // mapper
    job.setMapperClass(CreateTuplesMapper.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(TreeSet.class);

    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setNumReduceTasks(0); // must be added or the mapper won't be called

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase4RemoveDuplicatesUsingReduceSideJoins.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase4RemoveDuplicatesUsingReduceSideJoins.class);
    job.setJobName(Phase4RemoveDuplicatesUsingReduceSideJoins.class.getName());

    // paths
    // text files of ids to be deleted
    String textFilePath = args[0];
    // corpus with *.warc.gz
    String commaSeparatedInputFiles = args[1];
    // output
    String outputPath = args[2];

    // second input: the look up text file
    MultipleInputs.addInputPath(job, new Path(textFilePath), TextInputFormat.class, JoinTextMapper.class);
    // first input: the data set (check comma separated availability)
    MultipleInputs.addInputPath(job, new Path(commaSeparatedInputFiles), WARCInputFormat.class, JoinWARCMapper.class);

    job.setPartitionerClass(SourceJoiningKeyPartitioner.class);
    job.setGroupingComparatorClass(SourceJoiningGroupingComparator.class);

    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(WARCWritable.class);

    job.setReducerClass(JoinReducer.class);

    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.LangLicenseStatistics.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    ConfigurationHelper.configureJob(job, LangLicenseStatistics.class, MapperClass.class, ReducerClass.class,
            args[0], args[1]);

    // intermediate data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(MapWritable.class);

    // output data
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:demo.SsJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");

    job.setJarByClass(SsJob.class);
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapOutputKeyClass(StockKey.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);

    job.waitForCompletion(true);

    return 0;
}
From source file:diamondmapreduce.DiamondMapReduce.java
License:Apache License
int launchHamond(String[] arguments) throws Exception {
    // extract diamond, query, reference and output from array
    String diamond = arguments[0];
    String query = arguments[1];
    String dataBase = arguments[2];
    String outPut = arguments[3];

    // set Hadoop configuration
    Job job = Job.getInstance(getConf(), "DIAMOND");
    Configuration conf = job.getConfiguration();
    SetConf.setHadoopConf(conf);

    // get user name
    userName = HadoopUser.getHadoopUser();

    // delete all existing DIAMOND files under current Hadoop user
    DeleteHDFSFiles.deleteAllFiles(userName);

    // make Hamond directory on HDFS
    MakeHamondHDFSdir.makedir(conf, userName);

    // make DIAMOND database on local then copy to HDFS with query and delete local database
    MakeDB.makeDB(diamond, dataBase);

    // copy DIAMOND bin, query and local database file to HDFS
    CopyFromLocal.copyFromLocal(conf, diamond, query, dataBase, userName);

    // pass query name and database name to mappers
    conf.set(QUERY, query);
    conf.set(DATABASE, dataBase + ".dmnd");
    String[] subArgs = Arrays.copyOfRange(arguments, 4, arguments.length);
    conf.setStrings("DIAMOND-arguments", subArgs);
    conf.setStrings(OUTPUT, outPut);

    // add DIAMOND bin and database into distributed cache
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/diamond"));
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/" + new Path(dataBase).getName() + ".dmnd"));

    // set job input and output paths
    FileInputFormat.addInputPath(job, new Path("/user/" + userName + "/Hamond/" + new Path(query).getName()));
    FileOutputFormat.setOutputPath(job, new Path("/user/" + userName + "/Hamond/out"));

    // set job driver and mapper
    job.setJarByClass(DiamondMapReduce.class);
    job.setMapperClass(DiamondMapper.class);

    // set job input format into customized multilines format
    job.setInputFormatClass(CustomNLineFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(0);

    return job.waitForCompletion(true) ? 0 : 1;
}