List of usage examples for org.apache.hadoop.mapred JobConf setOutputFormat
public void setOutputFormat(Class<? extends OutputFormat> theClass)
From source file:de.tudarmstadt.lt.n2n.hadoop.FilterByCountJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), FilterByCountJob.class); conf.setJobName(FilterByCountJob.class.getSimpleName()); conf.setMapperClass(FilterByCountMapper.class); conf.setReducerClass(IdentityReducer.class); conf.setInputFormat(KeyValueTextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); // delete output path for testing purposes // FileSystem.get(conf).delete(new Path(args[1]), true); JobClient.runJob(conf);/*from w w w.ja v a 2 s.c o m*/ return 0; }
From source file:de.tudarmstadt.lt.n2n.hadoop.FilterByVocabularyJob.java
License:Apache License
/**
 * Builds the job configuration for {@code FilterByVocabularyJob} and submits it.
 *
 * <p>The word list file named by the {@code SHARED_CONSTANTS.PARAM_WORD_LIST} property is
 * required; it is added to the job classpath through the distributed cache.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
 * @return {@code 0} once {@code JobClient.runJob} has returned
 * @throws MissingArgumentException if the word list property is not set
 * @throws Exception whatever job setup or {@code JobClient.runJob} raises
 */
@Override
public int run(String[] args) throws Exception {
    final JobConf job = new JobConf(getConf(), FilterByVocabularyJob.class);
    job.setJobName(FilterByVocabularyJob.class.getSimpleName());

    // Processing chain.
    job.setMapperClass(FilterByVocabularyMapper.class);
    job.setReducerClass(IdentityReducer.class);

    // Input/output formats and emitted types.
    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // The vocabulary file is mandatory; bail out before touching any paths.
    final String wordListFile = job.get(SHARED_CONSTANTS.PARAM_WORD_LIST);
    if (wordListFile == null) {
        throw new MissingArgumentException(
                "Please specify word list with '-Dnlkg.filterbywordsfile=<path-to-file-in-hdfs>'.");
    }
    DistributedCache.addFileToClassPath(new Path(wordListFile), job);

    final Path inputPath = new Path(args[0]);
    FileInputFormat.setInputPaths(job, inputPath);
    final Path outputPath = new Path(args[1]);
    FileOutputFormat.setOutputPath(job, outputPath);

    JobClient.runJob(job);
    return 0;
}
From source file:de.tudarmstadt.lt.n2n.hadoop.FlipJoBims.java
License:Apache License
public static void main(String[] args) throws Exception { JobConf conf = new JobConf(FlipJoBims.class); /* begin necessary for UKP cluster */ conf.setMemoryForMapTask(1000L); // 1 GB /* necessary for UKP cdh3 */ conf.setMemoryForReduceTask(1000L); // 1 GB /* necessary for UKP cdh3 */ FileOutputFormat.setCompressOutput(conf, true); // compress output FileOutputFormat.setOutputCompressorClass(conf, org.apache.hadoop.io.compress.BZip2Codec.class); /* use the bzip2 codec for compression */ conf.setCompressMapOutput(true); // compress mapper output /* end necessary for UKP cluster */ conf.setJobName(FlipJoBims.class.getSimpleName()); args = new GenericOptionsParser(conf, args).getRemainingArgs(); conf.setInputFormat(KeyValueTextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapperClass(FlipJoBims.Map.class); conf.setNumReduceTasks(0);//from w w w . j a v a 2 s.c om // conf.setReducerClass(IdentityReducer.class); conf.setMapOutputKeyClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); }
From source file:de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob.class); conf.setJobName(GoogleSyntacticsJob.class.getSimpleName()); conf.setMapperClass(GoogleSyntacticsJob3Mapper.class); conf.setReducerClass(GoogleSyntacticsJob3Reducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setOutputKeyClass(JoBimFormat.class); conf.setOutputValueClass(IntWritable.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); // delete output path for testing purposes // FileSystem.get(conf).delete(new Path(args[1]), true); String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS); if (extractorConfigurationFiles == null) { extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ','); System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n", SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles); conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles); }/* w w w.ja va 2s.c o m*/ String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(","); for (int i = 0; i < extractorConfigurationFilesArr.length; i++) DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf); JobClient.runJob(conf); return 0; }
From source file:de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJobDkbd.java
License:Apache License
@Override public void configure(JobConf job) { String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS); if (extractorConfigurationFiles == null) { extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ','); System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n", SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles); job.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles); }//w ww . j a v a 2 s.c o m try { String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(","); for (int i = 0; i < extractorConfigurationFilesArr.length; i++) DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), job); } catch (IOException e) { e.printStackTrace(); } Text2CASInputFormat.setDocumentTextExtractorClass(job, KeyPlusValueAsDocumentExtractor.class); job.setOutputFormat(NullOutputFormat.class); // ignore the serialized cas and use only the output from the CasConsumer }
From source file:de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob2.java
License:Apache License
@Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob2.class); conf.setJobName(GoogleSyntacticsJob2.class.getSimpleName()); conf.setMapperClass(GoogleSyntacticsJob2Mapper.class); conf.setReducerClass(GoogleSyntacticsJob2Reducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); // conf.setMapOutputKeyClass(Text.class); // conf.setMapOutputValueClass(NullWritable.class); conf.setOutputKeyClass(JoBimFormat.class); conf.setOutputValueClass(IntWritable.class); args = new GenericOptionsParser(conf, args).getRemainingArgs(); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); // delete output path for testing purposes // FileSystem.get(conf).delete(new Path(args[1]), true); String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS); if (extractorConfigurationFiles == null) { extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ','); System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n", SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles); conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles); }/*ww w . j a va 2 s. c o m*/ String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(","); for (int i = 0; i < extractorConfigurationFilesArr.length; i++) DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf); JobClient.runJob(conf); return 0; }
From source file:de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob5.java
License:Apache License
@Override public void configure(JobConf job) { String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS); if (extractorConfigurationFiles == null) { extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ','); System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n", SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles); job.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles); }//from w w w . j av a 2s .co m try { String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(","); for (int i = 0; i < extractorConfigurationFilesArr.length; i++) DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), job); } catch (IOException e) { e.printStackTrace(); } Text2CASInputFormat.setDocumentTextExtractorClass(job, KeyPlusValueAsDocumentExtractor.class); job.setMapperClass(JoBimMapper.class); job.setReducerClass(JoBimReducer.class); job.setOutputFormat(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); job.setMemoryForMapTask(4096); job.setMemoryForReduceTask(4096); job.set("mapred.child.java.opts", "-Xmx4096m"); job.setNumReduceTasks(1); // reset to default }
From source file:de.tudarmstadt.lt.n2n.hadoop.PreparsedJob.java
License:Apache License
@Override public void configure(JobConf job) { String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS); if (extractorConfigurationFiles == null) { extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ','); System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n", SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles); job.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles); }//from w w w.java2s .c o m try { String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(","); for (int i = 0; i < extractorConfigurationFilesArr.length; i++) DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), job); } catch (IOException e1) { e1.printStackTrace(); } Text2CASInputFormat.setDocumentTextExtractorClass(job, KeyPlusValueAsDocumentExtractor.class); job.setOutputFormat(NullOutputFormat.class); // ignore the serialized cas and use only the output from the CasConsumer }
From source file:de.tudarmstadt.lt.n2n.hadoop.RelationToOneHoleTransformerJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), RelationToOneHoleTransformerJob.class); conf.setJobName(RelationToOneHoleTransformerJob.class.getSimpleName()); args = new GenericOptionsParser(conf, args).getRemainingArgs(); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapperClass(RelationToOneHoleTransformerJob.Map.class); conf.setNumReduceTasks(0);/* w w w . ja v a 2s. c o m*/ // conf.setReducerClass(IdentityReducer.class); // sort or no sort? - that is here the question conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); return 0; }
From source file:de.tudarmstadt.lt.n2n.hadoop.RemoveExactDuplicatesJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), RemoveExactDuplicatesJob.class); conf.setJobName(RemoveExactDuplicatesJob.class.getSimpleName()); conf.setMapperClass(LineMapper.class); conf.setReducerClass(KeyReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(NullWritable.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); // delete output path for testing purposes // FileSystem.get(conf).delete(new Path(args[1]), true); JobClient.runJob(conf);/*from ww w .j ava 2 s . c om*/ return 0; }