List of usage examples for org.apache.hadoop.mapred JobConf setOutputFormat
public void setOutputFormat(Class<? extends OutputFormat> theClass)
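Before the project examples below, here is a minimal, self-contained sketch of where setOutputFormat fits in a typical old-API (org.apache.hadoop.mapred) driver. The MinimalSetOutputFormatExample class and its word-count Map/Reduce inner classes are illustrative placeholders, not taken from any of the source files listed here.

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public class MinimalSetOutputFormatExample {

    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                output.collect(word, ONE);
            }
        }
    }

    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(MinimalSetOutputFormatExample.class);
        conf.setJobName("minimal-setOutputFormat");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);

        // setOutputFormat selects the OutputFormat implementation used to
        // write the job's final key/value pairs (plain text lines here;
        // SequenceFileOutputFormat, AvroOutputFormat, etc. plug in the same way)
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

The real-world examples that follow vary only in which OutputFormat class they pass: text, sequence file, Avro, or custom multiple-file formats.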
From source file:at.illecker.hadoop.rootbeer.examples.matrixmultiplication.gpu.MatrixMultiplicationGpu.java
License:Apache License
public static Configuration createMatrixMultiplicationGpuConf(Configuration initialConf, Path aPath, Path bPath,
        Path outPath, int outCardinality, int tileWidth, boolean isDebugging) {

    JobConf conf = new JobConf(initialConf, MatrixMultiplicationGpu.class);
    conf.setJobName("MatrixMultiplicationGPU: " + aPath + " x " + bPath + " = " + outPath);

    conf.setInt(CONF_OUT_CARD, outCardinality);
    conf.setInt(CONF_TILE_WIDTH, tileWidth);
    conf.setBoolean(CONF_DEBUG, isDebugging);

    conf.setInputFormat(CompositeInputFormat.class);
    conf.set("mapred.join.expr",
            CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, aPath, bPath));

    conf.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setMapperClass(MatrixMultiplyGpuMapper.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(VectorWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);

    // Increase child task heap size for GPU Rootbeer execution
    conf.set("mapred.child.java.opts", "-Xms8G -Xmx8G");

    // No reduce step is needed:
    // -> 0 reducers means the reduce step is skipped and the mapper
    //    output becomes the final output
    // -> an identity reducer would still trigger shuffling/sorting
    conf.setNumReduceTasks(0);

    return conf;
}
From source file:average.AverageDriver.java
public static void main(String[] args) {
    JobClient client = new JobClient();

    // Job configuration is collected in this variable
    JobConf conf = new JobConf(average.AverageDriver.class);

    // Name of the job
    conf.setJobName("BookCrossing1.0");

    // Data types of the output key and value
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    // Set the mapper and reducer classes
    conf.setMapperClass(average.AverageMapper.class);
    conf.setReducerClass(average.AverageReducer.class);

    // Input and output formats
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // Specify input and output DIRECTORIES (not files)
    FileInputFormat.setInputPaths(conf, new Path(args[1]));
    FileOutputFormat.setOutputPath(conf, new Path(args[2]));

    client.setConf(conf);
    try {
        // Run the job with the configuration set in conf
        JobClient.runJob(conf);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:averageprocessingtimesbytype.AverageProcessingTimesByType.java
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobConf job = new JobConf(conf, AverageProcessingTimesByType.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("AverageProcessingTimesByType");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // job.set("key.value.separator.in.input.line", "");

    JobClient.runJob(job);
    return 0;
}
From source file:avro.HadoopAvro.java
License:Open Source License
private JobConf createJobConfig() throws IOException {
    Path inputPath = new Path(INPUT_PATH);
    Path outputPath = new Path(OUTPUT_PATH);
    FileSystem.get(new Configuration()).delete(outputPath, true);

    JobConf jobConfig = new JobConf();
    jobConfig.setInputFormat(AvroInputFormat.class);
    jobConfig.setOutputFormat(AvroOutputFormat.class);
    AvroOutputFormat.setOutputPath(jobConfig, outputPath);
    AvroInputFormat.addInputPath(jobConfig, inputPath);
    jobConfig.set(AvroJob.OUTPUT_SCHEMA, User.SCHEMA.toString());
    jobConfig.set(AvroJob.INPUT_SCHEMA, User.SCHEMA.toString());
    return jobConfig;
}
From source file:azkaban.jobtype.examples.java.WordCount.java
License:Apache License
public void run() throws Exception {
    logger.info(String.format("Starting %s", getClass().getSimpleName()));

    // Hadoop conf should be on the classpath
    JobConf jobconf = getJobConf();
    jobconf.setJarByClass(WordCount.class);

    jobconf.setOutputKeyClass(Text.class);
    jobconf.setOutputValueClass(IntWritable.class);

    jobconf.setMapperClass(Map.class);
    jobconf.setReducerClass(Reduce.class);

    jobconf.setInputFormat(TextInputFormat.class);
    jobconf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.addInputPath(jobconf, new Path(inputPath));
    FileOutputFormat.setOutputPath(jobconf, new Path(outputPath));

    if (forceOutputOverrite) {
        FileSystem fs = FileOutputFormat.getOutputPath(jobconf).getFileSystem(jobconf);
        fs.delete(FileOutputFormat.getOutputPath(jobconf), true);
    }

    super.run();
}
From source file:babel.prep.corpus.CorpusGenerator.java
License:Apache License
/**
 * Configures a map-only dataset generation job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, boolean xmlOut) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("create " + (xmlOut ? "xml formatted" : "") + " dataset from " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CorpusGenMapper.class);
    job.setOutputFormat(xmlOut ? MultipleXMLLangFileOutputFormat.class : MultipleLangFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, CORPUS_SUBDIR),
            "corpus." + (xmlOut ? PARAM_XML + "." : "") + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);
    return job;
}
From source file:babel.prep.datedcorpus.DatedCorpusGenerator.java
License:Apache License
/**
 * Configures a dated-corpus dataset generation job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("create dated dataset from " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(DatedCorpusGenMapper.class);
    job.setReducerClass(DatedCorpusGenReducer.class);

    job.setMapOutputValueClass(PageVersion.class);
    job.setOutputFormat(DatedLangFilesOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, CORPUS_SUBDIR), "datedcorpus." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);
    return job;
}
From source file:babel.prep.extract.NutchPageExtractor.java
License:Apache License
/**
 * Configures the extraction job.
 */
protected JobConf createJobConf(String crawlDir) throws IOException {
    Path segmentsPath = new Path(crawlDir, SEGMENTS_SUBDIR);
    List<Path> segPaths = allSegmentDirs(segmentsPath);

    StringBuilder allSegNames = new StringBuilder();
    for (int i = 0; i < segPaths.size(); i++) {
        allSegNames.append(" " + segPaths.get(i).getName());
    }

    String timeStamp = getCurTimeStamp();

    JobConf job = new NutchJob(getConf());
    job.setJobName("read segments" + allSegNames.toString());

    // Specify what info to extract
    job.setBoolean("segment.reader.co", m_co);
    job.setBoolean("segment.reader.fe", m_fe);
    job.setBoolean("segment.reader.ge", m_ge);
    job.setBoolean("segment.reader.pa", m_pa);
    job.setBoolean("segment.reader.pd", m_pd);
    job.setBoolean("segment.reader.pt", m_pt);

    // Specify the paths to extract from for each segment
    for (int i = 0; i < segPaths.size(); i++) {
        if (m_ge)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.GENERATE_DIR_NAME));
        if (m_fe)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.FETCH_DIR_NAME));
        if (m_pa)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.PARSE_DIR_NAME));
        if (m_co)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), Content.DIR_NAME));
        if (m_pd)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), ParseData.DIR_NAME));
        if (m_pt)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), ParseText.DIR_NAME));
    }

    // Specify the segments directory so that the mapper can recover segment info
    job.set(JOB_PROP_SEGMENTS_DIR, segmentsPath.getName());
    // Store the start time/date of this job
    job.set(JOB_PROP_JOB_TIMESTAMP, timeStamp);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(PageExtMapper.class);
    job.setReducerClass(PageExtReducer.class);

    job.setMapOutputValueClass(NutchChunk.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.extract." + timeStamp);
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);
    return job;
}
From source file:babel.prep.langid.LangIdentifier.java
License:Apache License
/**
 * Configures a map-only language id job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, String referrer) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("identify languages for pages in " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LangIdMapper.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.langid." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);
    job.set(JOB_PROP_JOB_REFERRER, referrer);
    return job;
}
From source file:babel.prep.langidtime.LangAndTimeExtractor.java
License:Apache License
/**
 * Configures a language id and time extraction job.
 */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, String referrer) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("identify languages and collect time for pages in " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LangAndTimeMapper.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    // ANNI EDIT
    job.setNumMapTasks(2);
    job.setNumReduceTasks(2);
    // END ANNI EDIT

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.langidtime." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);
    job.set(JOB_PROP_JOB_REFERRER, referrer);
    return job;
}