List of usage examples for org.apache.hadoop.mapred.JobConf.set
public void set(String name, String value)
Sets the value of the name property.
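Before the real-world examples below, here is a minimal standalone sketch of the call itself. The class name and the property key/value are made-up placeholders for illustration, not keys used by any of the jobs that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;

public class JobConfSetExample {
    public static void main(String[] args) {
        JobConf job = new JobConf(new Configuration());
        // Store an arbitrary string property under a hypothetical key.
        job.set("example.app.greeting", "hello");
        // Configuration.get(name, defaultValue) returns the stored value, or the default if unset.
        System.out.println(job.get("example.app.greeting", "none"));
    }
}

Several of the jobs below use the same pattern to hand small strings (a segments directory, a timestamp, a referrer) to their tasks, which read them back with JobConf.get.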
From source file:babel.prep.extract.NutchPageExtractor.java
License:Apache License
/** Configures the extraction job. */
protected JobConf createJobConf(String crawlDir) throws IOException {
    Path segmentsPath = new Path(crawlDir, SEGMENTS_SUBDIR);
    List<Path> segPaths = allSegmentDirs(segmentsPath);
    StringBuilder allSegNames = new StringBuilder();

    for (int i = 0; i < segPaths.size(); i++) {
        allSegNames.append(" " + segPaths.get(i).getName());
    }

    String timeStamp = getCurTimeStamp();

    JobConf job = new NutchJob(getConf());
    job.setJobName("read segments" + allSegNames.toString());

    // Specify what info to extract
    job.setBoolean("segment.reader.co", m_co);
    job.setBoolean("segment.reader.fe", m_fe);
    job.setBoolean("segment.reader.ge", m_ge);
    job.setBoolean("segment.reader.pa", m_pa);
    job.setBoolean("segment.reader.pd", m_pd);
    job.setBoolean("segment.reader.pt", m_pt);

    // Specify the paths to extract from for each segment
    for (int i = 0; i < segPaths.size(); i++) {
        if (m_ge) FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.GENERATE_DIR_NAME));
        if (m_fe) FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.FETCH_DIR_NAME));
        if (m_pa) FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.PARSE_DIR_NAME));
        if (m_co) FileInputFormat.addInputPath(job, new Path(segPaths.get(i), Content.DIR_NAME));
        if (m_pd) FileInputFormat.addInputPath(job, new Path(segPaths.get(i), ParseData.DIR_NAME));
        if (m_pt) FileInputFormat.addInputPath(job, new Path(segPaths.get(i), ParseText.DIR_NAME));
    }

    // Specify the segments directory so that the mapper can recover segment info
    job.set(JOB_PROP_SEGMENTS_DIR, segmentsPath.getName());
    // Store the start time/date of this job
    job.set(JOB_PROP_JOB_TIMESTAMP, timeStamp);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(PageExtMapper.class);
    job.setReducerClass(PageExtReducer.class);

    job.setMapOutputValueClass(NutchChunk.class);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.extract." + timeStamp);
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}
From source file:babel.prep.langid.LangIdentifier.java
License:Apache License
/** Configures a map-only language id job. */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, String referrer) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("identify languages for pages in " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LangIdMapper.class);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.langid." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    job.set(JOB_PROP_JOB_REFERRER, referrer);

    return job;
}
From source file:babel.prep.langidtime.LangAndTimeExtractor.java
License:Apache License
/** Configures a map-only language id job. */
protected JobConf createJobConf(String crawlDir, String pagesSubDir, String referrer) throws IOException {
    JobConf job = new JobConf(getConf());
    job.setJobName("identify languages and collect time for pages in " + pagesSubDir);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(LangAndTimeMapper.class);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    // ANNI EDIT
    job.setNumMapTasks(2);
    job.setNumReduceTasks(2);
    // END ANNI EDIT

    FileInputFormat.addInputPath(job, new Path(crawlDir, pagesSubDir));

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.langidtime." + getCurTimeStamp());
    m_fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    job.set(JOB_PROP_JOB_REFERRER, referrer);

    return job;
}
From source file:babel.prep.PrepStep.java
License:Apache License
protected void setUniqueTempDir(JobConf job) {
    Path tempDir = new Path(getConf().get("hadoop.tmp.dir", ".") + "/" + java.util.UUID.randomUUID().toString());
    job.set("hadoop.tmp.dir", tempDir.toString());
}
From source file:Brush.BrushConfig.java
License:Apache License
public static void initializeConfiguration(JobConf conf) {
    validateConfiguration();

    conf.setNumMapTasks(HADOOP_MAPPERS);
    conf.setNumReduceTasks(HADOOP_REDUCERS);
    conf.set("mapred.child.java.opts", HADOOP_JAVAOPTS);
    conf.set("mapred.task.timeout", Long.toString(HADOOP_TIMEOUT));
    conf.setLong("LOCALNODES", HADOOP_LOCALNODES);

    conf.setLong("UP_KMER", UP_KMER);
    conf.setLong("LOW_KMER", LOW_KMER);
    conf.setLong("K", K);
    //conf.setFloat("ERRORRATE", ERRORRATE);
    conf.setFloat("MAJORITY", MAJORITY);
    conf.setFloat("PWM_N", PWM_N);
    conf.setFloat("EXPCOV", EXPCOV);
    conf.setFloat("KMERCOV", KMERCOV);
    conf.setLong("READLENGTH", READLEN);
    conf.setLong("TIPLENGTH", TIPLENGTH);
    conf.setLong("INSLENGTH", INSLEN);
    conf.setLong("INSLENGTH_SD", INSLEN_SD);
    conf.setLong("MAXBUBBLELEN", MAXBUBBLELEN);
    conf.setFloat("BUBBLEEDITRATE", BUBBLEEDITRATE);
    conf.setFloat("LOW_COV_THRESH", LOW_COV_THRESH);
    conf.setLong("MAX_LOW_COV_LEN", MAX_LOW_COV_LEN);
    //conf.setFloat("ERRORRATE", ERRORRATE);
    conf.setLong("N50_TARGET", N50_TARGET);
}
From source file:Business.DataJoin.java
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobConf job = new JobConf(conf, DataJoin.class);

    final File f = new File(MapReduceOne.class.getProtectionDomain().getCodeSource().getLocation().getPath());
    String inFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/inFiles/";
    String outFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outFiles/OutputOne";

    // Use the arguments instead if provided.
    if (args.length > 1) {
        inFiles = args[1];
        outFiles = args[2];
    }

    Path in = new Path(inFiles);
    Path out = new Path(outFiles);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Data Join");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(ReduceClass.class);
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TaggedWritable.class);
    job.set("mapred.textoutputformat.separator", ",");

    JobClient.runJob(job);
    return 0;
}
From source file:ca.etsmtl.lasi.hbasewikipedialoader.HBaseWikipediaLoader.java
License:Apache License
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static JobConf createSubmittableJob(HBaseConfiguration conf, String[] args) throws IOException {
    JobConf jobConf = new JobConf(conf, HBaseWikipediaLoader.class);
    jobConf.setJobName(NAME);

    // Stream stuff
    jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader");
    jobConf.set("stream.recordreader.begin", "<page>");
    jobConf.set("stream.recordreader.end", "</page>");

    jobConf.setSpeculativeExecution(false);
    jobConf.setMapOutputKeyClass(ImmutableBytesWritable.class);
    jobConf.setMapOutputValueClass(BatchUpdate.class);
    jobConf.setMapperClass(Map.class);
    jobConf.setNumReduceTasks(0);
    jobConf.setInputFormat(StreamInputFormat.class);
    jobConf.setOutputFormat(TableOutputFormat.class);
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, TABLE);
    jobConf.setOutputKeyClass(ImmutableBytesWritable.class);
    jobConf.setOutputValueClass(BatchUpdate.class);

    StreamInputFormat.setInputPaths(jobConf, new Path(args[0]));
    FileOutputFormat.setOutputPath(jobConf, new Path("/tmp/" + NAME + "-" + System.currentTimeMillis()));

    return jobConf;
}
From source file:cascading.avro.AvroScheme.java
License:Apache License
/**
 * sinkConfInit is called by cascading to set up the sinks. This happens on the client side before the
 * job is distributed.
 * There is a check for the presence of a schema and an exception is thrown if none has been provided.
 * After the schema check the conf object is given the options that Avro needs.
 *
 * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
 * @param tap         The cascading Tap object. Should be passed in by cascading automatically.
 * @param conf        The Hadoop JobConf object. This is passed in by cascading automatically.
 * @throws RuntimeException If no schema is present this halts the entire process.
 */
@Override
public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    if (schema == null) {
        throw new RuntimeException("Must provide sink schema");
    }
    // Set the output schema and output format class
    conf.set(AvroJob.OUTPUT_SCHEMA, schema.toString());
    conf.setOutputFormat(AvroOutputFormat.class);

    // add AvroSerialization to io.serializations
    addAvroSerializations(conf);
}
From source file:cascading.avro.AvroScheme.java
License:Apache License
/**
 * sourceConfInit is called by cascading to set up the sources. This happens on the client side before the
 * job is distributed.
 * There is a check for the presence of a schema and if none has been provided the data is peeked at to get a schema.
 * After the schema check the conf object is given the options that Avro needs.
 *
 * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
 * @param tap         The cascading Tap object. Should be passed in by cascading automatically.
 * @param conf        The Hadoop JobConf object. This is passed in by cascading automatically.
 * @throws RuntimeException If no schema is present this halts the entire process.
 */
@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    retrieveSourceFields(flowProcess, tap);
    // Set the input schema and input class
    conf.set(AvroJob.INPUT_SCHEMA, schema.toString());
    conf.setInputFormat(AvroInputFormat.class);

    // add AvroSerialization to io.serializations
    addAvroSerializations(conf);
}
From source file:cascading.flow.FlowStep.java
License:Open Source License
protected JobConf getJobConf(JobConf parentConf) throws IOException {
    JobConf conf = parentConf == null ? new JobConf() : new JobConf(parentConf);

    // set values first so they can't break things downstream
    if (hasProperties()) {
        for (Map.Entry entry : getProperties().entrySet())
            conf.set(entry.getKey().toString(), entry.getValue().toString());
    }

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepName());

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapperClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(conf);
    initFromSink(conf);
    initFromTraps(conf);

    if (sink.getScheme().getNumSinkParts() != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(sink.getScheme().getNumSinkParts());
        else
            conf.setNumMapTasks(sink.getScheme().getNumSinkParts());
    }

    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getGroupingSelectors());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors());

        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    conf.setInt("cascading.flow.step.id", id);
    conf.set("cascading.flow.step", Util.serializeBase64(this));

    return conf;
}