List of usage examples for org.apache.hadoop.mapred JobConf set
public void set(String name, String value)

Sets the value of the name property.
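
A minimal sketch of a set followed by a read-back; the key name "my.app.mode" is illustrative, not a Hadoop-defined property:

import org.apache.hadoop.mapred.JobConf;

public class JobConfSetExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // "my.app.mode" is an application-defined key, not a Hadoop one
        conf.set("my.app.mode", "incremental");
        System.out.println(conf.get("my.app.mode")); // prints "incremental"
    }
}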

From source file: com.maxpoint.cascading.avro.AvroScheme.java
License: Open Source License

@Override
public void sinkConfInit(FlowProcess<JobConf> process,
        Tap<JobConf, RecordReader<AvroWrapper<Record>, Writable>, OutputCollector<AvroWrapper<Record>, Writable>> tap,
        JobConf conf) {
    conf.set(AvroJob.OUTPUT_SCHEMA, dataSchema.toString());
    conf.setOutputFormat(AvroOutputFormat.class);
    conf.setOutputKeyClass(AvroWrapper.class);
    // set compression
    AvroOutputFormat.setDeflateLevel(conf, 6);
    AvroJob.setOutputCodec(conf, DataFileConstants.DEFLATE_CODEC);
    AvroOutputFormat.setSyncInterval(conf, 1048576);
}
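
The typed Avro helpers above are thin wrappers that ultimately call conf.set with string keys. A rough low-level equivalent is sketched below; the literal key names are assumptions based on Avro's mapred constants, so prefer the helpers in real code:

import org.apache.hadoop.mapred.JobConf;

public class AvroLowLevelSketch {
    // Assumed raw equivalents of the helper calls above; the key strings
    // are assumptions, not verified against a specific Avro release.
    static void configureDeflateOutput(JobConf conf) {
        conf.set("avro.output.codec", "deflate");
        conf.setInt("avro.mapred.deflate.level", 6);
        conf.setInt("avro.mapred.sync.interval", 1048576);
    }
}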

From source file: com.mh2c.WikipediaDumpLoaderDriver.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    // arg checks
    JobConf conf = new JobConf(getClass());
    conf.setJobName("WP dump loader");
    // Set the mapper class, but skip the reduce phase
    conf.setMapperClass(WikipediaDumpLoaderMapper.class);
    conf.setNumReduceTasks(0);
    // The object key/value pairs are text
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    // Stream XML into the job
    conf.setInputFormat(StreamInputFormat.class);
    StreamInputFormat.addInputPath(conf, new Path(args[0]));
    // Use the XML record reader, with each page as one record
    conf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader");
    conf.set("stream.recordreader.begin", "<page>");
    conf.set("stream.recordreader.end", "</page>");
    // Emit sequence files
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(conf, new Path(args[1]));
    JobClient.runJob(conf);
    return 0;
}
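
The three stream.recordreader.* settings are plain string keys; the old streaming API offers no typed setters for them. A small sketch that generalizes the same configuration to an arbitrary element (the tag parameter is illustrative):

import org.apache.hadoop.mapred.JobConf;

public class XmlRecordReaderConfig {
    // Configure StreamXmlRecordReader so each <tag>...</tag> block
    // becomes one record; mirrors the three conf.set calls above.
    static void configureXmlRecords(JobConf conf, String tag) {
        conf.set("stream.recordreader.class",
                "org.apache.hadoop.streaming.StreamXmlRecordReader");
        conf.set("stream.recordreader.begin", "<" + tag + ">");
        conf.set("stream.recordreader.end", "</" + tag + ">");
    }
}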

From source file: com.ostor.dedup.hadoop.DedupStorHadoopCreateObjectsMapReduce.java
License: Open Source License

public static void main(String[] args) throws Exception {
    System.out.println("NOTE: Setting up logs from conf file - " + DedupStor.DEFAULT_LOG4J_FILE);
    PropertyConfigurator.configure(DedupStor.DEFAULT_LOG4J_FILE);

    JobConf conf = new JobConf(DedupStorHadoopCreateObjectsMapReduce.class);
    conf.setJobName("dedup-create-objects");

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(DedupObjectSegmentWritable.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(DedupStorHadoopCreateObjectsMapper.class);
    conf.setReducerClass(DedupStorHadoopCreateObjectsReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    Path inputPath = new Path(args[0], DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_OBJECTS_TMP_PATH);
    Path segmentStorPath = new Path(args[0], DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_SEGMENTS_LOC_SUFFIX);
    Path objectStorPath = new Path(args[0], DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_OBJECTS_LOC_SUFFIX);
    Path objectMapPath = new Path(args[0], DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_OBJECTS_TMP_PATH);

    conf.set(DedupStorHadoopUtils.HADOOP_CONF_SEGMENTS_STOR_PATH_KEY, segmentStorPath.toString());
    conf.set(DedupStorHadoopUtils.HADOOP_CONF_OBJECTS_STOR_PATH_KEY, objectStorPath.toString());
    conf.set(DedupStorHadoopUtils.HADOOP_CONF_OBJECTS_TMP_PATH_KEY, objectMapPath.toString());

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, objectStorPath);

    JobClient.runJob(conf);
}

From source file: com.ostor.dedup.hadoop.DedupStorHadoopCreateSegmentsMapReduce.java
License: Open Source License

public static void main(String[] args) throws Exception {
    System.out.println("NOTE: Setting up logs from conf file - " + DedupStor.DEFAULT_LOG4J_FILE);
    PropertyConfigurator.configure(DedupStor.DEFAULT_LOG4J_FILE);

    JobConf conf = new JobConf(DedupStorHadoopCreateSegmentsMapReduce.class);
    conf.setJobName("dedup-create-segments");

    conf.setMapOutputKeyClass(DedupHashWritable.class);
    conf.setMapOutputValueClass(DedupObjectSegmentCompleteWritable.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DedupObjectSegmentWritable.class);

    conf.setMapperClass(DedupStorHadoopCreateSegmentsMapper.class);
    conf.setReducerClass(DedupStorHadoopCreateSegmentsReducer.class);

    conf.setInputFormat(DedupObjectInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    logger.info("Set input dir - " + args[0]);
    logger.info("Set output dir - " + args[1]);

    Path inputPath = new Path(args[0]);
    Path segmentStorPath = new Path(args[1], DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_SEGMENTS_LOC_SUFFIX);
    Path objectMapPath = new Path(args[1], DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_OBJECTS_TMP_PATH);

    conf.set(DedupStorHadoopUtils.HADOOP_CONF_SEGMENTS_STOR_PATH_KEY, segmentStorPath.toString());
    conf.set(DedupStorHadoopUtils.HADOOP_CONF_OBJECTS_TMP_PATH_KEY, objectMapPath.toString());

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, objectMapPath);

    JobClient.runJob(conf);
}
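
Both dedup drivers hand directory locations to their tasks through string-valued conf keys: a Path goes in via toString() and is rebuilt on the other side. A generic sketch of the read-back half, assuming a helper that is not part of the dedup source:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class PathRoundTripSketch {
    // Rebuild a Path that a driver stored with conf.set(key, path.toString());
    // returns null if the key was never set.
    static Path pathFromConf(JobConf conf, String key) {
        String value = conf.get(key);
        return value == null ? null : new Path(value);
    }
}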

From source file: com.pegasus.ResultInfo.java
License: Apache License

protected JobConf configStage1() throws Exception {
    final JobConf conf = new JobConf(getConf(), ConCmpt.class);
    conf.set("cur_iter", "" + cur_iter);
    conf.set("make_symmetric", "" + make_symmetric);
    conf.setJobName("ConCmpt_Stage1");
    conf.setMapperClass(MapStage1.class);
    conf.setReducerClass(RedStage1.class);
    FileInputFormat.setInputPaths(conf, edge_path, curbm_path);
    FileOutputFormat.setOutputPath(conf, tempbm_path);
    conf.setNumReduceTasks(nreducers);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    return conf;
}

From source file: com.pegasus.ResultInfo.java
License: Apache License

protected JobConf configStage2() throws Exception {
    final JobConf conf = new JobConf(getConf(), ConCmpt.class);
    conf.set("cur_iter", "" + cur_iter);
    conf.set("make_symmetric", "" + make_symmetric);
    conf.setJobName("ConCmpt_Stage2");
    conf.setMapperClass(MapStage2.class);
    conf.setReducerClass(RedStage2.class);
    conf.setCombinerClass(CombinerStage2.class);
    FileInputFormat.setInputPaths(conf, tempbm_path);
    FileOutputFormat.setOutputPath(conf, nextbm_path);
    conf.setNumReduceTasks(nreducers);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    return conf;
}
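
Values stored with conf.set in these stage configs are typically read back on the task side in configure(JobConf). A hypothetical counterpart, not taken from the Pegasus source; the serialized format of make_symmetric is an assumption:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public abstract class StageTaskSketch extends MapReduceBase {
    protected int curIter;
    protected boolean makeSymmetric;

    @Override
    public void configure(JobConf job) {
        // getInt parses the "" + cur_iter value written by the drivers above
        curIter = job.getInt("cur_iter", 0);
        String flag = job.get("make_symmetric", "0");
        // assumption: the driver's field serializes to "1" or "true" when set
        makeSymmetric = "1".equals(flag) || "true".equals(flag);
    }
}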

From source file: com.pinterest.hdfsbackup.distcp.DistCp.java
License: Apache License

/**
 * Driver to copy srcPath to destPath depending on required protocol.
 * @param args arguments
 */
static void copy(final Configuration conf, final Arguments args) throws IOException {
    LOG.info("srcPaths=" + args.srcs);
    LOG.info("destPath=" + args.dst);
    checkSrcPath(conf, args.srcs);

    JobConf job = createJobConf(conf);
    if (args.preservedAttributes != null) {
        job.set(PRESERVE_STATUS_LABEL, args.preservedAttributes);
    }
    if (args.mapredSslConf != null) {
        job.set("dfs.https.client.keystore.resource", args.mapredSslConf);
    }

    // Initialize the mapper
    try {
        setup(conf, job, args);
        JobClient.runJob(job);
        finalize(conf, job, args.dst, args.preservedAttributes);
    } finally {
        // delete tmp
        fullyDelete(job.get(TMP_DIR_LABEL), job);
        // delete jobDirectory
        fullyDelete(job.get(JOB_DIR_LABEL), job);
    }
}
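
Note the symmetry in the finally block: paths stored under TMP_DIR_LABEL and JOB_DIR_LABEL during setup are recovered with job.get for cleanup. A plausible shape for the fullyDelete helper, offered as an assumption rather than the actual DistCp implementation:

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class CleanupSketch {
    // Delete a directory whose name was stashed in the conf; a null name
    // means the corresponding label was never set, so there is nothing to do.
    static void fullyDelete(String dir, JobConf conf) throws IOException {
        if (dir != null) {
            Path p = new Path(dir);
            p.getFileSystem(conf).delete(p, true);
        }
    }
}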

From source file: com.pinterest.hdfsbackup.distcp.DistCp.java
License: Apache License

/**
 * Initialize DFSCopyFileMapper specific job-configuration.
 * @param conf : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args Arguments
 */
private static void setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

    // set boolean values
    final boolean update = args.flags.contains(Options.UPDATE);
    final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
    jobConf.setBoolean(Options.UPDATE.propertyname, update);
    jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
    jobConf.setBoolean(Options.IGNORE_READ_FAILURES.propertyname,
            args.flags.contains(Options.IGNORE_READ_FAILURES));
    jobConf.setBoolean(Options.PRESERVE_STATUS.propertyname,
            args.flags.contains(Options.PRESERVE_STATUS));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    FileSystem dstfs = args.dst.getFileSystem(conf);
    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
        dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
        String filename = "_distcp_logs_" + randomId;
        if (!dstExists || !dstIsDir) {
            Path parent = args.dst.getParent();
            if (!dstfs.exists(parent)) {
                dstfs.mkdirs(parent);
            }
            logPath = new Path(parent, filename);
        } else {
            logPath = new Path(args.dst, filename);
        }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist,
            LongWritable.class, FilePair.class, SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
    SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist,
            Text.class, Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist,
            Text.class, FilePair.class, SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory OR we're updating/overwriting
    // the contents of the destination directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
        for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
            final Path src = srcItr.next();
            FileSystem srcfs = src.getFileSystem(conf);
            FileStatus srcfilestat = srcfs.getFileStatus(src);
            Path root = special && srcfilestat.isDir() ? src : src.getParent();
            if (srcfilestat.isDir()) {
                ++srcCount;
            }

            Stack<FileStatus> pathstack = new Stack<FileStatus>();
            for (pathstack.push(srcfilestat); !pathstack.empty();) {
                FileStatus cur = pathstack.pop();
                FileStatus[] children = srcfs.listStatus(cur.getPath());
                for (int i = 0; i < children.length; i++) {
                    boolean skipfile = false;
                    final FileStatus child = children[i];
                    final String dst = makeRelative(root, child.getPath());
                    ++srcCount;

                    if (child.isDir()) {
                        pathstack.push(child);
                    } else {
                        // skip file if the src and the dst files are the same.
                        skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst));
                        // skip file if it exceed file limit or size limit
                        skipfile |= fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit;

                        if (!skipfile) {
                            ++fileCount;
                            byteCount += child.getLen();

                            if (LOG.isTraceEnabled()) {
                                LOG.trace("adding file " + child.getPath());
                            }

                            ++cnsyncf;
                            cbsyncs += child.getLen();
                            if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
                                src_writer.sync();
                                dst_writer.sync();
                                cnsyncf = 0;
                                cbsyncs = 0L;
                            }
                        }
                    }

                    if (!skipfile) {
                        src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                                new FilePair(child, dst));
                    }

                    dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
                }

                if (cur.isDir()) {
                    String dst = makeRelative(root, cur.getPath());
                    dir_writer.append(new Text(dst), new FilePair(cur, dst));
                    if (++dirsyn > SYNC_FILE_MAX) {
                        dirsyn = 0;
                        dir_writer.sync();
                    }
                }
            }
        }
    } finally {
        checkAndClose(src_writer);
        checkAndClose(dst_writer);
        checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
        dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
        LOG.info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
        if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
            throw new IOException("Failed to create" + args.dst);
        }
    }

    final Path sorted = new Path(jobDirectory, "_distcp_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    if (dststatus != null && args.flags.contains(Options.DELETE)) {
        deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
    }

    Path tmpDir = new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
            "_distcp_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());
    LOG.info("srcCount=" + srcCount);
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(byteCount, jobConf);
}

From source file: com.rapleaf.hank.cascading.DomainBuilderTap.java
License: Apache License

@Override
public void sinkInit(JobConf conf) throws IOException {
    super.sinkInit(conf);
    // Output Format
    conf.setOutputFormat(this.outputFormatClass);
    // Output Committer
    conf.setOutputCommitter(DomainBuilderOutputCommitter.class);
    // Set this tap's Domain name locally in the conf
    if (conf.get(DomainBuilderOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME) != null) {
        throw new RuntimeException("Trying to set domain name configuration parameter to " + domainName
                + " but it was previously set to "
                + conf.get(DomainBuilderOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME));
    } else {
        conf.set(DomainBuilderOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME, domainName);
    }
}
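
The check-then-set idiom above generalizes into a small guard that refuses to overwrite a conf key that already has a value. A minimal sketch, not part of the hank codebase:

import org.apache.hadoop.mapred.JobConf;

public class SetOnceSketch {
    // Fail loudly instead of silently clobbering a previously set key,
    // mirroring the guard in sinkInit above.
    static void setOnce(JobConf conf, String key, String value) {
        String previous = conf.get(key);
        if (previous != null) {
            throw new RuntimeException("Trying to set " + key + " to " + value
                    + " but it was previously set to " + previous);
        }
        conf.set(key, value);
    }
}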

From source file: com.rapleaf.hank.hadoop.DomainCompactorProperties.java
License: Apache License

@Override
public JobConf setJobConfProperties(JobConf conf, int versionNumber) {
    super.setJobConfProperties(conf, versionNumber);
    // Version Number to compact
    conf.set(DomainBuilderAbstractOutputFormat.createConfParamName(getDomainName(),
            CONF_PARAM_HANK_VERSION_NUMBER_TO_COMPACT), Integer.toString(versionToCompactNumber));
    return conf;
}
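
createConfParamName namespaces the key by domain so that settings for several domains can coexist in one JobConf. A hypothetical illustration of the idea; the real hank implementation may combine the names differently:

import org.apache.hadoop.mapred.JobConf;

public class NamespacedKeySketch {
    // Illustrative only: qualify a parameter name with its domain so two
    // domains' settings cannot collide in the same conf.
    static String createConfParamName(String domainName, String paramName) {
        return paramName + "." + domainName;
    }

    static void setForDomain(JobConf conf, String domain, String param, String value) {
        conf.set(createConfParamName(domain, param), value);
    }
}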