List of usage examples for org.apache.hadoop.mapred JobConf setBoolean
public void setBoolean(String name, boolean value)
name
property to a boolean
. From source file:org.hxx.hadoop.GeneratorHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, long topN, int reduceCnt, boolean filter, boolean norm, boolean force) throws IOException { LOG.info("Generator: from table=" + table + " segment=" + segment); JobConf job = new NutchJob(getConf()); // job.setJarByClass(GeneratorHbase.class); job.setJobName("generate:" + table + " " + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment); if (reduceCnt == -1) { reduceCnt = job.getNumMapTasks(); // a partition per fetch task }//from w w w . j a v a 2 s. c o m if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) { LOG.info("Generator: jobtracker is 'local', generating exactly one partition."); reduceCnt = 1; } // job.setLong(GENERATOR_CUR_TIME, curTime); // record real generation time long generateTime = System.currentTimeMillis(); job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); job.setLong(GENERATOR_TOP_N, topN); job.setBoolean(GENERATOR_FILTER, filter); job.setBoolean(GENERATOR_NORMALISE, norm); job.set(GENERATL_TABLE, table); job.setInt(GENERATL_REDUCECNT, reduceCnt); job.setInt("partition.url.seed", new Random().nextInt()); job.setInputFormat(TableTopInputFormat.class);// ? job.setMapperClass(GenerateMark.class);// generate? job.setPartitionerClass(GenerateMark.class); job.setNumReduceTasks(reduceCnt); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setOutputKeyComparatorClass(HashComparator.class); Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME); FileOutputFormat.setOutputPath(job, output); RunningJob r = JobClient.runJob(job); return r; }
From source file:org.hxx.hadoop.GeneratorHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int reduceCnt, long topN, boolean filter, boolean norm, boolean force) throws IOException { LOG.info("Generator: segment=" + segment); JobConf job = new NutchJob(getConf()); // job.setJarByClass(GeneratorHbase.class); job.setJobName("generate:" + table + " " + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment); // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000); if (reduceCnt == -1) { reduceCnt = job.getNumMapTasks(); // a partition per fetch task }/* ww w .j a v a 2 s .c o m*/ if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) { LOG.info("Generator: jobtracker is 'local', generating exactly one partition."); reduceCnt = 1; } // job.setLong(GENERATOR_CUR_TIME, curTime); // record real generation time long generateTime = System.currentTimeMillis(); job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); job.setLong(GENERATOR_TOP_N, topN); job.setBoolean(GENERATOR_FILTER, filter); job.setBoolean(GENERATOR_NORMALISE, norm); job.set(GENERATL_TABLE, table); job.setInt(GENERATL_REDUCECNT, reduceCnt); job.setInt("partition.url.seed", new Random().nextInt()); job.setInputFormat(CodeInputFormat.class); job.setNumMapTasks(1); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenerateMark.class); job.setNumReduceTasks(reduceCnt); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setOutputKeyComparatorClass(HashComparator.class); Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME); FileOutputFormat.setOutputPath(job, output); RunningJob r = JobClient.runJob(job); return r; }
From source file:org.hxx.hadoop.GeneratorMapHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime, boolean filter, boolean norm, boolean force) throws IOException { LOG.info("Generator: segment: " + segment); JobConf job = new NutchJob(getConf()); job.setJarByClass(GeneratorMapHbase.class); job.setJobName("generate: from " + table + " " + (new SimpleDateFormat("yyyyMMdd HH:mm:ss")).format(System.currentTimeMillis())); // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000); if (numLists == -1) { numLists = job.getNumMapTasks(); // a partition per fetch task }//w ww . j ava 2s.c om numLists = 4;// TODO if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) { // override LOG.info("Generator: jobtracker is 'local', generating exactly one partition."); numLists = 1; } // job.setLong(GENERATOR_CUR_TIME, curTime); // record real generation time long generateTime = System.currentTimeMillis(); job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); job.setLong(GENERATOR_TOP_N, topN); job.setBoolean(GENERATOR_FILTER, filter); job.setBoolean(GENERATOR_NORMALISE, norm); job.set(GENERATL_TABLE, table); job.setInt(GENERATL_REDUCENUM, numLists); job.setInputFormat(TableTopInputFormat.class);// ? job.setMapperClass(GenerateMark.class);// generate? job.setPartitionerClass(URLCountPartitioner.class); job.setNumReduceTasks(numLists); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setOutputKeyComparatorClass(HashComparator.class); Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME); FileOutputFormat.setOutputPath(job, output); RunningJob r = null; try { r = JobClient.runJob(job); } catch (IOException e) { throw e; } return r; }
From source file:org.hxx.hadoop.GeneratorRedHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime, boolean filter, boolean norm, boolean force) throws IOException { LOG.info("Generator: segment=" + segment); JobConf job = new NutchJob(getConf()); job.setJarByClass(GeneratorRedHbase.class); job.setJobName("generate: from " + table + " " + (new SimpleDateFormat("MMdd HH:mm:ss")).format(System.currentTimeMillis())); // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000); if (numLists == -1) { numLists = job.getNumMapTasks(); // a partition per fetch task }/*w w w .j a v a 2 s. c o m*/ if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) { // override LOG.info("Generator: jobtracker is 'local', generating exactly one partition."); numLists = 1; } // job.setLong(GENERATOR_CUR_TIME, curTime); // record real generation time long generateTime = System.currentTimeMillis(); job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); job.setLong(GENERATOR_TOP_N, topN); job.setBoolean(GENERATOR_FILTER, filter); job.setBoolean(GENERATOR_NORMALISE, norm); job.set(GENERATL_TABLE, table); job.setInt(GENERATL_REDUCENUM, numLists); job.setInt("partition.url.seed", new Random().nextInt()); job.setInputFormat(CodeInputFormat.class); job.setNumMapTasks(1); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenerateMark.class); job.setNumReduceTasks(numLists); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setOutputKeyComparatorClass(HashComparator.class); Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME); FileOutputFormat.setOutputPath(job, output); RunningJob r = null; try { r = JobClient.runJob(job); } catch (IOException e) { throw e; } return r; }
From source file:org.jd.copier.mapred.DistCp.java
License:Apache License
/** * Initialize DFSCopyFileMapper specific job-configuration. * @param conf : The dfs/mapred configuration. * @param jobConf : The handle to the jobConf object to be initialized. * @param args Arguments//w w w . j ava 2s . com * @return true if it is necessary to launch a job. */ private static boolean setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException { jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString()); //set boolean values final boolean update = args.flags.contains(Options.UPDATE); final boolean skipCRCCheck = args.flags.contains(Options.SKIPCRC); final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE); jobConf.setBoolean(Options.UPDATE.propertyname, update); jobConf.setBoolean(Options.SKIPCRC.propertyname, skipCRCCheck); jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite); jobConf.setBoolean(Options.IGNORE_READ_FAILURES.propertyname, args.flags.contains(Options.IGNORE_READ_FAILURES)); jobConf.setBoolean(Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS)); final String randomId = getRandomId(); JobClient jClient = new JobClient(jobConf); Path stagingArea; try { stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf); } catch (InterruptedException e) { throw new IOException(e); } Path jobDirectory = new Path(stagingArea + NAME + "_" + randomId); FsPermission mapredSysPerms = new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION); FileSystem.mkdirs(jClient.getFs(), jobDirectory, mapredSysPerms); jobConf.set(JOB_DIR_LABEL, jobDirectory.toString()); long maxBytesPerMap = conf.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP); FileSystem dstfs = args.dst.getFileSystem(conf); // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), new Path[] { args.dst }, conf); boolean dstExists = dstfs.exists(args.dst); boolean dstIsDir = false; if (dstExists) { dstIsDir = dstfs.getFileStatus(args.dst).isDir(); } // default logPath Path logPath = args.log; if (logPath == null) { String filename = "_distcp_logs_" + randomId; if (!dstExists || !dstIsDir) { Path parent = args.dst.getParent(); if (null == parent) { // If dst is '/' on S3, it might not exist yet, but dst.getParent() // will return null. In this case, use '/' as its own parent to prevent // NPE errors below. parent = args.dst; } if (!dstfs.exists(parent)) { dstfs.mkdirs(parent); } logPath = new Path(parent, filename); } else { logPath = new Path(args.dst, filename); } } FileOutputFormat.setOutputPath(jobConf, logPath); // create src list, dst list FileSystem jobfs = jobDirectory.getFileSystem(jobConf); Path srcfilelist = new Path(jobDirectory, "_distcp_src_files"); jobConf.set(SRC_LIST_LABEL, srcfilelist.toString()); SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist, LongWritable.class, FilePair.class, SequenceFile.CompressionType.NONE); Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files"); SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class, Text.class, SequenceFile.CompressionType.NONE); Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs"); jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString()); SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class, FilePair.class, SequenceFile.CompressionType.NONE); // handle the case where the destination directory doesn't exist // and we've only a single src directory OR we're updating/overwriting // the contents of the destination directory. final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite; int srcCount = 0, cnsyncf = 0, dirsyn = 0; long fileCount = 0L, byteCount = 0L, cbsyncs = 0L; try { for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) { final Path src = srcItr.next(); FileSystem srcfs = src.getFileSystem(conf); FileStatus srcfilestat = srcfs.getFileStatus(src); Path root = special && srcfilestat.isDir() ? src : src.getParent(); if (srcfilestat.isDir()) { ++srcCount; } Stack<FileStatus> pathstack = new Stack<FileStatus>(); for (pathstack.push(srcfilestat); !pathstack.empty();) { FileStatus cur = pathstack.pop(); FileStatus[] children = srcfs.listStatus(cur.getPath()); for (int i = 0; i < children.length; i++) { boolean skipfile = false; final FileStatus child = children[i]; final String dst = makeRelative(root, child.getPath()); ++srcCount; if (child.isDir()) { pathstack.push(child); } else { //skip file if the src and the dst files are the same. skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst), skipCRCCheck); //skip file if it exceed file limit or size limit skipfile |= fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit; if (!skipfile) { ++fileCount; byteCount += child.getLen(); if (LOG.isTraceEnabled()) { LOG.trace("adding file " + child.getPath()); } ++cnsyncf; cbsyncs += child.getLen(); if (cnsyncf > SYNC_FILE_MAX || cbsyncs > maxBytesPerMap) { src_writer.sync(); dst_writer.sync(); cnsyncf = 0; cbsyncs = 0L; } } } if (!skipfile) { src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()), new FilePair(child, dst)); } dst_writer.append(new Text(dst), new Text(child.getPath().toString())); } if (cur.isDir()) { String dst = makeRelative(root, cur.getPath()); dir_writer.append(new Text(dst), new FilePair(cur, dst)); if (++dirsyn > SYNC_FILE_MAX) { dirsyn = 0; dir_writer.sync(); } } } } } finally { checkAndClose(src_writer); checkAndClose(dst_writer); checkAndClose(dir_writer); } FileStatus dststatus = null; try { dststatus = dstfs.getFileStatus(args.dst); } catch (FileNotFoundException fnfe) { LOG.info(args.dst + " does not exist."); } // create dest path dir if copying > 1 file if (dststatus == null) { if (srcCount > 1 && !dstfs.mkdirs(args.dst)) { throw new IOException("Failed to create" + args.dst); } } final Path sorted = new Path(jobDirectory, "_distcp_sorted"); checkDuplication(jobfs, dstfilelist, sorted, conf); if (dststatus != null && args.flags.contains(Options.DELETE)) { deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf); } Path tmpDir = new Path( (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst, "_distcp_tmp_" + randomId); jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString()); // Explicitly create the tmpDir to ensure that it can be cleaned // up by fullyDelete() later. tmpDir.getFileSystem(conf).mkdirs(tmpDir); LOG.info("sourcePathsCount=" + srcCount); LOG.info("filesToCopyCount=" + fileCount); LOG.info("bytesToCopyCount=" + StringUtils.humanReadableInt(byteCount)); jobConf.setInt(SRC_COUNT_LABEL, srcCount); jobConf.setLong(TOTAL_SIZE_LABEL, byteCount); setMapCount(byteCount, jobConf); return fileCount > 0; }
From source file:org.mitre.ccv.mapred.CalculateKmerRevisedRelativeEntropy.java
License:Open Source License
@Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf()); boolean cleanLogs = false; // @TODO: use commons getopts List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try {//from ww w .ja v a 2 s. com if ("-m".equals(args[i])) { conf.setNumMapTasks(Integer.parseInt(args[++i])); } else if ("-r".equals(args[i])) { conf.setNumReduceTasks(Integer.parseInt(args[++i])); } else if ("-c".equals(args[i])) { cleanLogs = true; } else if ("-t".equals(args[i])) { conf.setBoolean(TEXT_OUTPUT, true); } else if ("-libjars".equals(args[i])) { conf.set("tmpjars", FileUtils.validateFiles(args[++i], conf)); URL[] libjars = FileUtils.getLibJars(conf); if (libjars != null && libjars.length > 0) { // Add libjars to client/tasks classpath conf.setClassLoader(new URLClassLoader(libjars, conf.getClassLoader())); // Adds libjars to our classpath Thread.currentThread().setContextClassLoader( new URLClassLoader(libjars, Thread.currentThread().getContextClassLoader())); } } else { other_args.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } // Make sure there are exactly 2 parameters left. if (other_args.size() != 3) { System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 3."); return printUsage(); } //return initJob(conf, inTable, sb.toString().trim(), new Path(other_args.get(1))); return initJob(conf, other_args.get(0), other_args.get(1), other_args.get(2), cleanLogs); }
From source file:org.mitre.ccv.mapred.SortKmerRevisedRelativeEntropies.java
License:Open Source License
@Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf()); boolean cleanLogs = false; // @TODO: use commons getopts, org.apache.hadoop.util.GenericOptionsParser used it ArrayList<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try {/*from w w w.jav a2s . co m*/ if ("-m".equals(args[i])) { conf.setNumMapTasks(Integer.parseInt(args[++i])); } else if ("-r".equals(args[i])) { conf.setNumReduceTasks(Integer.parseInt(args[++i])); } else if ("-c".equals(args[i])) { cleanLogs = true; } else if ("-t".equals(args[i])) { conf.setBoolean(TEXT_OUTPUT, true); } else if ("-libjars".equals(args[i])) { conf.set("tmpjars", FileUtils.validateFiles(args[++i], conf)); URL[] libjars = FileUtils.getLibJars(conf); if (libjars != null && libjars.length > 0) { // Add libjars to client/tasks classpath conf.setClassLoader(new URLClassLoader(libjars, conf.getClassLoader())); // Adds libjars to our classpath Thread.currentThread().setContextClassLoader( new URLClassLoader(libjars, Thread.currentThread().getContextClassLoader())); } } else { other_args.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } // Make sure there are exactly 2 parameters left. if (other_args.size() != 2) { System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 3."); return printUsage(); } return initJob(conf, other_args.get(0), other_args.get(1), cleanLogs); }
From source file:org.smartfrog.services.hadoop.mapreduce.terasort.TeraGenJob.java
License:Apache License
@SuppressWarnings({ "ProhibitedExceptionDeclared" }) @Override/*from w ww.j a va2s . c o m*/ public RunningJob runJob(String[] args) throws Exception { JobConf job = (JobConf) getConf(); setNumberOfRows(job, Long.parseLong(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setJobName("TeraGen"); job.setJarByClass(TeraGenJob.class); job.setMapperClass(TeraGenMapper.class); job.setNumReduceTasks(0); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormat(TeraGenRangeInputFormat.class); job.setOutputFormat(TeraOutputFormat.class); job.setBoolean(ClusterConstants.MAPRED_DISABLE_TOOL_WARNING, true); return JobClient.runJob(job); }
From source file:org.smartfrog.services.hadoop.mapreduce.terasort.TeraOutputFormat.java
License:Apache License
/** * Set the requirement for a final sync before the stream is closed. *//*from w ww . jav a 2 s . c om*/ public static void setFinalSync(JobConf conf, boolean newValue) { conf.setBoolean(TeraConstants.FINAL_SYNC_ATTRIBUTE, newValue); }
From source file:org.smartfrog.services.hadoop.mapreduce.terasort.TeraSortJob.java
License:Apache License
@SuppressWarnings("ProhibitedExceptionDeclared") @Override// w w w . j av a 2s . co m public int run(String[] args) throws Exception { LOG.info("starting"); JobConf job = (JobConf) getConf(); Path inputDir = new Path(args[0]); inputDir = inputDir.makeQualified(inputDir.getFileSystem(job)); Path partitionFile = new Path(inputDir, TeraConstants.PARTITION_FILENAME); URI partitionUri = new URI(partitionFile.toString() + "#" + TeraConstants.PARTITION_FILENAME); TeraInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setJobName("TeraSort"); job.setJarByClass(TeraSortJob.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormat(TeraInputFormat.class); job.setOutputFormat(TeraOutputFormat.class); job.setPartitionerClass(TotalOrderPartitioner.class); job.setBoolean(ClusterConstants.MAPRED_DISABLE_TOOL_WARNING, true); TeraInputFormat.writePartitionFile(job, partitionFile); DistributedCache.addCacheFile(partitionUri, job); DistributedCache.createSymlink(job); job.setInt("dfs.replication", 1); job.setInt("mapred.submit.replication", 1); TeraOutputFormat.setFinalSync(job, true); RunningJob runningJob = JobClient.runJob(job); LOG.info("done"); return 0; }