List of usage examples for org.apache.hadoop.mapred JobConf getLong
public long getLong(String name, long defaultValue)
Returns the value of the name property as a long.
From source file: org.apache.sysml.runtime.matrix.sort.SamplingSortMRInputFormat.java
License:Apache License
/** * Use the input splits to take samples of the input and generate sample * keys. By default reads 100,000 keys from 10 locations in the input, sorts * them and picks N-1 keys to generate N equally sized partitions. * //from ww w . j a v a 2 s. co m * @param conf the job to sample * @param partFile where to write the output file to * @return index value * @throws IOException if something goes wrong * @throws InstantiationException if InstantiationException occurs * @throws IllegalAccessException if IllegalAccessException occurs */ @SuppressWarnings({ "unchecked", "unused", "deprecation" }) public static int writePartitionFile(JobConf conf, Path partFile) throws IOException, InstantiationException, IllegalAccessException { SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat(); Sampler sampler = new Sampler(); Class<? extends WritableComparable> targetKeyClass; targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS, WritableComparable.class); //get input converter information int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0); int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0); //indicate whether the matrix value in this mapper is a matrix cell or a matrix block int partitions = conf.getNumReduceTasks(); long sampleSize = conf.getLong(SAMPLE_SIZE, 1000); InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks()); int samples = Math.min(10, splits.length); long recordsPerSample = sampleSize / samples; int sampleStep = splits.length / samples; // take N samples from different parts of the input int totalcount = 0; for (int i = 0; i < samples; i++) { SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat .getRecordReader(splits[sampleStep * i], conf, null); int count = 0; WritableComparable key = (WritableComparable) reader.createKey(); Writable value = (Writable) reader.createValue(); while (reader.next(key, value) && count < recordsPerSample) { Converter 
inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0); inputConverter.setBlockSize(brlen, bclen); inputConverter.convert(key, value); while (inputConverter.hasNext()) { Pair pair = inputConverter.next(); if (pair.getKey() instanceof DoubleWritable) { sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get())); } else if (pair.getValue() instanceof MatrixCell) { sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue())); } else throw new IOException("SamplingSortMRInputFormat unsupported key/value class: " + pair.getKey().getClass() + ":" + pair.getValue().getClass()); count++; } key = (WritableComparable) reader.createKey(); value = (Writable) reader.createValue(); } totalcount += count; } if (totalcount == 0) //empty input files sampler.addValue(new DoubleWritable(0)); FileSystem outFs = partFile.getFileSystem(conf); if (outFs.exists(partFile)) { outFs.delete(partFile, false); } //note: key value always double/null as expected by partitioner SequenceFile.Writer writer = null; int index0 = -1; try { writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class, NullWritable.class); NullWritable nullValue = NullWritable.get(); int i = 0; boolean lessthan0 = true; for (WritableComparable splitValue : sampler.createPartitions(partitions)) { writer.append(splitValue, nullValue); if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) { index0 = i; lessthan0 = false; } i++; } if (lessthan0) index0 = partitions - 1; } finally { IOUtilFunctions.closeSilently(writer); } return index0; }
From source file:org.apache.tez.mapreduce.hadoop.TestDeprecatedKeys.java
License:Apache License
@Test(timeout = 5000) public void verifyReduceKeyTranslation() { JobConf jobConf = new JobConf(); jobConf.setFloat(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT, 0.4f); jobConf.setLong(MRJobConfig.REDUCE_MEMORY_TOTAL_BYTES, 20000l); jobConf.setInt(MRJobConfig.IO_SORT_FACTOR, 2000); jobConf.setFloat(MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT, 0.55f); jobConf.setFloat(MRJobConfig.REDUCE_MEMTOMEM_THRESHOLD, 0.60f); jobConf.setFloat(MRJobConfig.SHUFFLE_MERGE_PERCENT, 0.22f); jobConf.setBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, true); jobConf.setFloat(MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT, 0.33f); MRHelpers.translateMRConfToTez(jobConf); assertEquals(0.4f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT, 0f), 0.01f);//from w w w. ja va 2 s.c o m assertEquals(20000l, jobConf.getLong(Constants.TEZ_RUNTIME_TASK_MEMORY, 0)); assertEquals(2000, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, 0)); assertEquals(0.55f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT, 0), 0.01f); assertEquals(0.60f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMTOMEM_SEGMENTS, 0), 0.01f); assertEquals(0.22f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT, 0), 0.01f); assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_MEMTOMEM, false)); assertEquals(0.33f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT, 0), 0.01f); }
From source file:org.archive.mapred.ARCMapRunner.java
License:LGPL
public void configure(JobConf job) { this.mapper = (ARCRecordMapper) ReflectionUtils.newInstance(job.getMapperClass(), job); // Value is in minutes. this.maxtime = job.getLong("wax.index.timeout", 60) * 60 * 1000; }
From source file:org.commoncrawl.mapred.pipelineV3.RegExFilter.java
License:Open Source License
/**
 * Reads the filter settings from the job: the regular expression stored under
 * REGEX_KEY, the current input file name, and the start offset of the input
 * (-1 when "map.input.start" is not set).
 *
 * @param job job configuration to read from
 */
@Override
public void configure(JobConf job) {
    super.configure(job);

    pattern = Pattern.compile(job.get(REGEX_KEY));

    inputFile = job.get("map.input.file");
    inputFilePos = job.getLong("map.input.start", -1L);
}
From source file:org.hxx.hadoop.URLCountPartitioner.java
License:Apache License
public void configure(JobConf job) { seed = job.getInt("partition.url.seed", 0); normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION); topn = job.getLong(Generator.GENERATOR_TOP_N, 100000); hostn = job.getInt(Generator.GENERATOR_MAX_COUNT, -1); // cntStr = job.get(GeneratorHbase.GENERATL_CNT);// ? // int reduceNum = job.getInt(GeneratorHbase.GENERATL_REDUCENUM, 1); // initPart(reduceNum); }
From source file:org.smartfrog.services.hadoop.mapreduce.terasort.TeraInputFormat.java
License:Apache License
/** * Use the input splits to take samples of the input and generate sample keys. By default reads 100,000 keys from 10 * locations in the input, sorts them and picks N-1 keys to generate N equally sized partitions. * * @param conf the job to sample//from ww w.j a v a 2 s . com * @param partFile where to write the output file to * @throws IOException if something goes wrong */ public static void writePartitionFile(JobConf conf, Path partFile) throws IOException { TeraInputFormat inFormat = new TeraInputFormat(); TextSampler sampler = new TextSampler(); Text key = new Text(); Text value = new Text(); int partitions = conf.getNumReduceTasks(); long sampleSize = conf.getLong(TeraConstants.SAMPLE_SIZE, 100000); InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks()); int samples = Math.min(10, splits.length); long recordsPerSample = sampleSize / samples; int sampleStep = splits.length / samples; long records = 0; // take N samples from different parts of the input for (int i = 0; i < samples; ++i) { RecordReader<Text, Text> reader = inFormat.getRecordReader(splits[sampleStep * i], conf, null); while (reader.next(key, value)) { sampler.addKey(key); records += 1; if ((i + 1) * recordsPerSample <= records) { break; } } } FileSystem outFs = partFile.getFileSystem(conf); if (outFs.exists(partFile)) { outFs.delete(partFile, false); } SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class, NullWritable.class); NullWritable nullValue = NullWritable.get(); for (Text split : sampler.createPartitions(partitions)) { writer.append(split, nullValue); } writer.close(); }
From source file:org.wikimedia.wikihadoop.StreamWikiDumpInputFormat.java
License:Apache License
/** * Generate the list of files and make them into FileSplits. * @param job the job context/*from w w w.java 2 s .c o m*/ * @throws IOException */ @Override public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { LOG.info("StreamWikiDumpInputFormat.getSplits job=" + job + " n=" + numSplits); InputSplit[] oldSplits = super.getSplits(job, numSplits); List<InputSplit> splits = new ArrayList<InputSplit>(); FileStatus[] files = listStatus(job); // Save the number of input files for metrics/loadgen job.setLong(NUM_INPUT_FILES, files.length); long totalSize = 0; // compute total size for (FileStatus file : files) { // check we have valid files if (file.isDirectory()) { throw new IOException("Not a file: " + file.getPath()); } totalSize += file.getLen(); } long minSize = job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1); long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits); for (FileStatus file : files) { if (file.isDirectory()) { throw new IOException("Not a file: " + file.getPath()); } long blockSize = file.getBlockSize(); long splitSize = computeSplitSize(goalSize, minSize, blockSize); LOG.info(String.format("goalsize=%d splitsize=%d blocksize=%d", goalSize, splitSize, blockSize)); //System.err.println(String.format("goalsize=%d splitsize=%d blocksize=%d", goalSize, splitSize, blockSize)); for (InputSplit x : getSplits(job, file, pageBeginPattern, splitSize)) splits.add(x); } System.err.println("splits=" + splits); return splits.toArray(new InputSplit[splits.size()]); }
From source file:rugal.hadoop.repartition.enhanced.impl.OptimizedDataJoinReducerBase.java
License:Apache License
/**
 * Keeps a reference to the job configuration and reads the upper bound on the
 * number of values per group ("datajoin.maxNumOfValuesPerGroup", default 100).
 *
 * @param job job configuration to read from
 */
@Override
public void configure(JobConf job) {
    super.configure(job);

    final long configuredMax = job.getLong("datajoin.maxNumOfValuesPerGroup", 100);
    this.job = job;
    this.maxNumOfValuesPerGroup = configuredMax;
}
From source file:skewtune.mapreduce.STJobTracker.java
License:Apache License
/**
 * Builds the tracker and brings up its services, in this order:
 * <ol>
 *   <li>determines the owning user (keytab login when a keytab file is
 *       configured, otherwise the current user) and the supergroup;</li>
 *   <li>creates the delegation token secret manager and starts its threads;</li>
 *   <li>reads heartbeat settings, falling back to the defaults when the
 *       configured values are below the minimums;</li>
 *   <li>creates the RPC server and creates and starts the HTTP info server
 *       with its servlets, then writes the bound addresses back into the
 *       configuration;</li>
 *   <li>loops until the system directory is found with the expected owner and
 *       permissions, sleeping {@code FS_ACCESS_RETRY_PERIOD} between attempts;</li>
 *   <li>creates the cluster handle and job client proxy, starts the speculative
 *       scheduler and the task completion event fetcher, and sets up the async
 *       disk service and worker pool.</li>
 * </ol>
 *
 * @param conf configuration to read from; the resolved IPC and HTTP addresses
 *        are written back into it
 * @param jobtrackerIndentifier value stored as this tracker's identifier
 * @throws IOException on login, server or file system failures, including the
 *         AccessControlException rethrown when the system directory is not
 *         owned by the tracker's user
 * @throws InterruptedException if the thread is interrupted while waiting for
 *         the system directory check to succeed
 */
@SuppressWarnings("unchecked")
STJobTracker(final JobConf conf, String jobtrackerIndentifier) throws IOException, InterruptedException {
    // find the owner of the process
    // get the desired principal to load
    String keytabFilename = conf.get(JTConfig.JT_KEYTAB_FILE);
    UserGroupInformation.setConfiguration(conf);
    if (keytabFilename != null) {
        String desiredUser = conf.get(JTConfig.JT_USER_NAME, System.getProperty("user.name"));
        UserGroupInformation.loginUserFromKeytab(desiredUser, keytabFilename);
        mrOwner = UserGroupInformation.getLoginUser();
    } else {
        mrOwner = UserGroupInformation.getCurrentUser();
    }

    supergroup = conf.get(MR_SUPERGROUP, "supergroup");
    LOG.info("Starting jobtracker with owner as " + mrOwner.getShortUserName() + " and supergroup as "
            + supergroup);

    // Delegation token timing, each with its MRConfig default.
    long secretKeyInterval = conf.getLong(MRConfig.DELEGATION_KEY_UPDATE_INTERVAL_KEY,
            MRConfig.DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT);
    long tokenMaxLifetime = conf.getLong(MRConfig.DELEGATION_TOKEN_MAX_LIFETIME_KEY,
            MRConfig.DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT);
    long tokenRenewInterval = conf.getLong(MRConfig.DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
            MRConfig.DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT);
    secretManager = new DelegationTokenSecretManager(secretKeyInterval, tokenMaxLifetime, tokenRenewInterval,
            DELEGATION_TOKEN_GC_INTERVAL);
    secretManager.startThreads();

    //
    // Grab some static constants
    //

    // Values below the minimum are replaced by the default, not clamped to the minimum.
    NUM_HEARTBEATS_IN_SECOND = conf.getInt(JT_HEARTBEATS_IN_SECOND, DEFAULT_NUM_HEARTBEATS_IN_SECOND);
    if (NUM_HEARTBEATS_IN_SECOND < MIN_NUM_HEARTBEATS_IN_SECOND) {
        NUM_HEARTBEATS_IN_SECOND = DEFAULT_NUM_HEARTBEATS_IN_SECOND;
    }

    HEARTBEATS_SCALING_FACTOR = conf.getFloat(JT_HEARTBEATS_SCALING_FACTOR, DEFAULT_HEARTBEATS_SCALING_FACTOR);
    if (HEARTBEATS_SCALING_FACTOR < MIN_HEARTBEATS_SCALING_FACTOR) {
        HEARTBEATS_SCALING_FACTOR = DEFAULT_HEARTBEATS_SCALING_FACTOR;
    }

    // whether to dump or not every heartbeat message even when DEBUG is enabled
    dumpHeartbeat = conf.getBoolean(JT_HEARTBEATS_DUMP, false);

    // This is a directory of temporary submission files. We delete it
    // on startup, and can delete any files that we're done with
    this.conf = conf;
    // NOTE(review): jobConf is not referenced again in this constructor.
    JobConf jobConf = new JobConf(conf);

    // Set ports, start RPC servers, setup security policy etc.
    InetSocketAddress addr = getAddress(conf);
    this.localMachine = addr.getHostName();
    this.port = addr.getPort();

    int handlerCount = conf.getInt(JT_IPC_HANDLER_COUNT, 10);
    this.interTrackerServer = RPC.getServer(SkewTuneClientProtocol.class, this, addr.getHostName(),
            addr.getPort(), handlerCount, false, conf, secretManager);
    // Dump all JVM system properties when debug logging is on.
    if (LOG.isDebugEnabled()) {
        Properties p = System.getProperties();
        for (Iterator it = p.keySet().iterator(); it.hasNext();) {
            String key = (String) it.next();
            String val = p.getProperty(key);
            LOG.debug("Property '" + key + "' is " + val);
        }
    }

    // HTTP address defaults to "<localMachine>:0" when not configured.
    InetSocketAddress infoSocAddr = NetUtils
            .createSocketAddr(conf.get(JT_HTTP_ADDRESS, String.format("%s:0", this.localMachine)));
    String infoBindAddress = infoSocAddr.getHostName();
    int tmpInfoPort = infoSocAddr.getPort();
    this.startTime = System.currentTimeMillis();
    // NOTE(review): the fourth argument (tmpInfoPort == 0) presumably lets the server pick a free port - confirm.
    infoServer = new HttpServer("job", infoBindAddress, tmpInfoPort, tmpInfoPort == 0, conf);
    infoServer.setAttribute("job.tracker", this);
    infoServer.addServlet("jobcompletion", "/completion", JobCompletionServlet.class);
    infoServer.addServlet("taskspeculation", "/speculation", SpeculationEventServlet.class);
    infoServer.addServlet("skewreport", "/skew", SkewReportServlet.class);
    infoServer.addServlet("tasksplit", "/split/*", SplitTaskServlet.class);
    infoServer.addServlet("tasksplitV2", "/splitV2/*", SplitTaskV2Servlet.class);
    infoServer.start();

    this.trackerIdentifier = jobtrackerIndentifier;

    // The rpc/web-server ports can be ephemeral ports...
    // ... ensure we have the correct info
    this.port = interTrackerServer.getListenerAddress().getPort();
    this.conf.set(JT_IPC_ADDRESS, (this.localMachine + ":" + this.port));
    LOG.info("JobTracker up at: " + this.port);
    this.infoPort = this.infoServer.getPort();
    this.conf.set(JT_HTTP_ADDRESS, infoBindAddress + ":" + this.infoPort);
    LOG.info("JobTracker webserver: " + this.infoServer.getPort());
    // URL templates pointing at the servlets registered above.
    this.defaultNotificationUrl = String.format("http://%s:%d/completion?jobid=$jobId&status=$jobStatus",
            infoBindAddress, this.infoPort);
    LOG.info("JobTracker completion URI: " + defaultNotificationUrl);
    // this.defaultSpeculationEventUrl = String.format("http://%s:%d/speculation?taskid=$taskId&remainTime=$taskRemainTime",infoBindAddress,this.infoPort);
    this.defaultSpeculationEventUrl = String.format("http://%s:%d/speculation?jobid=$jobId", infoBindAddress,
            this.infoPort);
    LOG.info("JobTracker speculation event URI: " + defaultSpeculationEventUrl);
    this.defaultSkewReportUrl = String.format("http://%s:%d/skew", infoBindAddress, this.infoPort);
    LOG.info("JobTracker skew report event URI: " + defaultSkewReportUrl);
    this.trackerHttp = String.format("http://%s:%d", infoBindAddress, this.infoPort);

    // Retry until the system directory has the expected owner and permissions.
    // The only exit besides an exception or interrupt is the break below.
    while (!Thread.currentThread().isInterrupted()) {
        try {
            // if we haven't contacted the namenode go ahead and do it
            if (fs == null) {
                fs = mrOwner.doAs(new PrivilegedExceptionAction<FileSystem>() {
                    @Override
                    public FileSystem run() throws IOException {
                        return FileSystem.get(conf);
                    }
                });
            }
            // clean up the system dir, which will only work if hdfs is out
            // of safe mode
            if (systemDir == null) {
                systemDir = new Path(getSystemDir());
            }
            try {
                FileStatus systemDirStatus = fs.getFileStatus(systemDir);
                if (!systemDirStatus.getOwner().equals(mrOwner.getShortUserName())) {
                    throw new AccessControlException(
                            "The systemdir " + systemDir + " is not owned by " + mrOwner.getShortUserName());
                }
                if (!systemDirStatus.getPermission().equals(SYSTEM_DIR_PERMISSION)) {
                    LOG.warn("Incorrect permissions on " + systemDir + ". Setting it to " + SYSTEM_DIR_PERMISSION);
                    // permissions are re-checked on the next iteration
                    fs.setPermission(systemDir, new FsPermission(SYSTEM_DIR_PERMISSION));
                } else {
                    break;
                }
            } catch (FileNotFoundException fnf) {
            } // ignore
            // NOTE(review): a missing system directory is ignored and never created here,
            // so the loop keeps retrying until it appears or the thread is interrupted.
        } catch (AccessControlException ace) {
            LOG.warn("Failed to operate on " + JTConfig.JT_SYSTEM_DIR + "(" + systemDir
                    + ") because of permissions.");
            LOG.warn("Manually delete the " + JTConfig.JT_SYSTEM_DIR + "(" + systemDir
                    + ") and then start the JobTracker.");
            LOG.warn("Bailing out ... ");
            throw ace;
        } catch (IOException ie) {
            // other I/O problems are logged and retried after the sleep
            LOG.info("problem cleaning system directory: " + systemDir, ie);
        }
        Thread.sleep(FS_ACCESS_RETRY_PERIOD);
    }

    if (Thread.currentThread().isInterrupted()) {
        throw new InterruptedException();
    }

    // initialize cluster variable
    cluster = new Cluster(this.conf);

    // now create a job client proxy
    jtClient = (ClientProtocol) RPC.getProxy(ClientProtocol.class, ClientProtocol.versionID,
            JobTracker.getAddress(conf), mrOwner, this.conf, NetUtils.getSocketFactory(conf, ClientProtocol.class));

    new SpeculativeScheduler().start();

    // initialize task event fetcher
    new TaskCompletionEventFetcher().start();

    // Same with 'localDir' except it's always on the local disk.
    asyncDiskService = new MRAsyncDiskService(FileSystem.getLocal(conf), conf.getLocalDirs());
    asyncDiskService.moveAndDeleteFromEachVolume(SUBDIR);

    // keep at least one asynchronous worker per CPU core
    int numProcs = Runtime.getRuntime().availableProcessors();
    LOG.info("# of available processors = " + numProcs);
    int maxFactor = conf.getInt(JT_MAX_ASYNC_WORKER_FACTOR, 2);
    // Pool of numProcs core threads, up to numProcs * maxFactor, 30s keep-alive;
    // tasks are handed off directly and run in the caller when the pool is saturated.
    asyncWorkers = new ThreadPoolExecutor(numProcs, numProcs * maxFactor, 30, TimeUnit.SECONDS,
            new SynchronousQueue<Runnable>(true), new ThreadPoolExecutor.CallerRunsPolicy());

    speculativeSplit = conf.getBoolean(JT_SPECULATIVE_SPLIT, false);
}
From source file:StorageEngineClient.CombineColumnStorageFileInputFormat.java
License:Open Source License
/**
 * Builds combined splits for the job's input paths.
 *
 * <p>Split size limits come from this instance's explicit settings when they are
 * non-zero, otherwise from the job configuration
 * ("mapred.min.split.size.per.node", "mapred.min.split.size.per.rack",
 * "mapred.max.split.size"). A still-unset maximum defaults to 80% of
 * "dfs.block.size" (itself defaulting to 512 MB), and still-unset minimums
 * default to half of the maximum.
 *
 * <p>Paths accepted by a pool filter are combined per pool; all remaining paths
 * are combined together afterwards. If no combined split results, the parent
 * implementation is used instead.
 *
 * @param job the job configuration
 * @param numSplits split count hint, only passed on to the parent implementation
 * @return the combined splits; an empty array when there are no input paths
 * @throws IOException if the size limits are inconsistent or listing/splitting fails
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;

    // Explicit instance settings win over the job configuration.
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = job.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = job.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = job.getLong("mapred.max.split.size", 0);
    }

    // Fall back to derived defaults for anything still unset.
    if (maxSize == 0) {
        maxSize = (long) (job.getLong("dfs.block.size", 512 * 1024 * 1024) * 0.8);
    }
    if (minSizeNode == 0) {
        minSizeNode = maxSize / 2;
    }
    if (minSizeRack == 0) {
        minSizeRack = maxSize / 2;
    }

    // Required ordering: minSizeNode <= minSizeRack <= maxSize.
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack " + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than minimum split size per rack " + minSizeRack);
    }

    Path[] paths = FileUtil.stat2Paths(listStatus(job));
    List<CombineFileSplit> splits = new ArrayList<CombineFileSplit>();
    if (paths.length == 0) {
        return splits.toArray(new CombineFileSplit[splits.size()]);
    }

    // Combine the paths of each pool separately; claimed paths are nulled out
    // so they are not picked up again by a later pool or the final pass.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();
        for (int i = 0; i < paths.length; i++) {
            if (paths[i] == null) {
                continue;
            }
            // filters are matched against the path component only
            Path p = new Path(paths[i].toUri().getPath());
            if (onepool.accept(p)) {
                myPaths.add(paths[i]);
                paths[i] = null;
            }
        }
        getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack,
                splits);
    }

    // Everything not claimed by a pool is combined together.
    ArrayList<Path> myPaths = new ArrayList<Path>();
    for (int i = 0; i < paths.length; i++) {
        if (paths[i] == null) {
            continue;
        }
        myPaths.add(paths[i]);
    }
    LOG.info("myPaths size:\t" + myPaths.size());
    getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    if (splits.size() == 0) {
        return super.getSplits(job, numSplits);
    }
    LOG.info("splits #:\t" + splits.size());
    return splits.toArray(new CombineFileSplit[splits.size()]);
}