List of usage examples for org.apache.hadoop.mapred JobConf getFloat
public float getFloat(String name, float defaultValue)
Gets the value of the name property as a float; if the property is not set, defaultValue is returned.
From source file:StreamWikiDumpInputFormat.java
License:Apache License
public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize) throws IOException { NetworkTopology clusterMap = new NetworkTopology(); List<InputSplit> splits = new ArrayList<InputSplit>(); Path path = file.getPath();/* w w w . j av a 2 s . com*/ long length = file.getLen(); FileSystem fs = file.getPath().getFileSystem(job); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(fs, path)) { long bytesRemaining = length; SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs); InputStream is = null; long start = 0; long skip = 0; if (is != null) { // start = is.getAdjustedStart(); // length = is.getAdjustedEnd(); is.close(); in = null; } LOG.info("locations=" + Arrays.asList(blkLocations)); FileSplit split = null; Set<Long> processedPageEnds = new HashSet<Long>(); float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F); READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) { // prepare matcher ByteMatcher matcher; { long st = Math.min(start + skip + splitSize, length - 1); split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations); System.err.println("split move to: " + split); if (in != null) in.close(); if (split.getLength() <= 1) { break; } in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs); // SplitCompressionInputStream cin = // in.getSplitCompressionInputStream(); } matcher = new ByteMatcher(in); // read until the next page end in the look-ahead split boolean reach = false; while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) { if (matcher.getPos() >= length || split.getLength() == length - split.getStart()) break READLOOP; reach = false; split = makeSplit(path, split.getStart(), Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap, blkLocations); System.err.println("split extend to: " + 
split); } System.err.println( path + ": #" + splits.size() + " " + pageEndPattern + " found: pos=" + matcher.getPos() + " last=" + matcher.getLastUnmatchPos() + " read=" + matcher.getReadBytes() + " current=" + start + " remaining=" + bytesRemaining + " split=" + split); if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos() && !processedPageEnds.contains(matcher.getPos())) { splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations)); processedPageEnds.add(matcher.getPos()); long newstart = Math.max(matcher.getLastUnmatchPos(), start); bytesRemaining = length - newstart; start = newstart; skip = 0; } else { skip = matcher.getPos() - start; } } if (bytesRemaining > 0 && !processedPageEnds.contains(length)) { System.err.println( pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length); splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())); } if (in != null) in.close(); } else if (length != 0) { splits.add(makeSplit(path, 0, length, clusterMap, blkLocations)); } else { // Create empty hosts array for zero length files splits.add(makeSplit(path, 0, length, new String[0])); } return splits; }
From source file:FIMReducer.java
License:Apache License
@Override public void configure(JobConf conf) { minFreqPercent = conf.getInt("PARMM.minFreqPercent", 20); sampleSize = conf.getInt("PARMM.sampleSize", 1000); epsilon = conf.getFloat("PARMM.epsilon", 0.05f); id = conf.getInt("mapred.task.partition", -1); set = false;//from ww w .j a v a 2 s. com }
From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java
License:Apache License
@Override protected synchronized void submit() { JobConf jobConf = this.getJobConf(); boolean isLocalHadoop = jobConf.get("mapred.job.tracker", "local").equals("local"); // the default partitioner is {@link com.ebay.erl.mobius.core.datajoin.DataJoinKeyPartitioner} // which is hash based. //// w w w .j a v a 2 s .c o m // If user choose to use even partitioner, Mobius will use // {@link com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner} which // is sampling based partitioner of attempting to balance the load // for each reducer. String partitioner = jobConf.get("mobius.partitioner", "default"); if (!isLocalHadoop && jobConf.getNumReduceTasks() != 0 && partitioner.equals("even")) { // this job needs reducer, perform sampling on the keys to // make load on reducers are almost evenly distributed. double freq = jobConf.getFloat("mobius.sampler.freq", 0.1F); int numSamples = jobConf.getInt("mobius.sampler.num.samples", 50000); int maxSplits = jobConf.getInt("mobius.sampler.max.slipts.sampled", 5); // log sampling parameters so that user knows. 
LOGGER.info("Sampling parameters { " + "mobius.sampler.freq:" + format.format(freq) + ", " + "mobius.sampler.num.samples:" + numSamples + ", " + "mobius.sampler.max.slipts.sampled:" + maxSplits + "}"); InputSampler.Sampler<?, ?> sampler = new MobiusInputSampler(freq, numSamples, maxSplits); writePartitionFile(jobConf, sampler); // add to distributed cache try { URI partitionUri = new URI(TotalOrderPartitioner.getPartitionFile(jobConf) + "#_partitions"); LOGGER.info("Adding partition uri to distributed cache:" + partitionUri.toString()); DistributedCache.addCacheFile(partitionUri, jobConf); DistributedCache.createSymlink(jobConf); jobConf.setPartitionerClass(EvenlyPartitioner.class); LOGGER.info("Using " + EvenlyPartitioner.class.getCanonicalName() + " to partiton the keys evenly among reducers."); } catch (URISyntaxException e) { LOGGER.error(e.getMessage(), e); throw new RuntimeException(e); } // adding -XX:-UseParallelOldGC, this will automatically set -XX:-UseParallelGC // according to Oracle's specification String jvmOpts = jobConf.get("mapred.child.java.opts", ""); if (jvmOpts.isEmpty()) { jvmOpts = "-XX:-UseParallelOldGC"; } else { if (jvmOpts.indexOf("-XX:-UseParallelOldGC") < 0) { // remove " jvmOpts = jvmOpts.replaceAll("\"", ""); jvmOpts = jvmOpts.concat(" -XX:-UseParallelOldGC"); } } jobConf.set("mapred.child.java.opts", jvmOpts); this.setJobConf(jobConf); } LOGGER.info("Submiting job:" + jobConf.getJobName()); super.submit(); }
From source file:com.mellanox.hadoop.mapred.UdaPlugin.java
License:Apache License
/**
 * Creates the reduce-task side of the UDA plugin: works out the shuffle memory budget,
 * initializes the buffers, launches the native side and sends it the INIT command.
 *
 * <p>The shuffle memory budget is {@code mapred.rdma.shuffle.total.size} when that value is
 * positive; otherwise it is the JVM's maximum memory multiplied by
 * {@code mapred.job.shuffle.input.buffer.percent} (reset to
 * {@code DEFAULT_SHUFFLE_INPUT_PERCENT} when outside [0, 1]).
 *
 * @param udaShuffleConsumer shared shuffle-consumer object stored on this instance
 * @param reduceTask         reduce task this plugin serves; its task id is sent to the native side
 * @param jobConf            job configuration the settings are read from
 * @param reporter           reporter stored as the task reporter
 * @param numMaps            number of map outputs (segments) to fetch
 * @throws IOException declared by the constructor signature
 */
public UdaPluginRT(UdaShuffleConsumerPluginShared udaShuffleConsumer, ReduceTask reduceTask, JobConf jobConf,
        Reporter reporter, int numMaps) throws IOException {
    super(jobConf);
    this.udaShuffleConsumer = udaShuffleConsumer;
    this.reduceTask = reduceTask;

    // Default 0 means: ignore this parameter and use -Xmx together with
    // mapred.job.shuffle.input.buffer.percent instead.
    String totalRdmaSizeStr = jobConf.get("mapred.rdma.shuffle.total.size", "0");
    long totalRdmaSize = StringUtils.TraditionalBinaryPrefix.string2long(totalRdmaSizeStr);
    long maxRdmaBufferSize = jobConf.getLong("mapred.rdma.buf.size", 1024);
    long minRdmaBufferSize = jobConf.getLong("mapred.rdma.buf.size.min", 16);
    long shuffleMemorySize = totalRdmaSize;

    StringBuilder meminfoSb = new StringBuilder();
    meminfoSb.append("UDA: numMaps=").append(numMaps);
    meminfoSb.append(", maxRdmaBufferSize=").append(maxRdmaBufferSize);
    meminfoSb.append("KB, minRdmaBufferSize=").append(minRdmaBufferSize).append("KB");
    // The unit suffix was already appended above; repeating it here printed "KBKB".
    meminfoSb.append(", rdmaShuffleTotalSize=").append(totalRdmaSize);

    if (totalRdmaSize < 0) {
        LOG.warn("Illegal paramter value: mapred.rdma.shuffle.total.size=" + totalRdmaSize);
    }

    if (totalRdmaSize <= 0) {
        long maxHeapSize = Runtime.getRuntime().maxMemory();
        double shuffleInputBufferPercent = jobConf.getFloat("mapred.job.shuffle.input.buffer.percent",
                DEFAULT_SHUFFLE_INPUT_PERCENT);
        if ((shuffleInputBufferPercent < 0) || (shuffleInputBufferPercent > 1)) {
            LOG.warn("UDA: mapred.job.shuffle.input.buffer.percent is out of range - set to default: "
                    + DEFAULT_SHUFFLE_INPUT_PERCENT);
            shuffleInputBufferPercent = DEFAULT_SHUFFLE_INPUT_PERCENT;
        }
        shuffleMemorySize = (long) (maxHeapSize * shuffleInputBufferPercent);
        LOG.info("Using JAVA Xmx with mapred.job.shuffle.input.buffer.percent to limit UDA shuffle memory");

        meminfoSb.append(", maxHeapSize=").append(maxHeapSize).append("B");
        meminfoSb.append(", shuffleInputBufferPercent=").append(shuffleInputBufferPercent);
        meminfoSb.append("==> shuffleMemorySize=").append(shuffleMemorySize).append("B");

        LOG.info("RDMA shuffle memory is limited to " + shuffleMemorySize / 1024 / 1024 + "MB");
    } else {
        LOG.info("Using mapred.rdma.shuffle.total.size to limit UDA shuffle memory");
        LOG.info("RDMA shuffle memory is limited to " + totalRdmaSize / 1024 / 1024 + "MB");
    }

    LOG.debug(meminfoSb.toString());
    LOG.info("UDA: user prefer rdma.buf.size=" + maxRdmaBufferSize + "KB");
    LOG.info("UDA: minimum rdma.buf.size=" + minRdmaBufferSize + "KB");

    if (jobConf.getSpeculativeExecution()) { // (getMapSpeculativeExecution() || getReduceSpeculativeExecution())
        LOG.info("UDA has limited support for map task speculative execution");
    }
    LOG.info("UDA: number of segments to fetch: " + numMaps);

    /* init variables */
    init_kv_bufs();

    launchCppSide(true, this); // true: this is RT => we should execute NetMerger

    this.j2c_queue = new J2CQueue<K, V>();
    this.mTaskReporter = reporter;
    this.mMapsNeed = numMaps;

    /* send init message */
    TaskAttemptID reduceId = reduceTask.getTaskID();

    mParams.clear();
    mParams.add(Integer.toString(numMaps));
    mParams.add(reduceId.getJobID().toString());
    mParams.add(reduceId.toString());
    mParams.add(jobConf.get("mapred.netmerger.hybrid.lpq.size", "0"));
    // In bytes: the raw value from the configuration, only converted from KB to bytes.
    mParams.add(Long.toString(maxRdmaBufferSize * 1024));
    // In bytes: passed so the receiver can check the buffer against the minimum after alignment.
    mParams.add(Long.toString(minRdmaBufferSize * 1024));
    mParams.add(jobConf.getOutputKeyClass().getName());

    boolean compression = jobConf.getCompressMapOutput(); //"true" or "false"
    String alg = null;
    if (compression) {
        alg = jobConf.get("mapred.map.output.compression.codec", null);
    }
    // Added even when null, so the positions of the following parameters stay the same.
    mParams.add(alg);

    String bufferSize = Integer.toString(256 * 1024);
    if (alg != null) {
        if (alg.contains("lzo.LzoCodec")) {
            bufferSize = jobConf.get("io.compression.codec.lzo.buffersize", bufferSize);
        } else if (alg.contains("SnappyCodec")) {
            bufferSize = jobConf.get("io.compression.codec.snappy.buffersize", bufferSize);
        }
    }
    mParams.add(bufferSize);
    mParams.add(Long.toString(shuffleMemorySize));

    // Keep only the local directories that pass the disk check.
    String[] dirs = jobConf.getLocalDirs();
    ArrayList<String> dirsCanBeCreated = new ArrayList<String>();
    for (String rawDir : dirs) {
        String dir = rawDir.trim();
        try {
            DiskChecker.checkDir(new File(dir));
            dirsCanBeCreated.add(dir);
        } catch (DiskErrorException e) {
            // Still best-effort, but leave a trace instead of dropping the directory silently.
            LOG.warn("UDA: local directory failed the disk check and will be skipped: " + dir + " ("
                    + e.getMessage() + ")");
        }
    }

    // Send the directory count followed by the directories themselves.
    int numDirs = dirsCanBeCreated.size();
    mParams.add(Integer.toString(numDirs));
    for (int i = 0; i < numDirs; i++) {
        mParams.add(dirsCanBeCreated.get(i));
    }

    LOG.info("mParams array is " + mParams);
    LOG.info("UDA: sending INIT_COMMAND");
    String msg = UdaCmd.formCmd(UdaCmd.INIT_COMMAND, mParams);
    UdaBridge.doCommand(msg);
    this.mProgress = new Progress();
    this.mProgress.set(0.5f);
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.ppjoin.MapJoin.java
License:Apache License
@Override public void configure(JobConf job) { ///*from www .j a v a2 s . c o m*/ // set Tokenizer and SimilarityFilters // tokenizer = TokenizerFactory.getTokenizer( job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE), FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR); String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE); float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY, FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE); similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold); // // set TokenRank and TokenGroup // Path tokensPath; Path lengthstatsPath = null; try { Path[] cache = DistributedCache.getLocalCacheFiles(job); if (cache == null) { tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY)); try { lengthstatsPath = new Path(job.get(FuzzyJoinDriver.DATA_LENGTHSTATS_PROPERTY)); } catch (IllegalArgumentException e) { } } else { tokensPath = cache[0]; if (job.getBoolean(FuzzyJoinDriver.TOKENS_LENGTHSTATS_PROPERTY, FuzzyJoinDriver.TOKENS_LENGTHSTATS_VALUE)) { lengthstatsPath = cache[1]; } } } catch (IOException e) { throw new RuntimeException(e); } String recordGroupClass = job.get(FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_PROPERTY, FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_VALUE); int recordGroupFactor = job.getInt(FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_PROPERTY, FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_VALUE); recordGroup = RecordGroupFactory.getRecordGroup(recordGroupClass, Math.max(1, job.getNumReduceTasks() * recordGroupFactor), similarityFilters, "" + lengthstatsPath); TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank); tokenLoad.loadTokenRank(); // // set dataColumn // dataColumns[0] = FuzzyJoinUtil .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE)); dataColumns[1] = FuzzyJoinUtil 
.getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA1_PROPERTY, FuzzyJoinConfig.RECORD_DATA1_VALUE)); // // get suffix for second relation // suffixSecond = job.get(FuzzyJoinDriver.DATA_SUFFIX_INPUT_PROPERTY, "") .split(FuzzyJoinDriver.SEPSARATOR_REGEX)[1]; }
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.ppjoin.MapSelfJoin.java
License:Apache License
@Override public void configure(JobConf job) { ////from www.j a va 2 s .c o m // set Tokenizer and SimilarityFilters // tokenizer = TokenizerFactory.getTokenizer( job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE), FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR); String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE); float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY, FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE); similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold); // // set TokenRank and TokenGroup // Path tokensPath; Path lengthstatsPath = null; try { Path[] cache = DistributedCache.getLocalCacheFiles(job); if (cache == null) { tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY)); try { lengthstatsPath = new Path(job.get(FuzzyJoinDriver.DATA_LENGTHSTATS_PROPERTY)); } catch (IllegalArgumentException e) { } } else { tokensPath = cache[0]; if (job.getBoolean(FuzzyJoinDriver.TOKENS_LENGTHSTATS_PROPERTY, FuzzyJoinDriver.TOKENS_LENGTHSTATS_VALUE)) { lengthstatsPath = cache[1]; } } } catch (IOException e) { throw new RuntimeException(e); } String recordGroupClass = job.get(FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_PROPERTY, FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_VALUE); int recordGroupFactor = job.getInt(FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_PROPERTY, FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_VALUE); recordGroup = RecordGroupFactory.getRecordGroup(recordGroupClass, Math.max(1, job.getNumReduceTasks() * recordGroupFactor), similarityFilters, "" + lengthstatsPath); TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank); tokenLoad.loadTokenRank(); // // set dataColumn // dataColumns = FuzzyJoinUtil .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE)); }
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.ppjoin.ReduceJoin.java
License:Apache License
/**
 * Reads the similarity threshold from the job configuration.
 *
 * @param job job configuration the threshold is read from
 */
@Override
public void configure(JobConf job) {
    float configuredThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE);
    similarityThreshold = configuredThreshold;
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.token.MapJoin.java
License:Apache License
@Override public void configure(JobConf job) { ///*from w ww.j a v a2 s .co m*/ // set Tokenizer and SimilarityFilters // tokenizer = TokenizerFactory.getTokenizer( job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE), FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR); String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE); float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY, FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE); similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold); // // set TokenRank // Path tokensPath; try { Path[] cache = DistributedCache.getLocalCacheFiles(job); if (cache == null) { tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY)); } else { tokensPath = cache[0]; } } catch (IOException e) { throw new RuntimeException(e); } TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank); tokenLoad.loadTokenRank(); // // set dataColumn // dataColumns[0] = FuzzyJoinUtil .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE)); dataColumns[1] = FuzzyJoinUtil .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA1_PROPERTY, FuzzyJoinConfig.RECORD_DATA1_VALUE)); // // get suffix for second relation // suffixSecond = job.get(FuzzyJoinDriver.DATA_SUFFIX_INPUT_PROPERTY, "") .split(FuzzyJoinDriver.SEPSARATOR_REGEX)[1]; }
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.token.MapSelfJoin.java
License:Apache License
@Override public void configure(JobConf job) { ////from www .j a va 2 s .c o m // set Tokenizer and SimilarityFilters // tokenizer = TokenizerFactory.getTokenizer( job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE), FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR); String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE); float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY, FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE); similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold); // // set TokenRank // Path tokensPath; try { Path[] cache = DistributedCache.getLocalCacheFiles(job); if (cache == null) { tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY)); } else { tokensPath = cache[0]; } } catch (IOException e) { throw new RuntimeException(e); } TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank); tokenLoad.loadTokenRank(); // // set dataColumn // dataColumns = FuzzyJoinUtil .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE)); }
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.token.ReduceVerifyListJoin.java
License:Apache License
@Override public void configure(JobConf job) { ///*w w w .ja v a 2 s . c o m*/ // set SimilarityFilters // String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE); similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY, FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE); similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold); // similarityMetric = SimilarityMetricFactory // .getSimilarityMetric(similarityName); }