List of usage examples for org.apache.hadoop.mapred JobConf getInt
public int getInt(String name, int defaultValue)
Gets the value of the name property as an int. If no such property exists, the supplied defaultValue is returned; if the stored value is not a valid int, an error is thrown.
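Before the project-specific examples, here is a minimal sketch (not taken from any of the source files listed below) of the typical pattern: read a tunable integer from the job configuration in a mapper's configure method and fall back to a default when the property is unset. The property name my.example.batch.size and the class ExampleMapper are hypothetical and used only for illustration.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class ExampleMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {

    // Hypothetical tunable read once per task from the job configuration.
    private int batchSize;

    @Override
    public void configure(JobConf job) {
        // Returns 500 when my.example.batch.size is not set in the job conf.
        batchSize = job.getInt("my.example.batch.size", 500);
    }

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        // batchSize is now available to every map() call.
        output.collect(value, new IntWritable(batchSize));
    }
}

On the driver side the value would typically be supplied with job.setInt("my.example.batch.size", 1000) before the job is submitted; getInt then returns 1000 instead of the default of 500.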
From source file: StorageEngineClient.CombineFileInputFormat.java
License: Open Source License
private void processsplit(JobConf job, Map.Entry<String, List<OneBlockInfo>> one,
        HashMap<OneBlockInfo, String[]> blockToNodes, long maxSize, long minSizeNode, long minSizeRack,
        List<CombineFileSplit> splits, String type) {
    ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
    ArrayList<String> nodes = new ArrayList<String>();
    long curSplitSize = 0;
    if (type.equals("node"))
        nodes.add(one.getKey());
    List<OneBlockInfo> blocks = null;
    if (!type.equals("all")) {
        blocks = one.getValue();
    } else {
        blocks = new ArrayList<OneBlockInfo>();
        blocks.addAll(blockToNodes.keySet());
    }
    OneBlockInfo[] blocksInNodeArr = blocks.toArray(new OneBlockInfo[blocks.size()]);
    if (job.getBoolean("hive.merge.inputfiles.sort", true)) {
        Arrays.sort(blocksInNodeArr, new Comparator<OneBlockInfo>() {
            @Override
            public int compare(OneBlockInfo o1, OneBlockInfo o2) {
                return (int) (o2.length - o1.length);
            }
        });
    }
    if (job.getBoolean("hive.merge.inputfiles.rerange", false)) {
        Random r = new Random(123456);
        OneBlockInfo tmp = null;
        for (int i = 0; i < blocksInNodeArr.length; i++) {
            int idx = r.nextInt(blocksInNodeArr.length);
            tmp = blocksInNodeArr[i];
            blocksInNodeArr[i] = blocksInNodeArr[idx];
            blocksInNodeArr[idx] = tmp;
        }
    }
    int maxFileNumPerSplit = job.getInt("hive.merge.inputfiles.maxFileNumPerSplit", 1000);
    for (int i = 0; i < blocksInNodeArr.length; i++) {
        if (blockToNodes.containsKey(blocksInNodeArr[i])) {
            if (!type.equals("node")) {
                nodes.clear();
            }
            curSplitSize = blocksInNodeArr[i].length;
            validBlocks.clear();
            validBlocks.add(blocksInNodeArr[i]);
            blockToNodes.remove(blocksInNodeArr[i]);
            if (maxSize != 0 && curSplitSize >= maxSize) {
                addCreatedSplit(job, splits, nodes, validBlocks);
            } else {
                int filenum = 1;
                for (int j = i + 1; j < blocksInNodeArr.length; j++) {
                    if (blockToNodes.containsKey(blocksInNodeArr[j])) {
                        long size1 = blocksInNodeArr[j].length;
                        if (maxSize != 0 && curSplitSize + size1 <= maxSize) {
                            curSplitSize += size1;
                            filenum++;
                            validBlocks.add(blocksInNodeArr[j]);
                            blockToNodes.remove(blocksInNodeArr[j]);
                            if (!type.equals("node"))
                                for (int k = 0; k < blocksInNodeArr[j].hosts.length; k++) {
                                    nodes.add(blocksInNodeArr[j].hosts[k]);
                                }
                        }
                        if (filenum >= maxFileNumPerSplit) {
                            break;
                        }
                    }
                }
                if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
                    addCreatedSplit(job, splits, nodes, validBlocks);
                } else {
                    for (OneBlockInfo oneblock : validBlocks) {
                        blockToNodes.put(oneblock, oneblock.hosts);
                    }
                    break;
                }
            }
        }
    }
}
From source file: StorageEngineClient.CombineFileInputFormat.java
License: Open Source License
private void processsplitForUnsplit(JobConf job, Map.Entry<String, List<OneBlockInfo>> one,
        HashMap<OneBlockInfo, String[]> blockToNodes, long maxSize, long minSizeNode, long minSizeRack,
        List<CombineFileSplit> splits, String type) {
    ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
    ArrayList<String> nodes = new ArrayList<String>();
    long curSplitSize = 0;
    if (type.equals("node"))
        nodes.add(one.getKey());
    List<OneBlockInfo> blocks = null;
    if (!type.equals("all")) {
        blocks = one.getValue();
    } else {
        blocks = new ArrayList<OneBlockInfo>();
        blocks.addAll(blockToNodes.keySet());
    }
    OneBlockInfo[] blocksInNodeArr = blocks.toArray(new OneBlockInfo[blocks.size()]);
    if (job.getBoolean("hive.merge.inputfiles.sort", true)) {
        Arrays.sort(blocksInNodeArr, new Comparator<OneBlockInfo>() {
            @Override
            public int compare(OneBlockInfo o1, OneBlockInfo o2) {
                long comparereuslt = o2.length - o1.length;
                int result = 0;
                if (comparereuslt > 0)
                    result = 1;
                if (comparereuslt < 0)
                    result = -1;
                return result;
            }
        });
    }
    if (job.getBoolean("hive.merge.inputfiles.rerange", false)) {
        Random r = new Random(123456);
        OneBlockInfo tmp = null;
        for (int i = 0; i < blocksInNodeArr.length; i++) {
            int idx = r.nextInt(blocksInNodeArr.length);
            tmp = blocksInNodeArr[i];
            blocksInNodeArr[i] = blocksInNodeArr[idx];
            blocksInNodeArr[idx] = tmp;
        }
    }
    int maxFileNumPerSplit = job.getInt("hive.merge.inputfiles.maxFileNumPerSplit", 1000);
    for (int i = 0; i < blocksInNodeArr.length; i++) {
        if (blockToNodes.containsKey(blocksInNodeArr[i])) {
            if (!type.equals("node")) {
                nodes.clear();
            }
            curSplitSize = blocksInNodeArr[i].length;
            validBlocks.clear();
            validBlocks.add(blocksInNodeArr[i]);
            blockToNodes.remove(blocksInNodeArr[i]);
            if (maxSize != 0 && curSplitSize >= maxSize) {
                if (!type.equals("node")) {
                    for (int k = 0; k < blocksInNodeArr[i].hosts.length; k++) {
                        nodes.add(blocksInNodeArr[i].hosts[k]);
                    }
                }
                addCreatedSplit(job, splits, nodes, validBlocks);
            } else {
                int filenum = 1;
                for (int j = i + 1; j < blocksInNodeArr.length; j++) {
                    if (blockToNodes.containsKey(blocksInNodeArr[j])) {
                        long size1 = blocksInNodeArr[j].length;
                        if (maxSize != 0 && curSplitSize < maxSize) {
                            curSplitSize += size1;
                            filenum++;
                            validBlocks.add(blocksInNodeArr[j]);
                            blockToNodes.remove(blocksInNodeArr[j]);
                        }
                        if (filenum >= maxFileNumPerSplit) {
                            break;
                        }
                        if (curSplitSize >= maxSize) {
                            break;
                        }
                    }
                }
                if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
                    if (!type.equals("node")) {
                        generateNodesInfo(validBlocks, nodes);
                    }
                    addCreatedSplit(job, splits, nodes, validBlocks);
                } else {
                    for (OneBlockInfo oneblock : validBlocks) {
                        blockToNodes.put(oneblock, oneblock.hosts);
                    }
                    break;
                }
            }
        }
    }
    HashSet<OneBlockInfo> hs = new HashSet<OneBlockInfo>();
    while (blockToNodes.size() > 0) {
        validBlocks = new ArrayList<OneBlockInfo>();
        nodes = new ArrayList<String>();
        int filenum = 0;
        hs.clear();
        for (OneBlockInfo blockInfo : blockToNodes.keySet()) {
            filenum++;
            validBlocks.add(blockInfo);
            hs.add(blockInfo);
            if (filenum >= maxFileNumPerSplit) {
                break;
            }
        }
        for (OneBlockInfo blockInfo : hs) {
            blockToNodes.remove(blockInfo);
        }
        generateNodesInfo(validBlocks, nodes);
        this.addCreatedSplit(job, splits, nodes, validBlocks);
    }
}
From source file: StorageEngineClient.FormatStorageInputFormat_SplitByLineNum.java
License: Open Source License
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    List<FormatStorageInputSplit_WithLineNum> splits = new ArrayList<FormatStorageInputSplit_WithLineNum>();
    int lenNum = job.getInt("hive.inputfiles.line_num_per_split", 1000000);
    if (lenNum < 10000) {
        LOG.info("lenNum been set to " + lenNum + " is too small, so set it to 1000000");
        lenNum = 1000000;
    }
    FileStatus[] fss = listStatus(job);
    FileStatus[] orignalFss = fss;
    List<FileStatus> fssList = new ArrayList<FileStatus>();
    for (int i = 0; i < fss.length; i++) {
        if (fss[i].getLen() > 0) {
            fssList.add(fss[i]);
        }
    }
    fss = (FileStatus[]) fssList.toArray(new FileStatus[0]);
    int listSize = fss.length;
    if (listSize == 0) {
        mapredWork mrWork = Utilities.getMapRedWork(job);
        Path inputPath = orignalFss[0].getPath();
        Path inputParentPath = inputPath.getParent();
        String inputPathStr = inputPath.toUri().toString();
        String inputPathParentStr = inputParentPath.toString();
        FileSystem fs = inputPath.getFileSystem(job);
        fs.delete(inputPath, true);
        LinkedHashMap<String, partitionDesc> partDescMap = mrWork.getPathToPartitionInfo();
        partitionDesc partDesc = partDescMap.get(inputPathParentStr);
        job.setBoolean("NeedPostfix", false);
        RecordWriter recWriter = new FormatStorageHiveOutputFormat().getHiveRecordWriter(job, inputPath,
                Text.class, false, partDesc.getTableDesc().getProperties(), null);
        recWriter.close(false);
        job.setBoolean("NeedPostfix", true);
        fss = listStatus(job);
    }
    Random r = new Random(123456);
    for (int i = 0; i < fss.length; i++) {
        int x = r.nextInt(fss.length);
        FileStatus tmp = fss[i];
        fss[i] = fss[x];
        fss[x] = tmp;
    }
    int[] fslengths = new int[fss.length];
    for (int i = 0; i < fss.length; i++) {
        IFormatDataFile ifdf = new IFormatDataFile(job);
        ifdf.open(fss[i].getPath().toString());
        fslengths[i] = ifdf.recnum();
        ifdf.close();
    }
    int id = 0;
    int offset = 0;
    int currlen = 0;
    ArrayList<FileSplit> currFileSplits = new ArrayList<FormatStorageInputFormat_SplitByLineNum.FileSplit>();
    while (true) {
        int need = lenNum - currlen;
        int remain = fslengths[id] - offset;
        if (need <= remain) {
            currFileSplits.add(new FileSplit(fss[id].getPath().toString(), offset, need));
            splits.add(new FormatStorageInputSplit_WithLineNum(
                    currFileSplits.toArray(new FileSplit[currFileSplits.size()]),
                    fss[id].getPath().getFileSystem(job).getFileBlockLocations(fss[id], 0, fss[id].getLen())[0]
                            .getHosts()));
            currFileSplits.clear();
            currlen = 0;
            offset += need;
        } else {
            if (remain != 0) {
                currFileSplits.add(new FileSplit(fss[id].getPath().toString(), offset, remain));
            }
            id++;
            offset = 0;
            currlen += remain;
        }
        if (id == fss.length) {
            if (currFileSplits.size() != 0) {
                splits.add(new FormatStorageInputSplit_WithLineNum(
                        currFileSplits.toArray(new FileSplit[currFileSplits.size()]),
                        fss[id - 1].getPath().getFileSystem(job).getFileBlockLocations(fss[id - 1], 0,
                                fss[id - 1].getLen())[0].getHosts()));
            }
            break;
        }
    }
    if (splits.size() == 0) {
        ArrayList<FileSplit> emptyFileSplits = new ArrayList<FormatStorageInputFormat_SplitByLineNum.FileSplit>();
        emptyFileSplits.add(new FileSplit(fss[0].getPath().toString(), 0, 0));
        splits.add(new FormatStorageInputSplit_WithLineNum(
                emptyFileSplits.toArray(new FileSplit[emptyFileSplits.size()]),
                fss[0].getPath().getFileSystem(job).getFileBlockLocations(fss[0], 0, fss[0].getLen())[0]
                        .getHosts()));
    }
    for (int i = 0; i < splits.size(); i++) {
        LOG.info(splits.get(i).toString());
    }
    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new FormatStorageInputSplit_WithLineNum[splits.size()]);
}
From source file: tachyon.hadoop.fs.IOMapperBase.java
License: Apache License
public void configure(JobConf conf) {
    setConf(conf);
    try {
        mFS = FileSystem.get(conf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot create file system.", e);
    }
    mBufferSize = conf.getInt("test.io.file.buffer.size", 4096);
    mBuffer = new byte[mBufferSize];
    try {
        mHostname = InetAddress.getLocalHost().getHostName();
    } catch (Exception e) {
        mHostname = "localhost";
    }
}
From source file: uk.bl.wa.hadoop.indexer.mdx.WARCMDXGenerator.java
License: Open Source License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Store application properties where the mappers/reducers can access
    // them
    Config index_conf;
    if (this.configPath != null) {
        LOG.info("Loading config from: " + configPath);
        index_conf = ConfigFactory.parseFile(new File(this.configPath));
    } else {
        LOG.info("Using default config: mdx");
        index_conf = ConfigFactory.load("mdx");
    }
    if (this.dumpConfig) {
        ConfigPrinter.print(index_conf);
        System.exit(0);
    }
    conf.set(CONFIG_PROPERTIES, index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
    LOG.info("Loaded warc config: " + index_conf.getString("warc.title"));

    // Reducer count:
    int numReducers = 10;
    if (index_conf.hasPath(WARC_HADOOP_NUM_REDUCERS)) {
        numReducers = index_conf.getInt(WARC_HADOOP_NUM_REDUCERS);
    }
    if (conf.getInt(WARC_HADOOP_NUM_REDUCERS, -1) != -1) {
        LOG.info("Overriding num_reducers using Hadoop config.");
        numReducers = conf.getInt(WARC_HADOOP_NUM_REDUCERS, numReducers);
    }

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCMDXMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);
    // SequenceFileOutputFormat.setOutputCompressionType(conf,
    // CompressionType.BLOCK);
    // OR TextOutputFormat?
    // conf.set("map.output.key.field.separator", "");

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
    // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);
}
From source file: voldemort.store.readonly.disk.HadoopStoreWriterPerBucket.java
License: Apache License
@Override
public void conf(JobConf job) {
    JobConf conf = job;
    try {
        this.cluster = new ClusterMapper().readCluster(new StringReader(conf.get("cluster.xml")));
        List<StoreDefinition> storeDefs = new StoreDefinitionsMapper()
                .readStoreList(new StringReader(conf.get("stores.xml")));
        if (storeDefs.size() != 1)
            throw new IllegalStateException("Expected to find only a single store, but found multiple!");
        this.storeDef = storeDefs.get(0);

        this.numChunks = conf.getInt("num.chunks", -1);
        if (this.numChunks < 1)
            throw new VoldemortException("num.chunks not specified in the job conf.");

        this.saveKeys = conf.getBoolean("save.keys", false);
        this.reducerPerBucket = conf.getBoolean("reducer.per.bucket", false);
        this.conf = job;
        this.outputDir = job.get("final.output.dir");
        this.taskId = job.get("mapred.task.id");
        this.checkSumType = CheckSum.fromString(job.get("checksum.type"));

        this.checkSumDigestIndex = new CheckSum[getNumChunks()];
        this.checkSumDigestValue = new CheckSum[getNumChunks()];
        this.position = new int[getNumChunks()];
        this.taskIndexFileName = new Path[getNumChunks()];
        this.taskValueFileName = new Path[getNumChunks()];
        this.indexFileStream = new DataOutputStream[getNumChunks()];
        this.valueFileStream = new DataOutputStream[getNumChunks()];

        for (int chunkId = 0; chunkId < getNumChunks(); chunkId++) {
            this.checkSumDigestIndex[chunkId] = CheckSum.getInstance(checkSumType);
            this.checkSumDigestValue[chunkId] = CheckSum.getInstance(checkSumType);
            this.position[chunkId] = 0;

            this.taskIndexFileName[chunkId] = new Path(FileOutputFormat.getOutputPath(job),
                    getStoreName() + "." + Integer.toString(chunkId) + "_" + this.taskId + ".index");
            this.taskValueFileName[chunkId] = new Path(FileOutputFormat.getOutputPath(job),
                    getStoreName() + "." + Integer.toString(chunkId) + "_" + this.taskId + ".data");

            if (this.fs == null)
                this.fs = this.taskIndexFileName[chunkId].getFileSystem(job);

            this.indexFileStream[chunkId] = fs.create(this.taskIndexFileName[chunkId]);
            fs.setPermission(this.taskIndexFileName[chunkId],
                    new FsPermission(HadoopStoreBuilder.HADOOP_FILE_PERMISSION));
            logger.info("Setting permission to 755 for " + this.taskIndexFileName[chunkId]);

            this.valueFileStream[chunkId] = fs.create(this.taskValueFileName[chunkId]);
            fs.setPermission(this.taskValueFileName[chunkId],
                    new FsPermission(HadoopStoreBuilder.HADOOP_FILE_PERMISSION));
            logger.info("Setting permission to 755 for " + this.taskValueFileName[chunkId]);

            logger.info("Opening " + this.taskIndexFileName[chunkId] + " and " + this.taskValueFileName[chunkId]
                    + " for writing.");
        }
    } catch (IOException e) {
        // throw new RuntimeException("Failed to open Input/OutputStream",
        // e);
        e.printStackTrace();
    }
}
From source file: voldemort.store.readonly.mr.AbstractStoreBuilderConfigurable.java
License: Apache License
public void configure(JobConf conf) {
    this.cluster = new ClusterMapper().readCluster(new StringReader(conf.get("cluster.xml")));
    List<StoreDefinition> storeDefs = new StoreDefinitionsMapper()
            .readStoreList(new StringReader(conf.get("stores.xml")));
    if (storeDefs.size() != 1)
        throw new IllegalStateException("Expected to find only a single store, but found multiple!");
    this.storeDef = storeDefs.get(0);

    this.numChunks = conf.getInt("num.chunks", -1);
    if (this.numChunks < 1)
        throw new VoldemortException("num.chunks not specified in the job conf.");

    this.saveKeys = conf.getBoolean("save.keys", false);
    this.reducerPerBucket = conf.getBoolean("reducer.per.bucket", false);
}
From source file: voldemort.store.readonly.mr.AvroStoreBuilderMapper.java
License: Apache License
@Override
public void configure(JobConf conf) {
    super.setConf(conf);
    // from parent code
    md5er = ByteUtils.getDigest("md5");
    this.cluster = new ClusterMapper().readCluster(new StringReader(conf.get("cluster.xml")));
    List<StoreDefinition> storeDefs = new StoreDefinitionsMapper()
            .readStoreList(new StringReader(conf.get("stores.xml")));
    if (storeDefs.size() != 1)
        throw new IllegalStateException("Expected to find only a single store, but found multiple!");
    this.storeDef = storeDefs.get(0);

    this.numChunks = conf.getInt("num.chunks", -1);
    if (this.numChunks < 1)
        throw new VoldemortException("num.chunks not specified in the job conf.");

    this.saveKeys = conf.getBoolean("save.keys", true);
    this.reducerPerBucket = conf.getBoolean("reducer.per.bucket", false);

    keySerializerDefinition = getStoreDef().getKeySerializer();
    valueSerializerDefinition = getStoreDef().getValueSerializer();

    try {
        SerializerFactory factory = new DefaultSerializerFactory();

        if (conf.get("serializer.factory") != null) {
            factory = (SerializerFactory) Class.forName(conf.get("serializer.factory")).newInstance();
        }

        keySerializer = factory.getSerializer(keySerializerDefinition);
        valueSerializer = factory.getSerializer(valueSerializerDefinition);

        keyField = conf.get("avro.key.field");
        valField = conf.get("avro.value.field");
        keySchema = conf.get("avro.key.schema");
        valSchema = conf.get("avro.val.schema");

        if (keySerializerDefinition.getName().equals("avro-generic")) {
            keySerializer = new AvroGenericSerializer(keySchema);
            valueSerializer = new AvroGenericSerializer(valSchema);
        } else {
            if (keySerializerDefinition.hasVersion()) {
                Map<Integer, String> versions = new HashMap<Integer, String>();
                for (Map.Entry<Integer, String> entry : keySerializerDefinition.getAllSchemaInfoVersions()
                        .entrySet())
                    versions.put(entry.getKey(), entry.getValue());
                keySerializer = new AvroVersionedGenericSerializer(versions);
            } else
                keySerializer = new AvroVersionedGenericSerializer(
                        keySerializerDefinition.getCurrentSchemaInfo());

            if (valueSerializerDefinition.hasVersion()) {
                Map<Integer, String> versions = new HashMap<Integer, String>();
                for (Map.Entry<Integer, String> entry : valueSerializerDefinition.getAllSchemaInfoVersions()
                        .entrySet())
                    versions.put(entry.getKey(), entry.getValue());
                valueSerializer = new AvroVersionedGenericSerializer(versions);
            } else
                valueSerializer = new AvroVersionedGenericSerializer(
                        valueSerializerDefinition.getCurrentSchemaInfo());
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    keyCompressor = new CompressionStrategyFactory().get(keySerializerDefinition.getCompression());
    valueCompressor = new CompressionStrategyFactory().get(valueSerializerDefinition.getCompression());

    routingStrategy = new ConsistentRoutingStrategy(getCluster().getNodes(),
            getStoreDef().getReplicationFactor());

    Props props = HadoopUtils.getPropsFromJob(conf);
}
From source file: voldemort.store.readonly.mr.AvroStoreBuilderPartitioner.java
License: Apache License
@Override
public void configure(JobConf conf) {
    this.cluster = new ClusterMapper().readCluster(new StringReader(conf.get("cluster.xml")));
    List<StoreDefinition> storeDefs = new StoreDefinitionsMapper()
            .readStoreList(new StringReader(conf.get("stores.xml")));
    if (storeDefs.size() != 1)
        throw new IllegalStateException("Expected to find only a single store, but found multiple!");
    this.storeDef = storeDefs.get(0);

    this.numChunks = conf.getInt("num.chunks", -1);
    if (this.numChunks < 1)
        throw new VoldemortException("num.chunks not specified in the job conf.");

    this.saveKeys = conf.getBoolean("save.keys", false);
    this.reducerPerBucket = conf.getBoolean("reducer.per.bucket", false);
}
From source file: voldemort.store.readwrite.mr.AbstractRWHadoopStoreBuilderMapper.java
License: Apache License
@Override
@SuppressWarnings("unchecked")
public void configure(JobConf conf) {
    super.configure(conf);

    md5er = ByteUtils.getDigest("md5");

    keySerializerDefinition = getStoreDef().getKeySerializer();
    valueSerializerDefinition = getStoreDef().getValueSerializer();

    try {
        SerializerFactory factory = new DefaultSerializerFactory();

        if (conf.get("serializer.factory") != null) {
            factory = (SerializerFactory) Class.forName(conf.get("serializer.factory")).newInstance();
        }

        keySerializer = (Serializer<Object>) factory.getSerializer(keySerializerDefinition);
        valueSerializer = (Serializer<Object>) factory.getSerializer(valueSerializerDefinition);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    keyCompressor = new CompressionStrategyFactory().get(keySerializerDefinition.getCompression());
    valueCompressor = new CompressionStrategyFactory().get(valueSerializerDefinition.getCompression());

    RoutingStrategyFactory factory = new RoutingStrategyFactory();
    routingStrategy = factory.updateRoutingStrategy(getStoreDef(), getCluster());

    vectorNodeId = conf.getInt("vector.node.id", -1);
    vectorNodeVersion = conf.getLong("vector.node.version", 1L);
    jobStartTime = conf.getLong("job.start.time.ms", -1);
    if (jobStartTime < 0) {
        throw new RuntimeException("Incorrect job start time");
    }
}