List of usage examples for org.apache.hadoop.mapred JobConf set
public void set(String name, String value)
Set the value of the name property.
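For example, a minimal sketch of storing and reading back a string-valued property (the key "example.app.greeting" is only illustrative, not a real Hadoop setting):

import org.apache.hadoop.mapred.JobConf;

public class JobConfSetExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // Store an arbitrary string value under an illustrative key.
        conf.set("example.app.greeting", "hello");
        // get(String) returns the stored value, or null if the key was never set.
        System.out.println(conf.get("example.app.greeting"));
    }
}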
From source file:com.splicemachine.orc.TestCachingOrcDataSource.java
License:Open Source License
private static FileSinkOperator.RecordWriter createOrcRecordWriter(File outputFile, Format format,
        Compression compression, ObjectInspector columnObjectInspector) throws IOException {
    JobConf jobConf = new JobConf();
    jobConf.set("hive.exec.orc.write.format", format == ORC_12 ? "0.12" : "0.11");
    jobConf.set("hive.exec.orc.default.compress", compression.name());

    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", "test");
    tableProperties.setProperty("columns.types", columnObjectInspector.getTypeName());
    tableProperties.setProperty("orc.stripe.size", "1200000");

    return new OrcOutputFormat().getHiveRecordWriter(jobConf, new Path(outputFile.toURI()), Text.class,
            compression != NONE, tableProperties, () -> {
            });
}
From source file:com.spotify.hdfs2cass.BulkLoader.java
License:Apache License
public int run(String[] args) throws Exception {
    CommandLine cmdLine = parseOptions(args);

    String[] inputPaths = cmdLine.getOptionValues('i');
    String seedNodeHost = cmdLine.getOptionValue('h');
    String seedNodePort = cmdLine.getOptionValue('p', "9160");
    String keyspace = cmdLine.getOptionValue('k');
    String colfamily = cmdLine.getOptionValue('c');
    int mappers = Integer.parseInt(cmdLine.getOptionValue('m', "0"));
    Integer copiers = Integer.parseInt(cmdLine.getOptionValue('P', "0"));
    String poolName = cmdLine.getOptionValue("pool");

    ClusterInfo clusterInfo = new ClusterInfo(seedNodeHost, seedNodePort);
    clusterInfo.init(keyspace);

    final String partitionerClass = clusterInfo.getPartitionerClass();
    final int reducers = adjustReducers(Integer.parseInt(cmdLine.getOptionValue('r', "0")),
            clusterInfo.getNumClusterNodes());

    Configuration conf = new Configuration();
    ConfigHelper.setOutputColumnFamily(conf, keyspace, colfamily);
    ConfigHelper.setOutputInitialAddress(conf, seedNodeHost);
    ConfigHelper.setOutputRpcPort(conf, seedNodePort);
    ConfigHelper.setOutputPartitioner(conf, partitionerClass);

    if (cmdLine.hasOption('s')) {
        conf.set("mapreduce.output.bulkoutputformat.buffersize", cmdLine.getOptionValue('s', "32"));
    }

    if (cmdLine.hasOption('M')) {
        conf.set("mapreduce.output.bulkoutputformat.streamthrottlembits", cmdLine.getOptionValue('M'));
    }

    if (cmdLine.hasOption('C')) {
        ConfigHelper.setOutputCompressionClass(conf, cmdLine.getOptionValue('C'));
    }

    if (cmdLine.hasOption('b')) {
        conf.setBoolean("com.spotify.hdfs2cass.base64", true);
    }

    JobConf job = new JobConf(conf);

    if (mappers > 0)
        job.setNumMapTasks(mappers);
    if (reducers > 0)
        job.setNumReduceTasks(reducers);
    if (copiers > 0)
        job.set("mapred.reduce.parallel.copies", copiers.toString());

    if (poolName != null)
        job.set("mapred.fairscheduler.pool", poolName);

    // set the nodes as a param for the other hadoop nodes
    clusterInfo.setConf(job);

    String jobName = "bulkloader-hdfs-to-cassandra";
    if (cmdLine.hasOption('n'))
        jobName += "-" + cmdLine.getOptionValue('n');
    job.setJobName(jobName);
    job.setJarByClass(BulkLoader.class);

    job.setInputFormat(AvroAsTextInputFormat.class);
    for (String inputPath : inputPaths) {
        FileInputFormat.addInputPath(job, new Path(inputPath));
    }

    // map just outputs text, reduce sends to cassandra
    job.setMapperClass(MapToText.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setPartitionerClass(CassandraPartitioner.class);
    job.setReducerClass(ReduceTextToCassandra.class);
    job.setOutputKeyClass(ByteBuffer.class);
    job.setOutputValueClass(List.class);

    if (cmdLine.hasOption('s'))
        job.setOutputFormat(BulkOutputFormat.class);
    else
        job.setOutputFormat(ColumnFamilyOutputFormat.class);

    JobClient.runJob(job);
    return 0;
}
From source file:com.spotify.hdfs2cass.CassandraPartitionerTest.java
License:Apache License
@Test
public void testGetPartition() throws Exception {
    final int maxNodes = 5;

    final List<String> tokenRanges = new ArrayList<String>();
    BigInteger start = BigInteger.ZERO;
    BigInteger step = RandomPartitioner.MAXIMUM.divide(BigInteger.valueOf(maxNodes));
    for (int i = 0; i < maxNodes - 1; i++) {
        BigInteger end = start.add(step);
        tokenRanges.add(String.format("%d:%d", start, end));
        start = end.add(BigInteger.ONE);
    }
    tokenRanges.add(String.format("%d:0", start));

    final JobConf conf = new JobConf();
    conf.set(ClusterInfo.SPOTIFY_CASSANDRA_TOKENS_PARAM, StringUtils.join(tokenRanges, ","));
    conf.set(ClusterInfo.SPOTIFY_CASSANDRA_PARTITIONER_PARAM, "org.apache.cassandra.dht.RandomPartitioner");

    CassandraPartitioner instance = new CassandraPartitioner();
    instance.configure(conf);

    Text key = new Text("foobar");
    assertEquals(2, instance.getPartition(key, null, 5));

    key = new Text("someotherkey");
    assertEquals(1, instance.getPartition(key, null, 5));

    key = new Text("1ce5cf4b861941f4aa799ae39ac9daa4");
    assertEquals(4, instance.getPartition(key, null, 5));
}
From source file:com.spotify.hdfs2cass.crunch.cql.CQLTarget.java
License:Open Source License
@Override
public void configureForMapReduce(final Job job, final PType<?> pType, final Path outputPath,
        final String name) {
    if (name == null) {
        throw new CrunchRuntimeException("'name' arguments should not be null. We don't know why tho");
    }

    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(CrunchCqlBulkOutputFormat.class);

    JobConf conf = new JobConf();
    params.configure(conf);
    for (Map.Entry<String, String> e : extraConf.entrySet()) {
        conf.set(e.getKey(), e.getValue());
    }

    FormatBundle<CrunchCqlBulkOutputFormat> bundle = FormatBundle.forOutput(CrunchCqlBulkOutputFormat.class);
    for (Map.Entry<String, String> e : conf) {
        bundle.set(e.getKey(), e.getValue());
    }

    Configuration jobConfiguration = job.getConfiguration();
    // we don't know why exactly this is needed, but without this, the actual streaming will not
    // see the throttling and buffer size arguments
    params.configure(jobConfiguration);

    CrunchConfigHelper.setOutputColumnFamily(jobConfiguration, params.getKeyspace(),
            params.getColumnFamily());
    CrunchCqlBulkOutputFormat.setColumnFamilySchema(jobConfiguration, params.getColumnFamily(),
            params.getSchema());
    CrunchCqlBulkOutputFormat.setColumnFamilyInsertStatement(jobConfiguration, params.getColumnFamily(),
            params.getStatement());

    String[] colNames = params.getColumnNames();
    for (int i = 0; i < colNames.length; i++) {
        CrunchCqlBulkOutputFormat.setColumnIndex(jobConfiguration, params.getColumnFamily(), colNames[i], i);
    }

    CrunchOutputs.addNamedOutput(job, name, bundle, ByteBuffer.class, List.class);
}
From source file:com.spotify.hdfs2cass.crunch.thrift.ThriftTarget.java
License:Open Source License
@Override
public void configureForMapReduce(final Job job, final PType<?> pType, final Path outputPath,
        final String name) {
    if (name == null) {
        throw new CrunchRuntimeException("'name' arguments should not be null. We don't know why tho");
    }

    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(CrunchBulkOutputFormat.class);

    JobConf conf = new JobConf();
    params.configure(conf);
    for (Map.Entry<String, String> e : extraConf.entrySet()) {
        conf.set(e.getKey(), e.getValue());
    }

    FormatBundle<CrunchBulkOutputFormat> bundle = FormatBundle.forOutput(CrunchBulkOutputFormat.class);
    for (Map.Entry<String, String> e : conf) {
        bundle.set(e.getKey(), e.getValue());
    }

    Configuration jobConfiguration = job.getConfiguration();
    // we don't know why exactly this is needed, but without this, the actual streaming will not
    // see the throttling and buffer size arguments
    params.configure(jobConfiguration);

    CrunchConfigHelper.setOutputColumnFamily(jobConfiguration, params.getKeyspace(),
            params.getColumnFamily());

    CrunchOutputs.addNamedOutput(job, name, bundle, ByteBuffer.class, List.class);
}
From source file:com.spotify.hdfs2cass.misc.ClusterInfo.java
License:Apache License
/**
 * Adds the token ranges so that individual nodes know about the Cassandra layout
 *
 * @param conf The config object
 * @return true if the Cassandra cluster could be queried, false otherwise
 */
public boolean setConf(final JobConf conf) {
    if (hasTokenRanges()) {
        final String tokenRangesStr = StringUtils.join(getTokenRanges(), ",");
        conf.set(SPOTIFY_CASSANDRA_TOKENS_PARAM, tokenRangesStr);
        conf.set(SPOTIFY_CASSANDRA_PARTITIONER_PARAM, partitionerClass);
        return true;
    }
    return false;
}
From source file:com.test.hadoop.JhhSort.java
License:Apache License
/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
@SuppressWarnings({ "rawtypes" })
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf(), JhhSort.class);
    jobConf.setJobName("sorter");
    jobConf.set("mapred.job.tracker", "192.168.12.200:9001");
    jobConf.set("fs.default.name", "hdfs://192.168.12.200:9000");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.5);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = TextInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = TextOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = LongWritable.class;
    Class<? extends Writable> outputValueClass = LongWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);
    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);
    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file:com.test.hadoop.JhhSum.java
License:Apache License
@SuppressWarnings({ "rawtypes" }) public int run(String[] args) throws Exception { JobConf jobConf = new JobConf(getConf(), JhhSum.class); jobConf.setJobName("sum"); jobConf.set("mapred.job.tracker", "192.168.12.200:9001"); jobConf.set("fs.default.name", "hdfs://192.168.12.200:9000"); jobConf.setMapperClass(IdentityMapper.class); jobConf.setReducerClass(LongSumReducer.class); JobClient client = new JobClient(jobConf); ClusterStatus cluster = client.getClusterStatus(); int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.5); String sort_reduces = jobConf.get("test.sort.reduces_per_host"); if (sort_reduces != null) { num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces); }//from ww w.ja va 2 s. co m Class<? extends InputFormat> inputFormatClass = JhhInputFormat.class; Class<? extends OutputFormat> outputFormatClass = TextOutputFormat.class; Class<? extends WritableComparable> outputKeyClass = Text.class; Class<? extends Writable> outputValueClass = LongWritable.class; List<String> otherArgs = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { otherArgs.add(args[i]); } // Set user-supplied (possibly default) job configs jobConf.setNumReduceTasks(num_reduces); jobConf.setInputFormat(inputFormatClass); jobConf.setOutputFormat(outputFormatClass); jobConf.setOutputKeyClass(outputKeyClass); jobConf.setOutputValueClass(outputValueClass); if (otherArgs.size() != 2) { System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2."); } FileInputFormat.setInputPaths(jobConf, otherArgs.get(0)); FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1))); System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from " + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf) + " with " + num_reduces + " reduces."); Date startTime = new Date(); System.out.println("Job started: " + startTime); jobResult = JobClient.runJob(jobConf); Date end_time = new Date(); System.out.println("Job ended: " + end_time); System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds."); return 0; }
From source file:com.test.PiEstimatorKrb.java
License:Apache License
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf) throws IOException {
    // setup job conf
    jobConf.setJobName(PiEstimatorKrb.class.getSimpleName());

    jobConf.setInputFormat(SequenceFileInputFormat.class);

    jobConf.setOutputKeyClass(BooleanWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setMapperClass(PiMapper.class);
    jobConf.setNumMapTasks(numMaps);

    jobConf.setReducerClass(PiReducer.class);
    jobConf.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    jobConf.setSpeculativeExecution(false);

    // setup input/output directories
    final Path inDir = new Path(TMP_DIR, "in");
    final Path outDir = new Path(TMP_DIR, "out");
    FileInputFormat.setInputPaths(jobConf, inDir);
    FileOutputFormat.setOutputPath(jobConf, outDir);

    final FileSystem fs = FileSystem.get(jobConf);
    if (fs.exists(TMP_DIR)) {
        throw new IOException(
                "Tmp directory " + fs.makeQualified(TMP_DIR) + " already exists. Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    try {
        // generate an input file for each map task
        for (int i = 0; i < numMaps; ++i) {
            final Path file = new Path(inDir, "part" + i);
            final LongWritable offset = new LongWritable(i * numPoints);
            final LongWritable size = new LongWritable(numPoints);
            final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobConf, file,
                    LongWritable.class, LongWritable.class, CompressionType.NONE);
            try {
                writer.append(offset, size);
            } finally {
                writer.close();
            }
            sLogger.info("Wrote input for Map #" + i);
        }

        // start a map/reduce job
        sLogger.info("Starting Job");
        final long startTime = System.currentTimeMillis();

        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
            jobConf.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
        }

        JobClient.runJob(jobConf);
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        sLogger.info("Job Finished in " + duration + " seconds");

        // read outputs
        Path inFile = new Path(outDir, "reduce-out");
        LongWritable numInside = new LongWritable();
        LongWritable numOutside = new LongWritable();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
        try {
            reader.next(numInside, numOutside);
        } finally {
            reader.close();
        }

        // compute estimated value
        return BigDecimal.valueOf(4).setScale(20).multiply(BigDecimal.valueOf(numInside.get()))
                .divide(BigDecimal.valueOf(numMaps)).divide(BigDecimal.valueOf(numPoints));
    } finally {
        fs.delete(TMP_DIR, true);
    }
}
From source file:com.trace.hadoop.examples.Grep.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf grepJob = new JobConf(getConf(), Grep.class);

    try {
        grepJob.setJobName("grep-search");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);
        grepJob.set("mapred.mapper.regex", args[2]);
        if (args.length == 4)
            grepJob.set("mapred.mapper.regex.group", args[3]);

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormat(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        JobClient.runJob(grepJob);

        JobConf sortJob = new JobConf(getConf(), Grep.class);
        sortJob.setJobName("grep-sort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormat(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        sortJob.setOutputKeyComparatorClass // sort by decreasing freq
                (LongWritable.DecreasingComparator.class);

        JobClient.runJob(sortJob);
    } finally {
        FileSystem.get(grepJob).delete(tempDir, true);
    }
    return 0;
}