List of usage examples for org.apache.hadoop.mapreduce Job setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
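Before the real-world examples below, a minimal self-contained sketch of the two common ways this method is used. It reuses Hadoop's stock TokenCounterMapper and IntSumReducer; the class name and argument paths are illustrative only, not drawn from any example on this page.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

    public class NumReduceTasksSketch {
        public static void main(String[] args) throws Exception {
            Job job = Job.getInstance(new Configuration(), "setNumReduceTasks-sketch");
            job.setJarByClass(NumReduceTasksSketch.class);
            job.setMapperClass(TokenCounterMapper.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            // Fix the number of reduce tasks. This must happen before the job
            // is submitted; calling it on a running job throws
            // IllegalStateException, per the signature above.
            job.setNumReduceTasks(4);

            // Alternatively, job.setNumReduceTasks(0) turns this into a
            // map-only job: the shuffle/sort phase is skipped and each
            // mapper's output is written directly by the output format.

            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }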
From source file:com.bah.applefox.main.plugins.webcrawler.WebCrawler.java
License:Apache License
/**
 * run takes the command-line args as arguments (in this case from a
 * configuration file), creates a new job, configures it, initiates it,
 * waits for completion, and returns 0 if it is successful (1 if it is not).
 *
 * @param args the command-line arguments (in this case from a configuration file)
 * @return 0 if the job ran successfully and 1 if it did not
 */
public int run(String[] args) throws Exception {
    userAgent = args[6];

    String jobName = this.getClass().getSimpleName() + "_" + System.currentTimeMillis();
    Job job = new Job(getConf(), jobName);
    job.setJarByClass(this.getClass());

    String clone = args[5];
    String clone2 = args[12];
    table = clone;
    AccumuloUtils.setSplitSize(args[24]);
    table2 = clone2 + "From";
    table3 = clone2 + "To";

    job.setInputFormatClass(AccumuloInputFormat.class);
    InputFormatBase.setZooKeeperInstance(job.getConfiguration(), args[0], args[1]);
    InputFormatBase.setInputInfo(job.getConfiguration(), args[2], args[3].getBytes(), clone,
            new Authorizations());

    job.setMapperClass(MapperClass.class);
    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    // map-only job: no reduce phase is needed
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setOutputKeyClass(Key.class);
    job.setOutputValueClass(Value.class);
    AccumuloOutputFormat.setZooKeeperInstance(job.getConfiguration(), args[0], args[1]);
    AccumuloOutputFormat.setOutputInfo(job.getConfiguration(), args[2], args[3].getBytes(), true, clone);

    job.waitForCompletion(true);

    return job.isSuccessful() ? 0 : 1;
}
From source file:com.bark.hadoop.lab3.PageRank.java
@Override
public int run(String args[]) {
    String tmp = "/tmp/" + new Date().getTime();
    // long timeStamp = new Date().getTime();
    try {
        /**
         * Job 1: Parse XML input and read title, links
         */
        Configuration conf = new Configuration();
        conf.set("xmlinput.start", "<page>");
        conf.set("xmlinput.end", "</page>");

        Job job = Job.getInstance(conf);
        job.setJarByClass(PageRank.class);
        // specify a mapper
        job.setMapperClass(RedLinkMapper.class);
        // specify a reducer
        job.setReducerClass(RedLinkReducer.class);
        // specify output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // specify input and output directories
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(XmlInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1] + tmp + "/job1"));
        job.setOutputFormatClass(TextOutputFormat.class);
        job.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job1.");
        return 2;
    }

    /**
     * Job 2: Adjacency outGraph
     */
    try {
        Configuration conf2 = new Configuration();
        Job job2 = Job.getInstance(conf2);
        job2.setJarByClass(PageRank.class);
        // specify a mapper
        job2.setMapperClass(AdjMapper.class);
        // specify a reducer
        job2.setReducerClass(AdjReducer.class);
        // specify output types
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        // specify input and output directories
        FileInputFormat.addInputPath(job2, new Path(args[1] + tmp + "/job1"));
        job2.setInputFormatClass(TextInputFormat.class);
        FileOutputFormat.setOutputPath(job2, new Path(args[1] + tmp + "/job2"));
        job2.setOutputFormatClass(TextOutputFormat.class);
        job2.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job2.");
        return 2;
    }

    /**
     * Job 3: PageCount
     */
    try {
        Configuration conf3 = new Configuration();
        // Change output separator to "=" instead of the default \t for this job
        conf3.set("mapreduce.output.textoutputformat.separator", "=");

        Job job3 = Job.getInstance(conf3);
        job3.setJarByClass(PageRank.class);
        // specify a mapper
        job3.setMapperClass(PageCountMapper.class);
        // specify a reducer
        job3.setReducerClass(PageCountReducer.class);
        // specify output types
        job3.setOutputKeyClass(Text.class);
        job3.setOutputValueClass(IntWritable.class);
        // specify input and output directories
        FileInputFormat.addInputPath(job3, new Path(args[1] + tmp + "/job2"));
        job3.setInputFormatClass(TextInputFormat.class);
        FileOutputFormat.setOutputPath(job3, new Path(args[1] + tmp + "/job3"));
        job3.setOutputFormatClass(TextOutputFormat.class);
        job3.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job3.");
        return 2;
    }

    /**
     * Job 4: PageRank (8 iterations)
     */
    for (int i = 1; i < 9; i++) {
        try {
            Configuration conf4 = new Configuration();
            /**
             * Read number of nodes from the output of job 3: pageCount
             */
            Path path = new Path(args[1] + tmp + "/job3");
            FileSystem fs = path.getFileSystem(conf4);
            RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);
            int n = 0;
            Pattern pt = Pattern.compile("(\\d+)");
            while (ri.hasNext()) {
                LocatedFileStatus lfs = ri.next();
                if (lfs.isFile() && n == 0) {
                    FSDataInputStream inputStream = fs.open(lfs.getPath());
                    BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
                    String s = null;
                    while ((s = br.readLine()) != null) {
                        Matcher mt = pt.matcher(s);
                        if (mt.find()) {
                            n = Integer.parseInt(mt.group(1));
                            break;
                        }
                    }
                }
            }
            /**
             * Done reading number of nodes; make it available to the
             * MapReduce job under key: N
             */
            conf4.setInt("N", n);

            Job job4 = Job.getInstance(conf4);
            job4.setJarByClass(PageRank.class);
            // specify a mapper
            job4.setMapperClass(PageRankMapper.class);
            // specify a reducer
            job4.setReducerClass(PageRankReducer.class);
            // specify output types
            job4.setOutputKeyClass(Text.class);
            job4.setOutputValueClass(Text.class);
            // specify input and output directories
            if (i == 1) {
                FileInputFormat.addInputPath(job4, new Path(args[1] + tmp + "/job2"));
            } else {
                FileInputFormat.addInputPath(job4, new Path(args[1] + tmp + "/job4/" + (i - 1)));
            }
            job4.setInputFormatClass(TextInputFormat.class);
            FileOutputFormat.setOutputPath(job4, new Path(args[1] + tmp + "/job4/" + i));
            job4.setOutputFormatClass(TextOutputFormat.class);
            job4.waitForCompletion(true);
        } catch (InterruptedException | ClassNotFoundException | IOException ex) {
            Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
            System.err.println("Error during mapreduce job4.");
            return 2;
        }
    }

    /**
     * Job 5: Sort iteration 1 and iteration 8
     */
    int returnCode = 0;
    for (int i = 0; i < 2; i++) {
        try {
            Configuration conf5 = new Configuration();
            /**
             * Read number of nodes from the output of job 3: pageCount
             */
            Path path = new Path(args[1] + tmp + "/job3");
            FileSystem fs = path.getFileSystem(conf5);
            RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);
            int n = 0;
            Pattern pt = Pattern.compile("(\\d+)");
            while (ri.hasNext()) {
                LocatedFileStatus lfs = ri.next();
                if (lfs.isFile() && n == 0) {
                    FSDataInputStream inputStream = fs.open(lfs.getPath());
                    BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
                    String s = null;
                    while ((s = br.readLine()) != null) {
                        Matcher mt = pt.matcher(s);
                        if (mt.find()) {
                            n = Integer.parseInt(mt.group(1));
                            break;
                        }
                    }
                }
            }
            /**
             * Done reading number of nodes; make it available to the
             * MapReduce job under key: N
             */
            conf5.setInt("N", n);

            Job job5 = Job.getInstance(conf5);
            /**
             * one reducer only
             */
            job5.setNumReduceTasks(1);
            job5.setSortComparatorClass(MyWritableComparator.class);
            job5.setJarByClass(PageRank.class);
            // specify a mapper
            job5.setMapperClass(SortMapper.class);
            job5.setMapOutputKeyClass(DoubleWritable.class);
            job5.setMapOutputValueClass(Text.class);
            // specify a reducer
            job5.setReducerClass(SortReducer.class);
            // specify output types
            job5.setOutputKeyClass(Text.class);
            job5.setOutputValueClass(DoubleWritable.class);
            // specify input and output directories: sort iterations 1 and 8
            int y = 7 * i + 1;
            FileInputFormat.addInputPath(job5, new Path(args[1] + tmp + "/job4/" + y));
            job5.setInputFormatClass(TextInputFormat.class);
            FileOutputFormat.setOutputPath(job5, new Path(args[1] + tmp + "/job5/" + y));
            job5.setOutputFormatClass(TextOutputFormat.class);
            returnCode = job5.waitForCompletion(true) ? 0 : 1;
        } catch (InterruptedException | ClassNotFoundException | IOException ex) {
            Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
            System.err.println("Error during mapreduce job5.");
            return 2;
        }
    }

    /**
     * Copy necessary output files to args[1]
     */

    /**
     * Rename and copy OutLinkGraph
     */
    try {
        Configuration conf = new Configuration();
        Path outLinkGraph = new Path(args[1] + tmp + "/job2/part-r-00000");
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);
        Path output = new Path(args[1] + "/results/PageRank.outlink.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy total number of pages
     */
    try {
        Configuration conf = new Configuration();
        Path outLinkGraph = new Path(args[1] + tmp + "/job3/part-r-00000");
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);
        Path output = new Path(args[1] + "/results/PageRank.n.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy iteration 1
     */
    try {
        Configuration conf = new Configuration();
        Path outLinkGraph = new Path(args[1] + tmp + "/job5/1/part-r-00000");
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);
        Path output = new Path(args[1] + "/results/PageRank.iter1.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy iteration 8
     */
    try {
        Configuration conf = new Configuration();
        Path outLinkGraph = new Path(args[1] + tmp + "/job5/8/part-r-00000");
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);
        Path output = new Path(args[1] + "/results/PageRank.iter8.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    return returnCode;
}
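A side note on the example above: the sort step (job5) is where setNumReduceTasks matters most. With exactly one reduce task, everything the mappers emit flows through a single sorted partition, so the custom comparator yields one globally ordered output file. Below is a hypothetical helper distilling that pattern; it assumes the same classes (SortMapper, SortReducer, MyWritableComparator) as the example, and the helper itself is not part of the original source.

    // Hypothetical helper: the single-reducer total-sort pattern from job5.
    static int runGlobalSort(Configuration conf, Path in, Path out) throws Exception {
        Job job = Job.getInstance(conf, "global-sort");
        job.setJarByClass(PageRank.class);
        job.setMapperClass(SortMapper.class);
        job.setMapOutputKeyClass(DoubleWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(SortReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // One reduce task: all map output lands in a single sorted partition,
        // producing one globally ordered part-r-00000 file.
        job.setNumReduceTasks(1);
        job.setSortComparatorClass(MyWritableComparator.class);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);
        return job.waitForCompletion(true) ? 0 : 1;
    }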
From source file:com.basho.riak.hadoop.RiakWordCount.java
License:Apache License
public int run(String[] args) throws Exception {
    String[] keys = new String[10000];
    for (int i = 0; i < 10000; i++) {
        keys[i] = String.valueOf(i + 1000);
    }

    Configuration conf = getConf();
    conf = RiakConfig.setKeyLister(conf, new BucketKeyLister("wordcount"));
    conf = RiakConfig.addLocation(conf, new RiakLocation("127.0.0.1", 11087));
    conf = RiakConfig.addLocation(conf, new RiakLocation("127.0.0.1", 12087));
    conf = RiakConfig.addLocation(conf, new RiakLocation("127.0.0.1", 13087));
    conf = RiakConfig.addLocation(conf, new RiakLocation("127.0.0.1", 14087));
    conf = RiakConfig.addLocation(conf, new RiakLocation("127.0.0.1", 15087));
    conf = RiakConfig.setOutputBucket(conf, "wordcount_out");
    conf = RiakConfig.setHadoopClusterSize(conf, 4);

    Job job = new Job(conf, "Riak-WordCount");
    job.setJarByClass(RiakWordCount.class);
    job.setInputFormatClass(RiakInputFormat.class);
    job.setMapperClass(TokenCounterMapper.class);
    job.setReducerClass(TokenCounterReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(RiakOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(WordCountResult.class);
    // four reduce tasks, matching the configured Hadoop cluster size
    job.setNumReduceTasks(4);
    job.submit();
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.baynote.kafka.hadoop.KafkaJobBuilder.java
License:Apache License
/**
 * Creates a {@link Job} based on how {@code this} {@link KafkaJobBuilder} has been configured. There are no
 * side-effects on {@code this} instance when you call this method, so you can call it multiple times.
 *
 * @param conf the job conf.
 * @return a fully configured {@link Job}.
 * @throws Exception on error.
 * @throws IllegalArgumentException if any required parameters are not set.
 */
public Job configureJob(final Configuration conf) throws Exception {
    validateSettings();
    final Job job = Job.getInstance(conf, getDefaultedJobName());

    // set queue inputs
    if (getQueueMappers().size() == 1) {
        job.setInputFormatClass(KafkaInputFormat.class);
        final TopicConf topicConf = Iterables.getOnlyElement(getQueueMappers());
        KafkaInputFormat.setTopic(job, topicConf.getTopic());
        KafkaInputFormat.setConsumerGroup(job, topicConf.getConsumerGroup());
        job.setMapperClass(topicConf.getMapper());
    } else {
        job.setInputFormatClass(MultipleKafkaInputFormat.class);
        for (final TopicConf topicConf : getQueueMappers()) {
            MultipleKafkaInputFormat.addTopic(job, topicConf.getTopic(), topicConf.getConsumerGroup(),
                    topicConf.getMapper());
        }
    }

    if (getMapOutputKeyClass() != null) {
        job.setMapOutputKeyClass(getMapOutputKeyClass());
    }
    if (getMapOutputValueClass() != null) {
        job.setMapOutputValueClass(getMapOutputValueClass());
    }
    if (getReducerClass() == null) {
        // no reducer configured: run a map-only job
        job.setNumReduceTasks(0);
    } else {
        job.setReducerClass(getReducerClass());
        job.setNumReduceTasks(getNumReduceTasks());
    }
    if (getPartitionerClass() != null) {
        job.setPartitionerClass(getPartitionerClass());
    }

    // set output
    job.setOutputFormatClass(getOutputFormatClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());
    if (getOutputFormat() == SupportedOutputFormat.TEXT_FILE) {
        TextOutputFormat.setOutputPath(job, getDefaultedOutputPath());
    } else if (getOutputFormat() == SupportedOutputFormat.SEQUENCE_FILE) {
        SequenceFileOutputFormat.setOutputPath(job, getDefaultedOutputPath());
    }

    if (usingS3()) {
        job.getConfiguration().set("fs.s3n.awsAccessKeyId", getS3AccessKey());
        job.getConfiguration().set("fs.s3n.awsSecretAccessKey", getS3SecretyKey());
        job.getConfiguration().set("fs.s3.awsAccessKeyId", getS3AccessKey());
        job.getConfiguration().set("fs.s3.awsSecretAccessKey", getS3SecretyKey());
    }

    if (isLazyOutputFormat()) {
        LazyOutputFormat.setOutputFormatClass(job, getOutputFormatClass());
    }

    // set up Kafka input format specifics
    KafkaInputFormat.setZkConnect(job, getZkConnect());
    KafkaInputFormat.setKafkaFetchSizeBytes(job, getKafkaFetchSizeBytes());

    job.setSpeculativeExecution(false);
    job.setJarByClass(getClass());

    // memory settings for mappers
    if (!Strings.isNullOrEmpty(getTaskMemorySettings())) {
        job.getConfiguration().set("mapred.child.java.opts", getTaskMemorySettings());
    }

    return job;
}
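The branch on getReducerClass() above is the setNumReduceTasks-relevant part of this builder: leaving the reducer unset yields a map-only job. Isolated as a sketch (the getter names mirror the builder above and stand in for whatever the caller configured):

    if (getReducerClass() == null) {
        // No reducer configured: run map-only, skipping shuffle and sort entirely.
        job.setNumReduceTasks(0);
    } else {
        job.setReducerClass(getReducerClass());
        job.setNumReduceTasks(getNumReduceTasks());
    }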
From source file:com.benchmark.mapred.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "wordcount");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            System.err.println("Usage: wordcount <numReduces> <in> <out>");
            System.exit(2);
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            System.err.println("Usage: wordcount <numReduces> <in> <out>");
            System.exit(2);
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        System.err.println("Usage: wordcount <numReduces> <in> <out>");
        System.exit(2);
    }
    FileInputFormat.addInputPath(job, new Path(other_args.get(0)));
    FileOutputFormat.setOutputPath(job, new Path(other_args.get(1)));

    Date startIteration = new Date();
    boolean waitforCompletion = job.waitForCompletion(true);
    Date endIteration = new Date();
    System.out.println("The iteration took " + (endIteration.getTime() - startIteration.getTime()) / 1000
            + " seconds.");
    System.exit(waitforCompletion ? 0 : 1);
}
From source file:com.bigdog.hadoop.mapreduce.group.GroupApp.java
public void group() throws Exception {
    final Configuration configuration = new Configuration();
    final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), configuration);
    // delete the output directory if it already exists
    if (fileSystem.exists(new Path(OUT_PATH))) {
        fileSystem.delete(new Path(OUT_PATH), true);
    }
    final Job job = new Job(configuration, GroupApp.class.getSimpleName());

    // 1.1 input path and input format
    FileInputFormat.setInputPaths(job, INPUT_PATH);
    job.setInputFormatClass(TextInputFormat.class);

    // 1.2 mapper and its output types <k2, v2>
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(NewK2.class);
    job.setMapOutputValueClass(LongWritable.class);

    // 1.3 partitioner and number of reduce tasks
    job.setPartitionerClass(HashPartitioner.class);
    job.setNumReduceTasks(1);

    // 1.4 grouping comparator
    job.setGroupingComparatorClass(MyGroupingComparator.class);

    // 1.5 (no combiner is used)

    // 2.2 reducer and its output types <k3, v3>
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LongWritable.class);

    // 2.3 output path and output format
    FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
    job.setOutputFormatClass(TextOutputFormat.class);

    // submit the job and wait for completion
    job.waitForCompletion(true);
}
From source file:com.bigdog.hadoop.mapreduce.partition.KpiApp.java
public void kpi() throws Exception {
    final Job job = new Job(new Configuration(), KpiApp.class.getSimpleName());
    job.setJarByClass(KpiApp.class);

    // 1.1 input path and input format
    FileInputFormat.setInputPaths(job, INPUT_PATH);
    job.setInputFormatClass(TextInputFormat.class);

    // 1.2 mapper and its output types <k2, v2>
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(KpiWritable.class);

    // 1.3 custom partitioner with two reduce tasks, one per partition
    job.setPartitionerClass(KpiPartitioner.class);
    job.setNumReduceTasks(2);

    // 1.4 (no custom sorting/grouping)
    // 1.5 (no combiner is used)

    // 2.2 reducer and its output types <k3, v3>
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(KpiWritable.class);

    // 2.3 output path and output format
    FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
    job.setOutputFormatClass(TextOutputFormat.class);

    // submit the job and wait for completion
    job.waitForCompletion(true);
}
From source file:com.bizosys.hsearch.kv.indexer.KVIndexer.java
License:Apache License
/**
 * Given indexing parameters, this starts an indexing job.
 * The different indexing types are:
 * SF2HB = Simple File (csv, tsv) to HBase directly.
 * SF2HF = Simple File (csv, tsv) to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * SF2MF = Simple File (csv, tsv) to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * MF2HB = Map File (key and value as csv, tsv) to HBase.
 * MF2HF = Map File (key and value as csv, tsv) to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * MF2MF = Map File (key and value as csv, tsv) to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * HB2HB = HBase to HBase.
 * HB2HF = HBase to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * HB2MF = HBase to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * @param args
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void execute(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    if (args.length < 7) {
        String err = "Usage : " + KVIndexer.class
                + " <<Job Type(SF2HB|SF2HF|SF2MF...)>> <<Input Source>> <<Output Sink>> <<XML File Configuration>> <<Skip Header(true|false)>> <<Run KeyGeneration Job>> <<Number Of reducer>> <<Speculative Execution>> <<scanner-cache-size>> <<filter>>";
        IdSearchLog.l.fatal(err);
        System.exit(1);
    }

    String msg = this.getClass().getName() + " > Initializing indexer job.";
    IdSearchLog.l.info(msg);

    int seq = 0;
    int len = args.length;
    String jobType = (len > seq) ? args[seq++] : "";
    String inputSource = (len > seq) ? args[seq++] : "";
    String outputSink = (len > seq) ? args[seq++] : "/tmp/hsearch-index";
    String xmlFilePath = (len > seq) ? args[seq++] : "";
    String skipHeader = (len > seq) ? args[seq++] : "false";
    boolean runKeyGenJob = (len > seq) ? args[seq++].trim().equalsIgnoreCase("true") : false;
    int numberOfReducer = (len > seq) ? Integer.parseInt(args[seq++].trim()) : 1;
    boolean speculativeExecution = (len > seq) ? args[seq++].trim().equalsIgnoreCase("true") : true;
    int scannerCacheSize = (len > seq) ? Integer.parseInt(args[seq++].trim()) : 300;
    String filter = (len > seq) ? args[seq++] : "";

    if (isEmpty(jobType)) {
        String err = this.getClass().getName()
                + " > Please enter Job type as one of these :\n SF2HB|SF2HF|SF2MF|MF2HB|MF2HF|MF2MF|HB2HB|HB2HF|HB2MF|IMF2HF";
        System.err.println(err);
        throw new IOException(err);
    }

    if (isEmpty(inputSource)) {
        String err = this.getClass().getName() + " > Please enter input file path.";
        System.err.println(err);
        throw new IOException(err);
    }

    Configuration conf = HBaseConfiguration.create();
    FieldMapping fm = createFieldMapping(conf, xmlFilePath, new StringBuilder());
    outputSink = outputSink.charAt(outputSink.length() - 1) == '/' ? outputSink : outputSink + "/";
    outputSink = outputSink + fm.tableName;

    createHBaseTable(fm);

    KVIndexer.FAM_NAME = fm.familyName.getBytes();
    KVIndexer.FIELD_SEPARATOR = fm.fieldSeparator;

    conf.set(XML_FILE_PATH, xmlFilePath);
    conf.set(OUTPUT_FOLDER, outputSink);
    conf.set(SKIP_HEADER, skipHeader);
    conf.set(RAW_FILE_SEPATATOR, String.valueOf(fm.fieldSeparator));

    Job job = Job.getInstance(conf, "com.bizosys.hsearch.kv.indexing.KVIndexer type : " + jobType + "\n"
            + inputSource + "\n" + outputSink);
    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(numberOfReducer);

    Integer jobTypeI = JobTypeMapping.get(jobType);
    if (jobTypeI == null)
        throw new IOException("Invalid Jobtype " + jobType);

    /**
     * If an internal key index is given, generate the keys first and then do
     * the indexing; otherwise just run the indexer, creating keys from HBase.
     */
    boolean keyGenjobStatus = false;
    if (-1 != fm.internalKey && runKeyGenJob) {
        Configuration keyGenConf = HBaseConfiguration.create();
        keyGenConf.set(INPUT_SOURCE, inputSource);
        keyGenConf.set(XML_FILE_PATH, xmlFilePath);
        keyGenConf.set(OUTPUT_FOLDER, outputSink);
        keyGenConf.set(SKIP_HEADER, skipHeader);

        Job keyGenJob = Job.getInstance(keyGenConf, "Creating Keys KVKeyGenerator for " + inputSource);

        switch (jobTypeI) {
        case SF2HB:
        case SF2HF:
        case SF2MF: {
            FileInputFormat.addInputPath(keyGenJob, new Path(inputSource));

            keyGenJob.setMapperClass(KVKeyGeneratorMapperFile.class);
            keyGenJob.setInputFormatClass(TextInputFormat.class);
            keyGenJob.setMapOutputKeyClass(Text.class);
            keyGenJob.setMapOutputValueClass(Text.class);

            keyGenJob.setReducerClass(KVKeyGeneratorReducerFile.class);
            keyGenJob.setNumReduceTasks(numberOfReducer);
            keyGenJob.setOutputKeyClass(NullWritable.class);
            keyGenJob.setOutputValueClass(Text.class);

            inputSource = outputSink + "_" + INPUTWITH_KEY;
            Path intermediatePath = new Path(inputSource);
            System.out.println("Final input path " + inputSource);
            FileOutputFormat.setOutputPath(keyGenJob, intermediatePath);

            keyGenjobStatus = keyGenJob.waitForCompletion(true);
            if (!keyGenjobStatus) {
                throw new IOException("Error in running Job for Key Generation");
            }
            break;
        }
        case HB2HB:
        case HB2HF:
        case HB2MF: {
            Scan scan = new Scan();
            scan.setCaching(scannerCacheSize);
            scan.setCacheBlocks(false);

            byte[] family = fm.familyName.getBytes();
            for (String name : fm.nameWithField.keySet()) {
                Field fld = fm.nameWithField.get(name);
                if (!fld.isMergedKey)
                    continue;
                scan.addColumn(family, fld.sourceName.trim().getBytes());
            }

            TableMapReduceUtil.initTableMapperJob(
                    inputSource,                      // input table
                    scan,                             // Scan instance to control CF and attribute selection
                    KVKeyGeneratorMapperHBase.class,  // mapper class
                    Text.class,                       // mapper output key
                    ImmutableBytesWritable.class,     // mapper output value
                    keyGenJob);
            TableMapReduceUtil.initTableReducerJob(
                    inputSource,                      // output table
                    KVKeyGeneratorReducerHBase.class, // reducer class
                    keyGenJob);

            keyGenjobStatus = keyGenJob.waitForCompletion(true);
            if (!keyGenjobStatus) {
                throw new IOException("Error in running Job for Key Generation");
            }
            break;
        }
        default:
            break;
        }
    }

    /*
     * Run job based on job type, e.g. SF2HB, SF2MF, SF2HF.
     */
    System.out.println("Sending path " + inputSource);
    runJob(jobTypeI, job, fm, inputSource, outputSink, scannerCacheSize, filter);
}
From source file:com.bizosys.hsearch.kv.indexing.KVIndexer.java
License:Apache License
/**
 * Given indexing parameters, this starts an indexing job.
 * The different indexing types are:
 * SF2HB = Simple File (csv, tsv) to HBase directly.
 * SF2HF = Simple File (csv, tsv) to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * SF2MF = Simple File (csv, tsv) to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * MF2HB = Map File (key and value as csv, tsv) to HBase.
 * MF2HF = Map File (key and value as csv, tsv) to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * MF2MF = Map File (key and value as csv, tsv) to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * HB2HB = HBase to HBase.
 * HB2HF = HBase to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * HB2MF = HBase to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * @param args
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void execute(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    if (args.length < 7) {
        String err = "Usage : " + KVIndexer.class
                + " <<Job Type(SF2HB|SF2HF|SF2MF...)>> <<Input Source>> <<Output Sink>> <<XML File Configuration>> <<Skip Header(true|false)>> <<Run KeyGeneration Job>> <<Number Of reducer>> <<Speculative Execution>> <<scanner-cache-size>> <<filter>>";
        IdSearchLog.l.fatal(err);
        System.exit(1);
    }

    String msg = this.getClass().getName() + " > Initializing indexer job.";
    IdSearchLog.l.info(msg);

    int seq = 0;
    int len = args.length;
    String jobType = (len > seq) ? args[seq++] : "";
    String inputSource = (len > seq) ? args[seq++] : "";
    String outputSink = (len > seq) ? args[seq++] : "/tmp/hsearch-index";
    String xmlFilePath = (len > seq) ? args[seq++] : "";
    String skipHeader = (len > seq) ? args[seq++] : "false";
    boolean runKeyGenJob = (len > seq) ? args[seq++].trim().equalsIgnoreCase("true") : false;
    int numberOfReducer = (len > seq) ? Integer.parseInt(args[seq++].trim()) : 1;
    boolean speculativeExecution = (len > seq) ? args[seq++].trim().equalsIgnoreCase("true") : true;
    int scannerCacheSize = (len > seq) ? Integer.parseInt(args[seq++].trim()) : 300;
    String filter = (len > seq) ? args[seq++] : "";

    if (isEmpty(jobType)) {
        String err = this.getClass().getName()
                + " > Please enter Job type as one of these :\n SF2HB|SF2HF|SF2MF|MF2HB|MF2HF|MF2MF|HB2HB|HB2HF|HB2MF|IMF2HF";
        System.err.println(err);
        throw new IOException(err);
    }

    if (isEmpty(inputSource)) {
        String err = this.getClass().getName() + " > Please enter input file path.";
        System.err.println(err);
        throw new IOException(err);
    }

    Configuration conf = HBaseConfiguration.create();
    FieldMapping fm = createFieldMapping(conf, xmlFilePath, new StringBuilder());
    outputSink = outputSink.charAt(outputSink.length() - 1) == '/' ? outputSink : outputSink + "/";
    outputSink = outputSink + fm.tableName;

    createHBaseTable(fm);

    KVIndexer.FAM_NAME = fm.familyName.getBytes();
    KVIndexer.FIELD_SEPARATOR = fm.fieldSeparator;

    conf.set(XML_FILE_PATH, xmlFilePath);
    conf.set(OUTPUT_FOLDER, outputSink);
    conf.set(SKIP_HEADER, skipHeader);
    conf.setBoolean("mapreduce.map.speculative", speculativeExecution);

    Job job = Job.getInstance(conf, "com.bizosys.hsearch.kv.indexing.KVIndexer type : " + jobType + "\n"
            + inputSource + "\n" + outputSink);
    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(numberOfReducer);

    Integer jobTypeI = JobTypeMapping.get(jobType);
    if (jobTypeI == null)
        throw new IOException("Invalid Jobtype " + jobType);

    /**
     * If an internal key index is given, generate the keys first and then do
     * the indexing; otherwise just run the indexer, creating keys from HBase.
     */
    boolean keyGenjobStatus = false;
    if (-1 != fm.internalKey && runKeyGenJob) {
        Configuration keyGenConf = HBaseConfiguration.create();
        keyGenConf.set(INPUT_SOURCE, inputSource);
        keyGenConf.set(XML_FILE_PATH, xmlFilePath);
        keyGenConf.set(OUTPUT_FOLDER, outputSink);
        keyGenConf.set(SKIP_HEADER, skipHeader);

        Job keyGenJob = Job.getInstance(keyGenConf, "Creating Keys KVKeyGenerator for " + inputSource);

        switch (jobTypeI) {
        case SF2HB:
        case SF2HF:
        case SF2MF: {
            FileInputFormat.addInputPath(keyGenJob, new Path(inputSource));

            keyGenJob.setMapperClass(KVKeyGeneratorMapperFile.class);
            keyGenJob.setInputFormatClass(TextInputFormat.class);
            keyGenJob.setMapOutputKeyClass(Text.class);
            keyGenJob.setMapOutputValueClass(Text.class);

            keyGenJob.setReducerClass(KVKeyGeneratorReducerFile.class);
            keyGenJob.setNumReduceTasks(numberOfReducer);
            keyGenJob.setOutputKeyClass(NullWritable.class);
            keyGenJob.setOutputValueClass(Text.class);

            inputSource = outputSink + "_" + INPUTWITH_KEY;
            Path intermediatePath = new Path(inputSource);
            System.out.println("Final input path " + inputSource);
            FileOutputFormat.setOutputPath(keyGenJob, intermediatePath);

            keyGenjobStatus = keyGenJob.waitForCompletion(true);
            if (!keyGenjobStatus) {
                throw new IOException("Error in running Job for Key Generation");
            }
            break;
        }
        case HB2HB:
        case HB2HF:
        case HB2MF: {
            Scan scan = new Scan();
            scan.setCaching(scannerCacheSize);
            scan.setCacheBlocks(false);

            // Added filter
            if (null != filter) {
                if (filter.trim().length() > 0) {
                    int index = filter.indexOf('=');
                    scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                            filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                            filter.substring(index + 1).getBytes()));
                }
            }

            byte[] family = fm.familyName.getBytes();
            for (String name : fm.nameWithField.keySet()) {
                Field fld = fm.nameWithField.get(name);
                if (!fld.isMergedKey)
                    continue;
                scan.addColumn(family, fld.sourceName.trim().getBytes());
            }

            TableMapReduceUtil.initTableMapperJob(
                    inputSource,                      // input table
                    scan,                             // Scan instance to control CF and attribute selection
                    KVKeyGeneratorMapperHBase.class,  // mapper class
                    Text.class,                       // mapper output key
                    ImmutableBytesWritable.class,     // mapper output value
                    keyGenJob);
            TableMapReduceUtil.initTableReducerJob(
                    inputSource,                      // output table
                    KVKeyGeneratorReducerHBase.class, // reducer class
                    keyGenJob);

            keyGenjobStatus = keyGenJob.waitForCompletion(true);
            if (!keyGenjobStatus) {
                throw new IOException("Error in running Job for Key Generation");
            }
            break;
        }
        case MF2HB:
        case MF2HF:
        case MF2MF: {
            break;
        }
        default:
            break;
        }
    }

    /*
     * Run job based on job type, e.g. SF2HB, SF2MF, SF2HF.
     */
    System.out.println("Sending path " + inputSource);
    runJob(jobTypeI, job, fm, inputSource, outputSink, scannerCacheSize, filter);
}
From source file:com.bizosys.hsearch.kv.indexing.KVReplicatorMapFile.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    int seq = 0;
    String inputFile = (args.length > seq) ? args[seq] : "";
    seq++;
    String outputFile = (args.length > seq) ? args[seq++] : "/tmp/hsearch-index";
    String outputFileName = (args.length > seq) ? args[seq++] : "file1";
    String xmlFilePath = (args.length > seq) ? args[seq++] : "";
    String replaceFrom = (args.length > seq) ? args[seq++] : "";
    String replaceTo = (args.length > seq) ? args[seq++] : "";
    String startIndex = (args.length > seq) ? args[seq++] : "";
    String endIndex = (args.length > seq) ? args[seq++] : "";
    String numberOfReducerStr = (args.length > seq) ? args[seq] : "1";
    int numberOfReducer = Integer.parseInt(numberOfReducerStr);

    if (null == inputFile || inputFile.trim().isEmpty()) {
        String err = KVReplicatorHFile.class + " > Please enter input file path.";
        System.err.println(err);
        throw new IOException(err);
    }

    Configuration conf = HBaseConfiguration.create();

    FieldMapping fm = KVIndexer.createFieldMapping(conf, xmlFilePath, new StringBuilder());
    outputFile = outputFile.charAt(outputFile.length() - 1) == '/' ? outputFile : outputFile + "/";
    outputFile = outputFile + fm.tableName;

    conf.set(OUTPUT_FILE_PATH, outputFile);
    conf.set(OUTPUT_FILE_NAME, outputFileName);
    conf.set(REPLACE_FROM, replaceFrom);
    conf.set(REPLACE_TO, replaceTo);
    conf.set(START_INDEX, startIndex);
    conf.set(END_INDEX, endIndex);

    Job job = Job.getInstance(conf, "KVReplicatorMapFile - Replicating Map File");

    job.setJarByClass(KVReplicatorMapFile.class);
    job.setMapperClass(KVReplicatorMapper.class);
    job.setReducerClass(KVReplicatorReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(numberOfReducer);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputFile.trim()));

    FileSystem fs = FileSystem.get(conf);
    Path dummyPath = new Path("/tmp", "dummy");
    if (fs.exists(dummyPath)) {
        fs.delete(dummyPath, true);
    }
    FileOutputFormat.setOutputPath(job, dummyPath);

    boolean result = job.waitForCompletion(true);
    return (result ? 0 : 1);
}