List of usage examples for org.apache.hadoop.mapreduce.Job.setOutputFormatClass
public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException
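Before the larger examples below, here is a minimal sketch of the call in context. It is an assumed, self-contained map-only job (the job name, input/output paths from args, and the use of the identity Mapper are illustrative choices, not taken from any example on this page): setOutputFormatClass must be called before the job is submitted, otherwise it throws IllegalStateException.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "output-format-example");
        job.setJarByClass(OutputFormatExample.class);
        // Identity mapper, map-only job: TextInputFormat emits (LongWritable offset, Text line)
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(TextInputFormat.class);
        // Plain-text output; must be set before submission or an IllegalStateException is thrown
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The same pattern recurs in every example that follows: configure the Job, point setOutputFormatClass at the desired OutputFormat (TextOutputFormat, TableOutputFormat, MongoOutputFormat, etc.), then submit with waitForCompletion.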
From source file:com.wibidata.wibidota.dotaloader.DotaValuesCounter.java
License:Apache License
public final int run(final String[] args) throws Exception { Job job = new Job(super.getConf(), "Dota Value Counter"); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); job.setMapperClass(Map.class); job.setCombinerClass(Add.class); job.setReducerClass(Add.class); job.setJarByClass(DotaValuesCounter.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); if (job.waitForCompletion(true)) { return 0; } else {//from w w w .j a v a 2s .c o m return -1; } }
From source file:com.wibidata.wibidota.DotaMaxAccountId.java
License:Apache License
public final int run(final String[] args) throws Exception { Job job = new Job(super.getConf(), "Dota Max Builder"); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); job.setMapperClass(DotaMaxAccountId.Map.class); job.setCombinerClass(DotaMaxAccountId.TakeMax.class); job.setReducerClass(DotaMaxAccountId.TakeMax.class); job.setJarByClass(DotaMaxAccountId.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); if (job.waitForCompletion(true)) { return 0; } else {//from w w w .j a v a 2s .co m return -1; } }
From source file:com.wipro.ats.bdre.datagen.mr.Driver.java
License:Apache License
/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = getConf();
    GetGeneralConfig generalConfig = new GetGeneralConfig();
    GeneralConfig gc = generalConfig.byConigGroupAndKey("imconfig", "common.default-fs-name");
    conf.set("fs.defaultFS", gc.getDefaultVal());
    String processId = args[0];
    Path outputDir = new Path(ResolvePath.replaceVars(args[1]));
    Properties dataProps = Config.getDataProperties(processId);
    Properties tableProps = Config.getTableProperties(processId);
    TableUtil tableUtil = new TableUtil();
    Table table = tableUtil.formTableFromConfig(processId);
    FileSystem fs = FileSystem.get(conf);
    LOGGER.info("Default FS =" + conf.get("fs.defaultFS"));
    // set in the conf for mappers to use
    conf.set(Config.SEPARATOR_KEY, tableProps.getProperty("separator"));
    conf.set(Config.PID_KEY, processId);
    conf.setLong(Config.NUM_ROWS_KEY, Long.parseLong(dataProps.getProperty("numRows")));
    conf.setInt(Config.NUM_SPLITS_KEY, Integer.parseInt(dataProps.getProperty("numSplits")));
    Job job = Job.getInstance(conf);
    Path mrOutputPath = new Path(outputDir.toString() + "/MROUT/" + table.getTableName());
    FileOutputFormat.setOutputPath(job, mrOutputPath);
    job.setJobName("Datagen-" + table.getTableName());
    job.setJarByClass(Driver.class);
    job.setMapperClass(RecordGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.waitForCompletion(true);
    // merge and create a single file
    Path srcDir = mrOutputPath;
    Path destFile = new Path(outputDir.toString() + "/" + table.getTableName());
    FileUtil.copyMerge(fs, srcDir, fs, destFile, true, conf, "");
    // Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash("0");
    registerFileInfo.setFileSize(0L);
    registerFileInfo.setPath(destFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}
From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java
License:Apache License
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    if (fs.exists(new Path(indexPath))) {
        fs.delete(new Path(indexPath), true);
    }
    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);
    Shard.setIndexShards(conf, shards);
    // empty trash;
    (new Trash(conf)).expunge();
    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);
    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);
    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));
    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));
    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
        buffer.append(",");
        buffer.append(inputs[i].toString());
    }
    logger.info("mapreduce.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));
    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed");
    }
    return 0;
}
From source file:com.xiaoxiaomo.mr.utils.kafka.HadoopJob.java
License:Apache License
public int run(String[] args) throws Exception {
    CommandLineParser parser = new PosixParser();
    Options options = buildOptions();
    CommandLine cmd = parser.parse(options, args);
    if (cmd.hasOption("h") || cmd.getArgs().length == 0) {
        printHelpAndExit(options);
    }
    String hdfsPath = cmd.getArgs()[0];
    Configuration conf = getConf();
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    if (cmd.hasOption("topics")) {
        LOG.info("Using topics: " + cmd.getOptionValue("topics"));
        KafkaInputFormat.configureKafkaTopics(conf, cmd.getOptionValue("topics"));
    } else {
        printHelpAndExit(options);
    }
    KafkaInputFormat.configureZkConnection(conf, cmd.getOptionValue("zk-connect", "localhost:2181"));
    if (cmd.hasOption("consumer-group")) {
        CheckpointManager.configureUseZooKeeper(conf, cmd.getOptionValue("consumer-group", "dev-hadoop-loader"));
    }
    if (cmd.getOptionValue("autooffset-reset") != null) {
        KafkaInputFormat.configureAutoOffsetReset(conf, cmd.getOptionValue("autooffset-reset"));
    }
    JobConf jobConf = new JobConf(conf);
    if (cmd.hasOption("remote")) {
        String ip = cmd.getOptionValue("remote");
        LOG.info("Default file system: hdfs://" + ip + ":8020/");
        jobConf.set("fs.defaultFS", "hdfs://" + ip + ":8020/");
        LOG.info("Remote jobtracker: " + ip + ":8021");
        jobConf.set("mapred.job.tracker", ip + ":8021");
    }
    Path jarTarget = new Path(
            getClass().getProtectionDomain().getCodeSource().getLocation() + "../kafka-hadoop-loader.jar");
    if (new File(jarTarget.toUri()).exists()) {
        // running from IDE / as maven
        jobConf.setJar(jarTarget.toUri().getPath());
        LOG.info("Using target jar: " + jarTarget.toString());
    } else {
        // running from jar remotely or locally
        jobConf.setJarByClass(getClass());
        LOG.info("Using parent jar: " + jobConf.getJar());
    }
    Job job = Job.getInstance(jobConf, "kafka.hadoop.loader");
    job.setInputFormatClass(KafkaInputFormat.class);
    job.setMapperClass(HadoopJobMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);
    MultiOutputFormat.setOutputPath(job, new Path(hdfsPath));
    MultiOutputFormat.setCompressOutput(job, cmd.getOptionValue("compress-output", "on").equals("on"));
    LOG.info("Output hdfs location: {}", hdfsPath);
    LOG.info("Output hdfs compression: {}", MultiOutputFormat.getCompressOutput(job));
    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:com.xoriant.kafkaProducer.MyConsumer.java
License:Apache License
public static void main(String[] args) throws IOException {
    // System.setProperty("spark.executor.memory", "8g");
    System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    // Create the context with a 1 second batch size
    SparkConf sparkConf = new SparkConf();
    // final Configuration config = new Configuration();
    Configuration hadoopConfig = new Configuration();
    hadoopConfig.set("mapreduce.output.textoutputformat.separator", ",");
    sparkConf.setMaster("local[2]");
    sparkConf.setAppName("Insurance");
    JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(javaSparkContext, new Duration(500));
    int numThreads = Integer.parseInt(args[3]);
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    String[] topics = args[2].split(",");
    for (String topic : topics) {
        topicMap.put(topic, numThreads);
    }
    // 3. create connection with HBase
    Configuration config = null;
    try {
        config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "192.168.1.114");
        config.set("hbase.zookeeper.property.clientPort", "2181");
        // config.set("mapreduce.job.output.key.class", Text.class.getName());
        // config.set("mapreduce.job.output.value.class", IntWritable.class.getName());
        // config.set("mapreduce.outputformat.class", TableOutputFormat.class.getName());
        // config.set("hbase.master", "127.0.0.1:60000");
        HBaseAdmin.checkHBaseAvailable(config);
        System.out.println("HBase is running!");
    } catch (MasterNotRunningException e) {
        System.out.println("HBase is not running!");
        System.exit(1);
    } catch (Exception ce) {
        System.out.println("here.....");
        ce.printStackTrace();
    }
    // config.set(TableInputFormat.INPUT_TABLE, rawTableName);
    // 4. new Hadoop API configuration
    final Job newAPIJobConfigurationState = Job.getInstance(config);
    newAPIJobConfigurationState.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, stateTable);
    newAPIJobConfigurationState.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);
    final Job newAPIJobConfigurationUser = Job.getInstance(config);
    newAPIJobConfigurationUser.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "user_total_stream");
    newAPIJobConfigurationUser.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);
    final Job paymentHistoryConfig = Job.getInstance(config);
    paymentHistoryConfig.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "payment_history_stream");
    paymentHistoryConfig.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);
    /*
     * Set<String> topics = new HashSet<String>(); topics.add("test");
     *
     * Map<String, String> kafkaParams = new HashMap<String, String>();
     * kafkaParams.put("metadata.broker.list", "10.20.0.199:9092");
     */
    /*
     * JavaPairInputDStream<String, String> stream = KafkaUtils
     *         .createDirectStream(javaStreamingContext, String.class, String.class,
     *                 StringDecoder.class, StringDecoder.class, kafkaParams, topics);
     */
    JavaPairReceiverInputDStream<String, String> stream = KafkaUtils.createStream(javaStreamingContext,
            args[0], args[1], topicMap);
    System.out.println("Got my DStream!\n connecting to zookeeper " + args[0] + " group " + args[1]
            + " topics" + topicMap);
    stream.count().print();

    JavaDStream<Tuple11<String, String, String, String, String, String, String, String, String, String, String>> records = stream
            .map(new Function<Tuple2<String, String>, Tuple11<String, String, String, String, String, String, String, String, String, String, String>>() {
                private static final long serialVersionUID = 1L;

                public Tuple11<String, String, String, String, String, String, String, String, String, String, String> call(
                        Tuple2<String, String> defaultKeyAndRecords) throws Exception {
                    String[] fields = defaultKeyAndRecords._2().split(",");
                    return new Tuple11<String, String, String, String, String, String, String, String, String, String, String>(
                            fields[0], fields[1], fields[2], fields[3], fields[4], fields[5], fields[6],
                            fields[7], fields[8], fields[9], fields[10]);
                }
            });

    records.foreachRDD(
            new Function<JavaRDD<Tuple11<String, String, String, String, String, String, String, String, String, String, String>>, Void>() {
                private static final long serialVersionUID = -3333697808496161495L;

                public Void call(
                        JavaRDD<Tuple11<String, String, String, String, String, String, String, String, String, String, String>> rdd)
                        throws Exception {
                    saveToHBasePaymentHistory(rdd, paymentHistoryConfig.getConfiguration());
                    return null;
                }
            });

    JavaPairDStream<String, String> window = records.mapToPair(
            new PairFunction<Tuple11<String, String, String, String, String, String, String, String, String, String, String>, String, String>() {
                private static final long serialVersionUID = -8849699432349098738L;

                public Tuple2<String, String> call(
                        Tuple11<String, String, String, String, String, String, String, String, String, String, String> arg0)
                        throws Exception {
                    String str = arg0._2() + "," + arg0._3() + "," + arg0._4() + "," + arg0._5() + ","
                            + arg0._6() + "," + arg0._7() + "," + arg0._8() + "," + arg0._9() + ","
                            + arg0._10() + "," + arg0._11();
                    return new Tuple2<String, String>(arg0._1(), str);
                }
            }).window(new Duration(60000), new Duration(60000));

    window.saveAsNewAPIHadoopFiles("hdfs://192.168.1.114/user/hadoop/StreamingData/Insurancedata", "",
            Text.class, Text.class, TextOutputFormat.class, hadoopConfig);

    JavaPairDStream<String, Integer> recordsMapState = records.mapToPair(
            new PairFunction<Tuple11<String, String, String, String, String, String, String, String, String, String, String>, String, Integer>() {
                private static final long serialVersionUID = 1L;

                public Tuple2<String, Integer> call(
                        Tuple11<String, String, String, String, String, String, String, String, String, String, String> arg0)
                        throws Exception {
                    String key = arg0._10();
                    Integer value = new Integer(arg0._7());
                    return new Tuple2<String, Integer>(key, value);
                }
            });

    JavaPairDStream<String, Integer> recordsMapUser = records.mapToPair(
            new PairFunction<Tuple11<String, String, String, String, String, String, String, String, String, String, String>, String, Integer>() {
                private static final long serialVersionUID = 1L;

                public Tuple2<String, Integer> call(
                        Tuple11<String, String, String, String, String, String, String, String, String, String, String> arg0)
                        throws Exception {
                    String key = arg0._1();
                    Integer value = new Integer(arg0._7());
                    return new Tuple2<String, Integer>(key, value);
                }
            });

    JavaPairDStream<String, Integer> reduceByKeyAndWindowState = recordsMapState
            .reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
                private static final long serialVersionUID = 197675516004789269L;

                public Integer call(Integer val1, Integer val2) throws Exception {
                    return val1 + val2;
                }
            }, new Duration(86400000), new Duration(10000));

    JavaPairDStream<String, Integer> reduceByKeyAndWindowUser = recordsMapUser
            .reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
                private static final long serialVersionUID = 197675516004789269L;

                public Integer call(Integer val1, Integer val2) throws Exception {
                    return val1 + val2;
                }
            }, new Duration(86400000), new Duration(60000));

    // reduce.count();
    reduceByKeyAndWindowState.print();

    reduceByKeyAndWindowState.foreachRDD(new Function<JavaPairRDD<String, Integer>, Void>() {
        private static final long serialVersionUID = 8534726505385048702L;

        public Void call(JavaPairRDD<String, Integer> rdd) throws Exception {
            saveToHBase(rdd, newAPIJobConfigurationState.getConfiguration());
            return null;
        }
    });

    reduceByKeyAndWindowUser.foreachRDD(new Function<JavaPairRDD<String, Integer>, Void>() {
        private static final long serialVersionUID = 8534726505385048702L;

        public Void call(JavaPairRDD<String, Integer> rdd) throws Exception {
            saveToHBase(rdd, newAPIJobConfigurationUser.getConfiguration());
            return null;
        }
    });

    javaStreamingContext.start();
    javaStreamingContext.awaitTermination();
}
From source file:com.xyz.reccommendation.driver.SKU2SKUCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    final Configuration conf = new Configuration();
    String envt = null;
    if (args.length > 0) {
        envt = args[0];
    } else {
        envt = "dev";
    }
    Properties prop = new Properties();
    try {
        // load a properties file from class path, inside static method
        prop.load(SKU2SKUCount.class.getClassLoader().getResourceAsStream("config-" + envt + ".properties"));
    } catch (IOException ex) {
        ex.printStackTrace();
        System.exit(1);
    }
    MongoConfigUtil.setOutputURI(conf, "mongodb://" + prop.getProperty("mongodb.ip") + "/"
            + prop.getProperty("mongodb.dbname") + ".out_stat_custom");
    log.debug("MongoDB URL : mongodb://" + prop.getProperty("mongodb.ip") + "/"
            + prop.getProperty("mongodb.dbname") + "." + ".out_stat_custom");
    log.debug("Conf: " + conf);
    MongoConfigUtil.setCreateInputSplits(conf, false);
    args = new GenericOptionsParser(conf, args).getRemainingArgs();
    final Job job = new Job(conf,
            "Count the sku to sku mapping from pview data on hdfs in \"inputPview\" path.");
    job.setJarByClass(SKU2SKUCount.class);
    job.setMapperClass(TokenizerMapper.class);
    // job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BSONWritable.class);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path("inputPview"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.yahoo.druid.hadoop.DruidInputFormatTest.java
License:Apache License
@Test
public void testSampleMRJob() throws Exception {
    Job job = Job.getInstance(new Configuration(), "Druid-Loader-Sample-Test-Job");
    job.getConfiguration().set("mapreduce.job.acl-view-job", "*");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Duser.timezone=UTC");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_OVERLORD_HOSTPORT, "localhost:" + overlordTestPort);
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_SCHEMA,
            "{" + "\"dataSource\":\"testDataSource\","
                    + "\"interval\":\"1970-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z\","
                    + "\"granularity\":\"NONE\","
                    + "\"dimensions\":[\"host\"],"
                    + "\"metrics\":[\"visited_sum\",\"unique_hosts\"]" + "}");
    job.setMapperClass(SampleMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);
    job.setInputFormatClass(DruidInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    String outputPath = tempFolder.newFolder() + "/out";
    TextOutputFormat.setOutputPath(job, new Path(outputPath));
    Assert.assertTrue(job.waitForCompletion(true));
    // verify that the SampleMapper actually ran and verified the data
    Assert.assertTrue(FileUtils.readFileToString(new File(outputPath + "/part-m-00000")).startsWith("SUCCESS"));
}
From source file:com.yahoo.druid.hadoop.example.SamplePrintMRJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // When implementing Tool
    Configuration conf = this.getConf();
    // Create job
    Job job = new Job(conf, "Druid-Loader-Sample-Job");
    job.setJarByClass(SamplePrintMRJob.class);
    // job.setJobName("Druid-Loader-Sample-Job");
    job.getConfiguration().set("mapreduce.job.acl-view-job", "*");
    job.getConfiguration().set("mapreduce.job.queuename", "default");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Duser.timezone=UTC");
    // job.getConfiguration().set("mapreduce.map.memory.mb", "1024");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_STORAGE_STORAGE_DIR, "/tmp/druid/storage");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_OVERLORD_HOSTPORT, "localhost:8080");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_DATASOURCE, "wikipedia");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_INTERVAL,
            "2009-01-01T00:00:00.000/2050-01-01T00:00:00.000");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_SCHEMA_FILE, "/tmp/druid/schema/druid_fun_mr.json");
    job.setMapperClass(DruidPrintMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(DateTime.class);
    job.setOutputValueClass(Map.class);
    job.setInputFormatClass(DruidInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    System.out.println("Starting Druid Loader Sample Job.....");
    return job.waitForCompletion(true) ? 0 : 1;
    // System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.yahoo.glimmer.indexing.generator.TripleIndexGenerator.java
License:Open Source License
public int run(String[] args) throws Exception {
    SimpleJSAP jsap = new SimpleJSAP(TripleIndexGenerator.class.getName(),
            "Generates a keyword index from RDF data.",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', "withoutContexts",
                            "Don't process the contexts for each tuple."),
                    new FlaggedOption(METHOD_ARG, JSAP.STRING_PARSER, "horizontal", JSAP.REQUIRED, 'm',
                            METHOD_ARG, "horizontal or vertical."),
                    new FlaggedOption(PREDICATES_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
                            'p', PREDICATES_ARG, "Subset of the properties to be indexed."),
                    new FlaggedOption(RESOURCE_PREFIX_ARG, JSAP.STRING_PARSER, "@", JSAP.NOT_REQUIRED, 'r',
                            RESOURCE_PREFIX_ARG,
                            "Prefix to add to object resource hash values when indexing. Stops queries for numbers matching resource hash values. Default is '@'"),
                    new UnflaggedOption("input", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the input data."),
                    new UnflaggedOption(NUMBER_OF_DOCS_ARG, JSAP.LONG_PARSER, JSAP.REQUIRED,
                            "Number of documents to index"),
                    new UnflaggedOption("output", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the output."),
                    new UnflaggedOption(RESOURCES_HASH_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location of the resources hash file."), });

    JSAPResult jsapResult = jsap.parse(args);

    // check whether the command line was valid, and if it wasn't,
    // display usage information and exit.
    if (!jsapResult.success()) {
        System.err.println();
        System.err.println("Usage: java " + TripleIndexGenerator.class.getName());
        System.err.println("  " + jsap.getUsage());
        System.err.println();
        System.exit(1);
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(TripleIndexGenerator.class);
    job.setJobName("TripleIndexGenerator" + System.currentTimeMillis());

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString("input")));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(DocumentMapper.class);
    job.setMapOutputKeyClass(TermKey.class);
    job.setMapOutputValueClass(TermValue.class);

    job.setPartitionerClass(TermKey.FirstPartitioner.class);
    job.setGroupingComparatorClass(TermKey.FirstGroupingComparator.class);

    job.setReducerClass(TermReduce.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IndexRecordWriterValue.class);
    job.setOutputFormatClass(IndexRecordWriter.OutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(jsapResult.getString("output")));

    Configuration conf = job.getConfiguration();
    conf.setClass("mapred.output.key.comparator.class", TermKey.Comparator.class, WritableComparator.class);
    conf.set("mapreduce.user.classpath.first", "true");

    long numDocs = jsapResult.getLong(NUMBER_OF_DOCS_ARG);
    conf.setLong(NUMBER_OF_DOCUMENTS, numDocs);
    // Set this in an attempt to get around the 2GB of ram task limit on our cluster.
    // Setting this in the hope of fixing Direct buffer memory errors
    conf.setInt(INDEX_WRITER_CACHE_SIZE, 1024 * 1024);
    conf.set(OUTPUT_DIR, jsapResult.getString("output"));

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_HORIZONTAL)) {
        HorizontalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG));
    } else if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_VERTICAL)) {
        if (!jsapResult.contains(PREDICATES_ARG)) {
            throw new IllegalArgumentException("When '" + METHOD_ARG + "' is '" + METHOD_ARG_VALUE_VERTICAL
                    + "' you have to give a predicates file too.");
        }
        VerticalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG), jsapResult.getString(PREDICATES_ARG));
    } else {
        throw new IllegalArgumentException(METHOD_ARG + " should be '" + METHOD_ARG_VALUE_HORIZONTAL + "' or '"
                + METHOD_ARG_VALUE_VERTICAL + "'");
    }

    conf.setInt("mapreduce.input.linerecordreader.line.maxlength", 1024 * 1024);

    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
}