Usage examples for org.apache.hadoop.mapred.JobConf.setJarByClass
public void setJarByClass(Class cls)
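Before the project-specific examples below, here is a minimal sketch (not taken from any of the listed projects) of where setJarByClass typically sits in an old-API driver: it is called on the JobConf so Hadoop can locate the jar that contains the driver class and ship it to the task JVMs. The class name PassThroughDriver and the argument-based input/output paths are hypothetical placeholders.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class PassThroughDriver {

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        // setJarByClass tells Hadoop to ship the jar containing this class to the
        // cluster, so task JVMs can load the job's mapper/reducer classes.
        job.setJarByClass(PassThroughDriver.class);
        job.setJobName("pass-through");

        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(IdentityReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // hypothetical input dir
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // hypothetical output dir

        JobClient.runJob(job);
    }
}

Packaged into a jar and launched with the hadoop jar command, this identity job simply copies its input through, which is enough to see setJarByClass in action; the real drivers below follow the same pattern with project-specific mappers, formats and configuration.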
From source file:edu.ucsb.cs.hybrid.HybridDriver.java
License:Apache License
public static void main(String args[]) throws ParseException, IOException {
    // job.set("mapred.job.tracker", "local");
    // job.set("fs.default.name", "file:///");
    JobConf job = new JobConf();
    job.setJarByClass(HybridDriver.class);
    new GenericOptionsParser(job, args);
    setMapperAndRunner(job);
    job.setMapOutputKeyClass(DocDocWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(DocDocWritable.class);
    job.setOutputValueClass(FloatWritable.class);

    Path inputPath = new Path(INPUT_DIR);
    CustomSequenceFileInputFormat.addInputPath(job, inputPath);
    Path outputPath = new Path(OUTPUT_DIR);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);
    FileSystem.get(job).delete(outputPath, true);

    job.setBoolean("fs.hdfs.impl.disable.cache", true); // xun: not sure if needed

    if (job.getBoolean(Config.SPLITABLE_PROPERTY, Config.SPLITABLE_VALUE)) {
        job.setInputFormat(CustomSequenceFileInputFormat.class);
        Long splitMB = job.getLong(Config.SPLIT_MB_PROPERTY, Config.SPLIT_MB_VALUE) * 1024 * 1024;
        job.setLong("mapred.min.split.size", splitMB);
        job.setLong("mapred.max.split.size", splitMB);
        job.setLong("dfs.block.size", splitMB);
    } else {
        // Comment the following splitter call for www experiments; it assumes no splitting
        // of partitions for load balancing, should be fixed.
        Splitter.configure(job, inputPath); // remove comment unless for www
        job.setInputFormat(NonSplitableSequenceInputFormat.class); // remove comment
    }

    // SIGIR'14 two-stage balancing (not yet fully incorporated)
    if (job.getInt(Config.LOAD_BALANCE_PROPERTY, Config.LOAD_BALANCE_VALUE) != 0) {
        TwoStageLoadbalancing.main(job.getInt(Config.LOAD_BALANCE_PROPERTY, Config.LOAD_BALANCE_VALUE),
                new Path(PartDriver.OUTPUT_DIR), job);
    }

    JobSubmitter.run(job, "SIMILARITY", job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE));

    if (job.getBoolean(Config.CONVERT_TEXT_PROPERTY, Config.CONVERT_TEXT_VALUE))
        IDMappingJob(args);
}
From source file:edu.ucsb.cs.hybrid.HybridDriver.java
License:Apache License
public static void IDMappingJob(String[] args) throws IOException {
    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    job.setJarByClass(HybridDriver.class);
    job.setJobName("Converting binary similarity scores to text");
    job.setMapperClass(IDMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Path inputPath = new Path(OUTPUT_DIR);
    job.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.setInputPaths(job, inputPath);
    Path outputPath = new Path("SimilarityScores");
    job.setOutputFormat(TextOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);
    FileSystem.get(job).delete(outputPath, true);

    HashPagesDriver.prepareDistribCache(job, HashPagesDriver.IDS_FILE2); // remove? not sure
    JobSubmitter.run(job, "BINARY TO TEXT", job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE));
}
From source file:edu.ucsb.cs.knn.KnnDriver.java
License:Apache License
/**
 * Submit the configured job to the Hadoop JobTracker to start the process.
 */
public static void run(JobConf job) throws IOException {
    job.setJarByClass(KnnDriver.class); // This method sets the jar

    String ret = stars() + "\nKnnDriver(" + job.getJobName() + ")\n" + " Input Path: {";
    Path inputs[] = FileInputFormat.getInputPaths(job);
    for (int ctr = 0; ctr < inputs.length; ctr++) {
        if (ctr > 0) {
            ret += "\n ";
        }
        ret += inputs[ctr].toString();
    }
    ret += "}\n";
    ret += " Output Path: " + FileOutputFormat.getOutputPath(job) + "\n" + " Map Tasks: "
            + job.getNumMapTasks() + "\n" + " Reduce Tasks: " + job.getNumReduceTasks() + "\n";
    ret += " Threshold: " + job.getFloat(THRESHOLD_PROPERTY, THRESHOLD_VALUE) + "\n";
    System.out.println(ret);

    //
    // run job
    //
    JobClient.runJob(job);
}
From source file:edu.ucsb.cs.lsh.minhash.MinHashLshDriver.java
License:Apache License
public static void main(String args[]) throws ParseException, IOException {
    JobConf job = new JobConf();
    job.setJarByClass(MinHashLshDriver.class);
    job.setJobName(MinHashLshDriver.class.getSimpleName());
    GenericOptionsParser gop = new GenericOptionsParser(job, args);
    args = gop.getRemainingArgs();

    job.setMapperClass(LshMapper.class);
    job.setMapOutputKeyClass(IntArrayWritable.class); // signatures
    job.setMapOutputValueClass(LongWritable.class); // doc IDs
    job.setNumReduceTasks(job.getInt(NUM_REDUCERS_PROPERTY, NUM_REDUCERS_VALUE));
    job.setReducerClass(LshReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    String inputDir = args[0];
    if (inputDir == null) {
        throw new UnsupportedOperationException("ERROR: input directory not set.");
    }
    FileInputFormat.addInputPath(job, new Path(inputDir));
    Path outputPath = new Path("lsh-jaccard-buckets");
    FileOutputFormat.setOutputPath(job, outputPath);
    FileSystem.get(job).delete(outputPath, true);

    LshTable lshTable = new LshTable(job.getInt(K_PROPERTY, K_VALUE), job.getInt(L_PROPERTY, L_VALUE), 1024,
            job.getLong(NUM_FEATURES_PROPERTY, NUM_FEATURES_VALUE),
            job.getFloat(THRESHOLD_PROPERTY, THRESHOLD_VALUE));

    writeLsh(job, outputPath.getFileSystem(job), lshTable);

    JobSubmitter.run(job, "LSH", job.getFloat(THRESHOLD_PROPERTY, THRESHOLD_VALUE));
}
From source file:edu.ucsb.cs.partitioning.cosine.CosineAllPartitionMain.java
License:Apache License
/**
 * Job3: Core cosine partitioning, with skipping based on each partition's
 * maximum vector length, size and weight.
 */
public static JobConf runCosinePartition(JobConf job, String[] args) throws IOException {
    new GenericOptionsParser(job, args);
    job.setJobName(Partitioner.class.getSimpleName() + " + " + CosineAllPartitionMain.class.getSimpleName());
    job.setJarByClass(CosineAllPartitionMain.class);

    job = setMapReduce(job, CosineAllPartMapper.class, IdentityReducer.class);
    job = setInputOutput(job, new Path(Partitioner.OUTPUT_DIR), interPath);
    JobSubmitter.run(job, "Cosine Partitioning",
            job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE));
    FileSystem.get(job).delete(new Path(Partitioner.OUTPUT_DIR), true);
    return job;
}
From source file:edu.ucsb.cs.partitioning.cosine.HolderCosinePartitionMain.java
License:Apache License
/**
 * Executes four jobs. Job1 does norm-sorting. Job2 is a regular Java
 * program that uniformly partitions the records. Job3 runs the cosine
 * partitioner, and finally Job4 is the organizer that renames files into Gij
 * and removes unnecessary partitioning through merging.
 */
public static void main(String[] args) throws IOException {
    runSort(args, "normsort"); // sorts based on p-norm // remove comments
    JobConf job = runUniformPartition(args, 1); // remove comment
    runCosinePartition(job, args, HolderCosinePartitionMain.class, CosineNormPartMapper.class);
    rewritePartitions(job); // rename to job !! remove comment
    job.setJarByClass(HolderCosinePartitionMain.class);
}
From source file:edu.ucsb.cs.partitioning.cosine.Partitioner.java
License:Apache License
/**
 * Uniformly partitions the sequence vectors given the number of partitions
 * set in the configuration file. It also writes information about its
 * partitions (maximum p-norms, weights, or norms/weights/lengths) to a file
 * that guides the skipping in the core static partitioning that follows.
 *
 * @param norm_weight_all
 * @return
 */
public static JobConf main(String[] args, int norm_weight_all) throws IOException {
    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    job.setJarByClass(Partitioner.class);
    System.out.println(
            JobSubmitter.stars() + "\n Running partitioner to prepare uniform partitions (Single JVM) ");

    String inputDir = SortDriver.OUTPUT_DIR, maxDir;
    if (norm_weight_all == 1)
        maxDir = "/maxpnorm"; // maxDir = inputDir + "/maxpnorm";
    else if (norm_weight_all == 2)
        maxDir = "/maxweight"; // maxDir = inputDir + "/maxweight";
    else
        maxDir = "/maxall"; // maxDir = inputDir + "/maxall";

    if (!(new Path(inputDir).getFileSystem(job)).exists(new Path(inputDir)))
        throw new UnsupportedOperationException("ERROR: " + inputDir + " directory not set.");

    job.set(MAX_DIR_PATH, maxDir);
    job.set(Config.NUM_PARTITIONS_PROPERTY,
            Integer.toString(produceStaticParitions(job, inputDir, OUTPUT_DIR, maxDir,
                    job.getInt(Config.NUM_PARTITIONS_PROPERTY, Config.NUM_PARTITIONS_VALUE), norm_weight_all)));
    return job;
}
From source file:edu.ucsb.cs.partitioning.jaccard.JaccardCoarsePartitionMain.java
License:Apache License
public static void main(String[] args) throws IOException {
    runSort(args, "lengthsort");
    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    job.setJobName(JaccardCoarsePartitionMain.class.getSimpleName());
    job.setJarByClass(JaccardCoarsePartitionMain.class);

    //
    // set input & output & threshold & numPartitions
    //
    String inputDir = PartDriver.INPUT_DIR; // String inputDir = SortDriver.OUTPUT_DIR;
    FileSystem.get(job).delete(new Path(PartDriver.OUTPUT_DIR), true);
    float threshold = job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE);
    int nPartitions = job.getInt(Config.NUM_PARTITIONS_PROPERTY, Config.NUM_PARTITIONS_VALUE);

    //
    // run regular java program
    //
    System.out.println(JobSubmitter.stars() + "\n Running Sequential Job: jaccard coarse 1D partitioning "
            + "\n Threshold: " + threshold);
    FileSystem hdfs = produceStaticParitions(inputDir, PartDriver.OUTPUT_DIR, nPartitions);
    produceSkipList(true, threshold, nPartitions, hdfs, job);
    Collector.printJaccardStatistics(job, PartDriver.OUTPUT_DIR);
}
From source file:edu.ucsb.cs.partitioning.lsh.LshPartitionMain.java
License:Apache License
public static void main(String args[]) throws ParseException, IOException {
    JobConf job = new JobConf();
    job.setJarByClass(LshPartitionMain.class);
    job.setJobName(LshPartitionMain.class.getSimpleName());
    GenericOptionsParser gop = new GenericOptionsParser(job, args);
    args = gop.getRemainingArgs();

    job.setMapperClass(LshMapper.class);
    job.setMapOutputKeyClass(IntArrayWritable.class); // signatures
    job.setMapOutputValueClass(LongWritable.class); // doc IDs
    job.setNumReduceTasks(job.getInt(NUM_REDUCERS_PROPERTY, NUM_REDUCERS_VALUE));
    job.setReducerClass(LshReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    String inputDir = args[0];
    if (inputDir == null) {
        throw new UnsupportedOperationException("ERROR: input directory not set.");
    }
    FileInputFormat.addInputPath(job, new Path(inputDir));
    Path outputPath = new Path("lsh-jaccard-buckets");
    FileOutputFormat.setOutputPath(job, outputPath);
    FileSystem.get(job).delete(outputPath, true);

    LshTable lshTable = new LshTable(job.getInt(K_PROPERTY, K_VALUE), job.getInt(L_PROPERTY, L_VALUE), 1024,
            job.getLong(NUM_FEATURES_PROPERTY, NUM_FEATURES_VALUE),
            job.getFloat(THRESHOLD_PROPERTY, THRESHOLD_VALUE));

    writeLsh(job, outputPath.getFileSystem(job), lshTable);

    run(job);
}
From source file:edu.ucsb.cs.preprocessing.sequence.SeqWriter.java
License:Apache License
/**
 * Runs a map-only MR job to convert an input directory of numeric-valued
 * records to Hadoop sequence format. It assumes text input in the format
 * [id feature weight ...].
 */
public static void writeSequence() throws IOException {
    JobConf job = new JobConf();
    job.setJobName("Convert text vectors to hadoop sequence");
    job.setJarByClass(SeqWriter.class);
    job.setMapperClass(SeqMapper.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(FeatureWeightArrayWritable.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FeatureWeightArrayWritable.class);

    job.setInputFormat(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path(INPUT_DIR));
    FileSystem.get(job).delete(new Path(HashPagesDriver.IDS_FILE2), true);
    Path outputPath = new Path(OUTPUT_DIR);
    FileSystem.get(job).delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);

    JobSubmitter.run(job, "PREPROCESS", -1);
}