List of usage examples for org.apache.hadoop.mapreduce.filecache DistributedCache addCacheFile
@Deprecated public static void addCacheFile(URI uri, Configuration conf)
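All of the examples below share the same basic pattern: qualify a path on the cluster filesystem, optionally append a "#name" fragment to control the symlink created in the task working directory, and register the URI with addCacheFile against the job's Configuration before the job is submitted. The following minimal sketch shows that pattern in isolation; the path /user/me/lookup.txt, the symlink name "lookup", and the class name CacheFileExample are placeholders for illustration, not taken from any of the projects listed below.

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;

public class CacheFileExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Register an HDFS file under the symlink "lookup"; the fragment after '#'
        // names the link that appears in each task's working directory.
        // (Path and symlink name are hypothetical.)
        DistributedCache.addCacheFile(new URI("/user/me/lookup.txt#lookup"), conf);
        DistributedCache.createSymlink(conf);

        // Create the Job after registering the cache file, so the cache settings
        // are copied into the job's configuration at construction time.
        Job job = new Job(conf, "cache-file-example");
        // ... set mapper/reducer, input/output paths, then submit the job ...
    }
}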
From source file:bdss.cmu.edu.Sort.java
License:Apache License
/**
 * The main driver for sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *         job tracker.
 */
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = conf.get(REDUCES_PER_HOST);
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    job = new Job(conf);
    job.setJobName("sorter");
    job.setJarByClass(Sort.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    job.setNumReduceTasks(num_reduces);

    job.setInputFormatClass(inputFormatClass);
    job.setOutputFormatClass(outputFormatClass);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(job, otherArgs.get(0));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        job.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(job)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        InputSampler.<K, V>writePartitionFile(job, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, conf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(job)[0] + " into " + FileOutputFormat.getOutputPath(job)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}
From source file:com.ailk.oci.ocnosql.tools.load.single.SingleColumnImportTsv.java
License:Apache License
/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This
 * <ul>
 * <li>Inspects the table to configure a total order partitioner</li>
 * <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
 * <li>Sets the number of reduce tasks to match the current number of regions</li>
 * <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
 * <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
 * PutSortReducer)</li>
 * </ul>
 * The user should be sure to set the map output value class to either KeyValue or Put before
 * running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    Class<? extends Partitioner> topClass;
    try {
        topClass = getTotalOrderPartitionerClass();
    } catch (ClassNotFoundException e) {
        throw new IOException("Failed getting TotalOrderPartitioner", e);
    }
    // Partitioner
    job.setPartitionerClass(topClass);
    // Set the key class for the job output data
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    // Set the value class for job outputs
    job.setOutputValueClass(KeyValue.class);
    // Output format: HFile
    job.setOutputFormatClass(HFileOutputFormat2.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(SingleColumnReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    // Get the start key of each region
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions "
            + "to match current region count");
    // One reduce task per region
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_" + UUID.randomUUID());
    LOG.info("Writing partition information to " + partitionsPath);

    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);
    URI cacheUri;
    try {
        // Below we make explicit reference to the bundled TOP. It's cheating.
        // We assume the definition in the hbase bundled TOP is as it is in
        // hadoop (whether 0.20 or 0.22, etc.)
        /*
        cacheUri = new URI(partitionsPath.toString() + "#"
                + org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner.DEFAULT_PATH);
        */
        cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    // Set compression algorithms based on column families
    configureCompression(table, conf);

    TableMapReduceUtil.addDependencyJars(job);
    LOG.info("Incremental table output configured.");
}
From source file:com.phantom.hadoop.examples.Sort.java
License:Apache License
/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = conf.get(REDUCES_PER_HOST);
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    job = new Job(conf);
    job.setJobName("sorter");
    job.setJarByClass(Sort.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    job.setNumReduceTasks(num_reduces);

    job.setInputFormatClass(inputFormatClass);
    job.setOutputFormatClass(outputFormatClass);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(job, otherArgs.get(0));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        job.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(job)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
        InputSampler.<K, V>writePartitionFile(job, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, conf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(job)[0] + " into " + FileOutputFormat.getOutputPath(job)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}
From source file:gobblin.runtime.mapreduce.MRJobLauncher.java
License:Apache License
@VisibleForTesting
static void serializeJobState(FileSystem fs, Path mrJobDir, Configuration conf, JobState jobState, Job job)
        throws IOException {
    Path jobStateFilePath = new Path(mrJobDir, JOB_STATE_FILE_NAME);
    // Write the job state with an empty task set (work units are read by the mapper from a different file)
    try (DataOutputStream dataOutputStream = new DataOutputStream(fs.create(jobStateFilePath))) {
        jobState.write(dataOutputStream, false);
    }

    job.getConfiguration().set(ConfigurationKeys.JOB_STATE_FILE_PATH_KEY, jobStateFilePath.toString());

    DistributedCache.addCacheFile(jobStateFilePath.toUri(), job.getConfiguration());
    job.getConfiguration().set(ConfigurationKeys.JOB_STATE_DISTRIBUTED_CACHE_NAME, jobStateFilePath.getName());
}
From source file:gobblin.runtime.mapreduce.MRJobLauncher.java
License:Apache License
/**
 * Add local non-jar files the job depends on to DistributedCache.
 */
@SuppressWarnings("deprecation")
private void addLocalFiles(Path jobFileDir, String jobFileList, Configuration conf) throws IOException {
    DistributedCache.createSymlink(conf);
    for (String jobFile : SPLITTER.split(jobFileList)) {
        Path srcJobFile = new Path(jobFile);
        // DistributedCache requires absolute path, so we need to use makeQualified.
        Path destJobFile = new Path(this.fs.makeQualified(jobFileDir), srcJobFile.getName());
        // Copy the file from local file system to HDFS
        this.fs.copyFromLocalFile(srcJobFile, destJobFile);
        // Create a URI that is in the form path#symlink
        URI destFileUri = URI.create(destJobFile.toUri().getPath() + "#" + destJobFile.getName());
        LOG.info(String.format("Adding %s to DistributedCache", destFileUri));
        // Finally add the file to DistributedCache with a symlink named after the file name
        DistributedCache.addCacheFile(destFileUri, conf);
    }
}
From source file:gobblin.runtime.mapreduce.MRJobLauncher.java
License:Apache License
/**
 * Add non-jar files already on HDFS that the job depends on to DistributedCache.
 */
@SuppressWarnings("deprecation")
private void addHDFSFiles(String jobFileList, Configuration conf) {
    DistributedCache.createSymlink(conf);
    jobFileList = PasswordManager.getInstance(this.jobProps).readPassword(jobFileList);
    for (String jobFile : SPLITTER.split(jobFileList)) {
        Path srcJobFile = new Path(jobFile);
        // Create a URI that is in the form path#symlink
        URI srcFileUri = URI.create(srcJobFile.toUri().getPath() + "#" + srcJobFile.getName());
        LOG.info(String.format("Adding %s to DistributedCache", srcFileUri));
        // Finally add the file to DistributedCache with a symlink named after the file name
        DistributedCache.addCacheFile(srcFileUri, conf);
    }
}
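The two MRJobLauncher helpers above only populate the cache; the consuming side does not appear in this listing. A hypothetical mapper could locate the cached files in setup() roughly as follows. This is a sketch only: the class name CacheReadingMapper, the symlink name "lookup", and the key/value types are assumptions, not part of the Gobblin code.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;

// Hypothetical mapper that reads a file previously registered with addCacheFile.
public class CacheReadingMapper extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Option 1: enumerate the localized copies of all cached files.
        Path[] localFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
        if (localFiles != null) {
            for (Path localFile : localFiles) {
                // localFile points at the copy on the task's local filesystem
            }
        }

        // Option 2: when createSymlink(conf) was called and the URI carried a
        // "#name" fragment, the file is also reachable by that name in the
        // task working directory ("lookup" is a placeholder symlink name).
        try (BufferedReader reader = new BufferedReader(new FileReader("lookup"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // parse the side data, e.g. populate an in-memory map
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // use the side data loaded in setup()
    }
}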
From source file:ipldataanalysis4.IPLDataAnalysis4.java
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.printf("Two parameters are required for Data Analysis for IPL- <input dir> <output dir>\n");
        return -1;
    }

    Configuration conf = new Configuration();
    DistributedCache.addCacheFile(new URI("testFile"), conf);
    Job job = new Job(getConf(), "Job1");
    long milliSeconds = 1000 * 60 * 60;
    conf.setLong("mapred.task.timeout", milliSeconds);
    job.setJarByClass(IPLDataAnalysis4.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setMapperClass(BloomMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(0);
    //job.setReducerClass(DataAnalysisReducer.class);
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
From source file:nl.sanoma.hdt.report.generator.ReportGeneratorDriver.java
License:Open Source License
/**
 * Job to join the data and the metadata from distributed cache and
 * calculate the revenue by quarter and the most popular product category per user.
 *
 * @param dBPath the path of the import MapFile
 * @param inputPath the path of the logs directory
 * @param outputPath the path of the output directory
 * @return returns the exitCode of the job
 * @throws IOException
 * @throws URISyntaxException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Boolean generateReport(String dBPath, String inputPath, String outputPath)
        throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
    Job job = new Job(getConf());
    Configuration conf = job.getConfiguration();
    job.setJobName("Repor Generator");

    DistributedCache.addCacheFile(new URI(dBPath), conf);

    job.setJarByClass(ReportGeneratorDriver.class);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setPartitionerClass(KeyDataPartitioner.class);
    job.setGroupingComparatorClass(KeyDataGroupingComparator.class);
    job.setSortComparatorClass(KeyDataComparator.class);

    job.setMapperClass(ReportGeneratorMapper.class);
    job.setMapOutputKeyClass(KeyData.class);
    job.setMapOutputValueClass(ValueData.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setReducerClass(ReportGeneratorReducer.class);
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true);
}
From source file:org.apache.gobblin.runtime.mapreduce.MRJobLauncher.java
License:Apache License
@VisibleForTesting
static void serializeJobState(FileSystem fs, Path mrJobDir, Configuration conf, JobState jobState, Job job)
        throws IOException {
    Path jobStateFilePath = new Path(mrJobDir, JOB_STATE_FILE_NAME);
    // Write the job state with an empty task set (work units are read by the mapper from a different file)
    try (DataOutputStream dataOutputStream = new DataOutputStream(fs.create(jobStateFilePath))) {
        jobState.write(dataOutputStream, false,
                conf.getBoolean(SERIALIZE_PREVIOUS_WORKUNIT_STATES_KEY, DEFAULT_SERIALIZE_PREVIOUS_WORKUNIT_STATES));
    }

    job.getConfiguration().set(ConfigurationKeys.JOB_STATE_FILE_PATH_KEY, jobStateFilePath.toString());

    DistributedCache.addCacheFile(jobStateFilePath.toUri(), job.getConfiguration());
    job.getConfiguration().set(ConfigurationKeys.JOB_STATE_DISTRIBUTED_CACHE_NAME, jobStateFilePath.getName());
}
From source file:org.apache.sysml.runtime.matrix.CSVReblockMR.java
License:Apache License
private static JobReturn runCSVReblockJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos,
        long[] rlens, long[] clens, int[] brlens, int[] bclens, String reblockInstructions,
        String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes,
        String[] outputs, OutputInfo[] outputInfos, Path counterFile, String[] smallestFiles) throws Exception {
    JobConf job;
    job = new JobConf(ReblockMR.class);
    job.setJobName("CSV-Reblock-MR");

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
            ConvertTarget.CELL);
    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);

    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            reblockInstructions, null, otherInstructionsInReducer, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;

    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS),
            ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    // Print the complete instruction
    //if (LOG.isTraceEnabled())
    //    inst.printCompelteMRJobInstruction(stats);

    // Update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            resultDimsUnknown[i] = (byte) 1;
        } else {
            resultDimsUnknown[i] = (byte) 0;
        }
    }

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos,
            true, true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVReblockMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    Path cachefile = new Path(counterFile, "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    job.set(ROWID_FILE_NAME, cachefile.toString());

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(counterFile, job);

    /* Process different counters */
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        // System.out.println("result #"+resultIndexes[i]+" ===>\n"+stats[i]);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}