List of usage examples for org.apache.hadoop.mapreduce.Job#setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
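This method registers the Partitioner implementation that decides which reduce task receives each map output key; per the Hadoop javadoc, it throws IllegalStateException if the job has already been submitted. Before the source-file listings below, here is a minimal self-contained sketch; the WordPartitioner class and the Text/IntWritable key and value types are illustrative assumptions, not drawn from any of the sources that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

public class WordPartitionerExample {

  // Illustrative partitioner: buckets keys by their first character so that
  // keys sharing an initial character land on the same reducer.
  public static class WordPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
      if (key.getLength() == 0) {
        return 0;
      }
      // Mask the sign bit so the partition index is never negative.
      return (key.charAt(0) & Integer.MAX_VALUE) % numPartitions;
    }
  }

  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "partitioner example");
    job.setJarByClass(WordPartitionerExample.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    // Must be called before the job is submitted, otherwise IllegalStateException.
    job.setPartitionerClass(WordPartitioner.class);
  }
}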
From source file:org.apache.accumulo.examples.mapreduce.bulk.BulkIngestExample.java
License:Apache License
@Override
public int run(String[] args) {
  Opts opts = new Opts();
  opts.parseArgs(BulkIngestExample.class.getName(), args);
  Configuration conf = getConf();
  PrintStream out = null;
  try {
    Job job = Job.getInstance(conf);
    job.setJobName("bulk ingest example");
    job.setJarByClass(this.getClass());

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(MapClass.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(ReduceClass.class);
    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    opts.setAccumuloConfigs(job);

    Connector connector = opts.getConnector();

    TextInputFormat.setInputPaths(job, new Path(opts.inputDir));
    AccumuloFileOutputFormat.setOutputPath(job, new Path(opts.workDir + "/files"));

    FileSystem fs = FileSystem.get(conf);
    out = new PrintStream(new BufferedOutputStream(fs.create(new Path(opts.workDir + "/splits.txt"))));

    Collection<Text> splits = connector.tableOperations().listSplits(opts.getTableName(), 100);
    for (Text split : splits)
      out.println(Base64.getEncoder().encodeToString(TextUtil.getBytes(split)));

    job.setNumReduceTasks(splits.size() + 1);
    out.close();

    job.setPartitionerClass(RangePartitioner.class);
    RangePartitioner.setSplitFile(job, opts.workDir + "/splits.txt");

    job.waitForCompletion(true);
    Path failures = new Path(opts.workDir, "failures");
    fs.delete(failures, true);
    fs.mkdirs(new Path(opts.workDir, "failures"));

    // With HDFS permissions on, we need to make sure the Accumulo user can read/move the rfiles
    FsShell fsShell = new FsShell(conf);
    fsShell.run(new String[] { "-chmod", "-R", "777", opts.workDir });

    connector.tableOperations().importDirectory(opts.getTableName(), opts.workDir + "/files",
        opts.workDir + "/failures", false);
  } catch (Exception e) {
    throw new RuntimeException(e);
  } finally {
    if (out != null)
      out.close();
  }
  return 0;
}
From source file:org.apache.accumulo.examples.simple.mapreduce.bulk.BulkIngestExample.java
License:Apache License
public int run(String[] args) {
  if (args.length != 7) {
    System.out.println("ERROR: Wrong number of parameters: " + args.length + " instead of 7.");
    return printUsage();
  }
  Configuration conf = getConf();
  PrintStream out = null;
  try {
    Job job = new Job(conf, "bulk ingest example");
    job.setJarByClass(this.getClass());

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(MapClass.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(ReduceClass.class);
    job.setOutputFormatClass(AccumuloFileOutputFormat.class);

    Instance instance = new ZooKeeperInstance(args[0], args[1]);
    String user = args[2];
    byte[] pass = args[3].getBytes();
    String tableName = args[4];
    String inputDir = args[5];
    String workDir = args[6];

    Connector connector = instance.getConnector(user, pass);

    TextInputFormat.setInputPaths(job, new Path(inputDir));
    AccumuloFileOutputFormat.setOutputPath(job, new Path(workDir + "/files"));

    FileSystem fs = FileSystem.get(conf);
    out = new PrintStream(new BufferedOutputStream(fs.create(new Path(workDir + "/splits.txt"))));

    Collection<Text> splits = connector.tableOperations().getSplits(tableName, 100);
    for (Text split : splits)
      out.println(new String(Base64.encodeBase64(TextUtil.getBytes(split))));

    job.setNumReduceTasks(splits.size() + 1);
    out.close();

    job.setPartitionerClass(RangePartitioner.class);
    RangePartitioner.setSplitFile(job, workDir + "/splits.txt");

    job.waitForCompletion(true);
    Path failures = new Path(workDir, "failures");
    fs.delete(failures, true);
    fs.mkdirs(new Path(workDir, "failures"));

    connector.tableOperations().importDirectory(tableName, workDir + "/files", workDir + "/failures", false);
  } catch (Exception e) {
    throw new RuntimeException(e);
  } finally {
    if (out != null)
      out.close();
  }
  return 0;
}
From source file:org.apache.accumulo.examples.simple.mapreduce.bulk.BulkIngestExample.java
License:Apache License
@Override
public int run(String[] args) {
  Opts opts = new Opts();
  opts.parseArgs(BulkIngestExample.class.getName(), args);
  Configuration conf = getConf();
  PrintStream out = null;
  try {
    Job job = JobUtil.getJob(conf);
    job.setJobName("bulk ingest example");
    job.setJarByClass(this.getClass());

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(MapClass.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(ReduceClass.class);
    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    opts.setAccumuloConfigs(job);

    Connector connector = opts.getConnector();

    TextInputFormat.setInputPaths(job, new Path(opts.inputDir));
    AccumuloFileOutputFormat.setOutputPath(job, new Path(opts.workDir + "/files"));

    FileSystem fs = FileSystem.get(conf);
    out = new PrintStream(new BufferedOutputStream(fs.create(new Path(opts.workDir + "/splits.txt"))));

    Collection<Text> splits = connector.tableOperations().listSplits(opts.getTableName(), 100);
    for (Text split : splits)
      out.println(Base64.encodeBase64String(TextUtil.getBytes(split)));

    job.setNumReduceTasks(splits.size() + 1);
    out.close();

    job.setPartitionerClass(RangePartitioner.class);
    RangePartitioner.setSplitFile(job, opts.workDir + "/splits.txt");

    job.waitForCompletion(true);
    Path failures = new Path(opts.workDir, "failures");
    fs.delete(failures, true);
    fs.mkdirs(new Path(opts.workDir, "failures"));

    connector.tableOperations().importDirectory(opts.getTableName(), opts.workDir + "/files",
        opts.workDir + "/failures", false);
  } catch (Exception e) {
    throw new RuntimeException(e);
  } finally {
    if (out != null)
      out.close();
  }
  return 0;
}
From source file:org.apache.accumulo.server.test.randomwalk.shard.SortTool.java
License:Apache License
public int run(String[] args) throws Exception {
  Job job = new Job(getConf(), this.getClass().getSimpleName());
  job.setJarByClass(this.getClass());

  if (job.getJar() == null) {
    log.error("M/R requires a jar file! Run mvn package.");
    return 1;
  }

  job.setInputFormatClass(SequenceFileInputFormat.class);
  SequenceFileInputFormat.setInputPaths(job, seqFile);

  job.setPartitionerClass(KeyRangePartitioner.class);
  KeyRangePartitioner.setSplitFile(job, splitFile);

  job.setMapOutputKeyClass(Key.class);
  job.setMapOutputValueClass(Value.class);

  job.setNumReduceTasks(splits.size() + 1);

  job.setOutputFormatClass(AccumuloFileOutputFormat.class);
  AccumuloFileOutputFormat.setOutputPath(job, new Path(outputDir));

  job.waitForCompletion(true);
  return job.isSuccessful() ? 0 : 1;
}
From source file:org.apache.accumulo.test.randomwalk.shard.SortTool.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  Job job = Job.getInstance(getConf(), this.getClass().getSimpleName());
  job.setJarByClass(this.getClass());

  if (job.getJar() == null) {
    log.error("M/R requires a jar file! Run mvn package.");
    return 1;
  }

  job.setInputFormatClass(SequenceFileInputFormat.class);
  SequenceFileInputFormat.setInputPaths(job, seqFile);

  job.setPartitionerClass(KeyRangePartitioner.class);
  KeyRangePartitioner.setSplitFile(job, splitFile);

  job.setMapOutputKeyClass(Key.class);
  job.setMapOutputValueClass(Value.class);

  job.setNumReduceTasks(splits.size() + 1);

  job.setOutputFormatClass(AccumuloFileOutputFormat.class);
  AccumuloFileOutputFormat.setOutputPath(job, new Path(outputDir));

  job.waitForCompletion(true);
  return job.isSuccessful() ? 0 : 1;
}
From source file:org.apache.blur.mapreduce.lib.update.Driver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
  int c = 0;
  if (args.length < 5) {
    System.err.println(
        "Usage Driver <table> <mr inc working path> <output path> <zk connection> <reducer multipler> <extra config files...>");
    return 1; // missing in the original listing; without it the argument parsing below would fail
  }
  String table = args[c++];
  String mrIncWorkingPathStr = args[c++];
  String outputPathStr = args[c++];
  String blurZkConnection = args[c++];
  int reducerMultipler = Integer.parseInt(args[c++]);
  for (; c < args.length; c++) {
    String externalConfigFileToAdd = args[c];
    getConf().addResource(new Path(externalConfigFileToAdd));
  }

  Path outputPath = new Path(outputPathStr);
  Path mrIncWorkingPath = new Path(mrIncWorkingPathStr);
  FileSystem fileSystem = mrIncWorkingPath.getFileSystem(getConf());

  Path newData = new Path(mrIncWorkingPath, NEW);
  Path inprogressData = new Path(mrIncWorkingPath, INPROGRESS);
  Path completeData = new Path(mrIncWorkingPath, COMPLETE);
  Path fileCache = new Path(mrIncWorkingPath, CACHE);

  fileSystem.mkdirs(newData);
  fileSystem.mkdirs(inprogressData);
  fileSystem.mkdirs(completeData);
  fileSystem.mkdirs(fileCache);

  List<Path> srcPathList = new ArrayList<Path>();
  for (FileStatus fileStatus : fileSystem.listStatus(newData)) {
    srcPathList.add(fileStatus.getPath());
  }
  if (srcPathList.isEmpty()) {
    return 0;
  }

  List<Path> inprogressPathList = new ArrayList<Path>();
  boolean success = false;
  Iface client = null;
  try {
    inprogressPathList = movePathList(fileSystem, inprogressData, srcPathList);

    Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");
    client = BlurClient.getClientFromZooKeeperConnectionStr(blurZkConnection);
    waitForOtherSnapshotsToBeRemoved(client, table, MRUPDATE_SNAPSHOT);
    client.createSnapshot(table, MRUPDATE_SNAPSHOT);
    TableDescriptor descriptor = client.describe(table);
    Path tablePath = new Path(descriptor.getTableUri());

    BlurInputFormat.setLocalCachePath(job, fileCache);
    BlurInputFormat.addTable(job, descriptor, MRUPDATE_SNAPSHOT);
    MultipleInputs.addInputPath(job, tablePath, BlurInputFormat.class, MapperForExistingData.class);
    for (Path p : inprogressPathList) {
      FileInputFormat.addInputPath(job, p);
      MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, MapperForNewData.class);
    }

    BlurOutputFormat.setOutputPath(job, outputPath);
    BlurOutputFormat.setupJob(job, descriptor);

    job.setReducerClass(UpdateReducer.class);
    job.setMapOutputKeyClass(IndexKey.class);
    job.setMapOutputValueClass(IndexValue.class);
    job.setPartitionerClass(IndexKeyPartitioner.class);
    job.setGroupingComparatorClass(IndexKeyWritableComparator.class);

    BlurOutputFormat.setReducerMultiplier(job, reducerMultipler);

    success = job.waitForCompletion(true);
    Counters counters = job.getCounters();
    LOG.info("Counters [" + counters + "]");
  } finally {
    if (success) {
      LOG.info("Indexing job succeeded!");
      movePathList(fileSystem, completeData, inprogressPathList);
    } else {
      LOG.error("Indexing job failed!");
      movePathList(fileSystem, newData, inprogressPathList);
    }
    if (client != null) {
      client.removeSnapshot(table, MRUPDATE_SNAPSHOT);
    }
  }
  return success ? 0 : 1;
}
From source file:org.apache.crunch.GroupingOptions.java
License:Apache License
public void configure(Job job) {
  if (partitionerClass != null) {
    job.setPartitionerClass(partitionerClass);
  }
  if (groupingComparatorClass != null) {
    job.setGroupingComparatorClass(groupingComparatorClass);
  }
  if (sortComparatorClass != null) {
    job.setSortComparatorClass(sortComparatorClass);
  }
  if (numReducers > 0) {
    job.setNumReduceTasks(numReducers);
  }
  for (Map.Entry<String, String> e : extraConf.entrySet()) {
    job.getConfiguration().set(e.getKey(), e.getValue());
  }
}
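In practice a GroupingOptions instance is assembled with the library's builder before configure(Job) runs. A hedged sketch, assuming a hypothetical ShardPartitioner class; the builder methods mirror the fields applied above:

GroupingOptions options = GroupingOptions.builder()
    .partitionerClass(ShardPartitioner.class) // hypothetical Partitioner implementation
    .numReducers(10)
    .build();
options.configure(job);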
From source file:org.apache.druid.indexer.SortableBytes.java
License:Apache License
public static void useSortableBytesAsMapOutputKey(Job job, Class<? extends Partitioner> partitionerClass) {
  job.setMapOutputKeyClass(BytesWritable.class);
  job.setGroupingComparatorClass(SortableBytesGroupingComparator.class);
  job.setSortComparatorClass(SortableBytesSortingComparator.class);
  job.setPartitionerClass(partitionerClass);
}
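A one-line usage sketch of the helper above; MyBytesPartitioner is a placeholder for whatever Partitioner implementation the caller supplies, not a Druid class:

// Wires the key class, both comparators, and the caller's partitioner in a single call.
SortableBytes.useSortableBytesAsMapOutputKey(job, MyBytesPartitioner.class);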
From source file:org.apache.gora.mapreduce.GoraMapper.java
License:Apache License
/**
 * Initializes the Mapper, and sets input parameters for the job. All of
 * the records in the dataStore are used as the input. If you want to
 * include a specific subset, use one of the overloaded methods which takes
 * a query parameter.
 * @param job the job to set the properties for
 * @param dataStoreClass the datastore class
 * @param inKeyClass Map input key class
 * @param inValueClass Map input value class
 * @param outKeyClass Map output key class
 * @param outValueClass Map output value class
 * @param mapperClass the mapper class extending GoraMapper
 * @param partitionerClass optional partitioner class
 * @param reuseObjects whether to reuse objects in serialization
 */
@SuppressWarnings("rawtypes")
public static <K1, V1 extends Persistent, K2, V2> void initMapperJob(Job job,
    Class<? extends DataStore<K1, V1>> dataStoreClass, Class<K1> inKeyClass, Class<V1> inValueClass,
    Class<K2> outKeyClass, Class<V2> outValueClass, Class<? extends GoraMapper> mapperClass,
    Class<? extends Partitioner> partitionerClass, boolean reuseObjects) throws IOException {
  // set the input via GoraInputFormat
  GoraInputFormat.setInput(job, dataStoreClass, inKeyClass, inValueClass, reuseObjects);

  job.setMapperClass(mapperClass);
  job.setMapOutputKeyClass(outKeyClass);
  job.setMapOutputValueClass(outValueClass);

  if (partitionerClass != null) {
    job.setPartitionerClass(partitionerClass);
  }
}
From source file:org.apache.gora.mapreduce.GoraMapper.java
License:Apache License
/**
 * Initializes the Mapper, and sets input parameters for the job.
 * @param job the job to set the properties for
 * @param query the query to get the inputs from
 * @param dataStore the datastore as the input
 * @param outKeyClass Map output key class
 * @param outValueClass Map output value class
 * @param mapperClass the mapper class extending GoraMapper
 * @param partitionerClass optional partitioner class
 * @param reuseObjects whether to reuse objects in serialization
 */
@SuppressWarnings("rawtypes")
public static <K1, V1 extends Persistent, K2, V2> void initMapperJob(Job job, Query<K1, V1> query,
    DataStore<K1, V1> dataStore, Class<K2> outKeyClass, Class<V2> outValueClass,
    Class<? extends GoraMapper> mapperClass, Class<? extends Partitioner> partitionerClass,
    boolean reuseObjects) throws IOException {
  // set the input via GoraInputFormat
  GoraInputFormat.setInput(job, query, dataStore, reuseObjects);

  job.setMapperClass(mapperClass);
  job.setMapOutputKeyClass(outKeyClass);
  job.setMapOutputValueClass(outValueClass);

  if (partitionerClass != null) {
    job.setPartitionerClass(partitionerClass);
  }
}
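A hedged sketch of calling the query-based overload above; the mapper and output classes are placeholders, not Gora types:

GoraMapper.initMapperJob(job, query, dataStore,
    Text.class, LongWritable.class,  // map output key/value classes
    MyGoraMapper.class,              // hypothetical mapper extending GoraMapper
    null,                            // no custom partitioner; Hadoop's default HashPartitioner applies
    true);                           // reuse objects during serialization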