List of usage examples for org.apache.hadoop.mapred JobConf setInt
public void setInt(String name, int value)
Sets the value of the name property to an int.
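Before the per-project examples, here is a minimal, self-contained sketch of the call itself; the property name "example.max.retries", the default value, and the SetIntExample class are illustrative placeholders rather than anything defined by Hadoop:

import org.apache.hadoop.mapred.JobConf;

public class SetIntExample {
    public static void main(String[] args) {
        JobConf job = new JobConf(SetIntExample.class);
        // Store an int under a (made-up) property name ...
        job.setInt("example.max.retries", 3);
        // ... and read it back; the second argument is the default returned when the property is unset.
        int retries = job.getInt("example.max.retries", 1);
        System.out.println("example.max.retries = " + retries);
    }
}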
From source file:com.ricemap.spateDB.util.RandomSpatialGenerator.java
License:Apache License
public static void generateMapReduce(Path file, Prism mbr, long size, long blocksize, Shape shape,
        String sindex, long seed, int rectsize, RandomShapeGenerator.DistributionType type, boolean overwrite)
        throws IOException {
    JobConf job = new JobConf(RandomSpatialGenerator.class);
    job.setJobName("Generator");
    FileSystem outFs = file.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(file)) {
        if (overwrite)
            outFs.delete(file, true);
        else
            throw new RuntimeException(
                    "Output file '" + file + "' already exists and overwrite flag is not set");
    }

    // Set generation parameters in job
    job.setLong(RandomShapeGenerator.GenerationSize, size);
    SpatialSite.setPrism(job, RandomShapeGenerator.GenerationMBR, mbr);
    if (seed != 0)
        job.setLong(RandomShapeGenerator.GenerationSeed, seed);
    if (rectsize != 0)
        job.setInt(RandomShapeGenerator.GenerationRectSize, rectsize);
    if (type != null)
        job.set(RandomShapeGenerator.GenerationType, type.toString());

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();

    // Set input format and map class
    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Repartition.RepartitionMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(shape.getClass());
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    SpatialSite.setShapeClass(job, shape.getClass());

    if (blocksize != 0) {
        job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blocksize);
    }

    CellInfo[] cells;
    if (sindex == null) {
        cells = new CellInfo[] { new CellInfo(1, mbr) };
    } else if (sindex.equals("grid")) {
        GridInfo gridInfo = new GridInfo(mbr.t1, mbr.x1, mbr.y1, mbr.t2, mbr.x2, mbr.y2);
        FileSystem fs = file.getFileSystem(job);
        if (blocksize == 0) {
            blocksize = fs.getDefaultBlockSize(file);
        }
        int numOfCells = Repartition.calculateNumberOfPartitions(job, size, fs, file, blocksize);
        gridInfo.calculateCellDimensions(numOfCells);
        cells = gridInfo.getAllCells();
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cells);

    // Do not set a reduce function. Use the default identity reduce function
    if (cells.length == 1) {
        // All objects are in one partition. No need for a reduce phase
        job.setNumReduceTasks(0);
    } else {
        // More than one partition. Need a reduce phase to group shapes of the
        // same partition together
        job.setReducerClass(RepartitionReduce.class);
        job.setNumReduceTasks(
                Math.max(1, Math.min(cells.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, file);
    if (sindex == null || sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    JobClient.runJob(job);

    // Concatenate all master files into one file
    FileStatus[] resultFiles = outFs.listStatus(file, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("_master");
        }
    });
    String ext = resultFiles[0].getPath().getName()
            .substring(resultFiles[0].getPath().getName().lastIndexOf('.'));
    Path masterPath = new Path(file, "_master" + ext);
    OutputStream destOut = outFs.create(masterPath);
    byte[] buffer = new byte[4096];
    for (FileStatus f : resultFiles) {
        InputStream in = outFs.open(f.getPath());
        int bytes_read;
        do {
            bytes_read = in.read(buffer);
            if (bytes_read > 0)
                destOut.write(buffer, 0, bytes_read);
        } while (bytes_read > 0);
        in.close();
        outFs.delete(f.getPath(), false);
    }
    destOut.close();

    // Plot an image for the partitions used in file
    Path imagePath = new Path(file, "_partitions.png");
    int imageSize = (int) (Math.sqrt(cells.length) * 300);
    Plot.plotLocal(masterPath, imagePath, new Partition(), imageSize, imageSize, Color.BLACK, false, false,
            false);
}
From source file:com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapred.java
License:Apache License
static void updateJobConf(JobConf jobConf, TaskAttemptID taskAttemptID, int partition) {
    //---------------------------------------------------------------------------------
    // Based on the localizeConfiguration(...) method from Task.java, part of Apache Hadoop 1.2.0,
    // licensed under Apache License, Version 2.0
    //---------------------------------------------------------------------------------
    jobConf.set("mapred.tip.id", taskAttemptID.getTaskID().toString());
    jobConf.set("mapred.task.id", taskAttemptID.toString());
    jobConf.setBoolean("mapred.task.is.map", false);
    jobConf.setInt("mapred.task.partition", partition);
    jobConf.set("mapred.job.id", taskAttemptID.getJobID().toString());

    //---------------------------------------------------------------------------------
    // Based on the localizeConfiguration(...) method from Task.java, part of Apache Hadoop 2.2.0,
    // licensed under Apache License, Version 2.0
    //---------------------------------------------------------------------------------
    jobConf.set(TASK_ID, taskAttemptID.getTaskID().toString());
    jobConf.set(TASK_ATTEMPT_ID, taskAttemptID.toString());
    jobConf.setBoolean(TASK_ISMAP, false);
    jobConf.setInt(TASK_PARTITION, partition);
    jobConf.set(ID, taskAttemptID.getJobID().toString());
    //---------------------------------------------------------------------------------
}
From source file:com.scaleoutsoftware.soss.hserver.NamedMapInputFormatMapred.java
License:Apache License
/**
 * Sets {@link com.scaleoutsoftware.soss.client.map.NamedMap} as an input source for the job.
 *
 * @param configuration job to modify
 * @param map name of the map to be used as a job input
 * @param <K> the type of the key
 * @param <V> the type of the value
 */
public static <K, V> void setNamedMap(JobConf configuration, NamedMap<K, V> map) {
    configuration.setInt(inputAppIdProperty, map.getMapId());
    CustomSerializer<K> keySerializer = map.getKeySerializer();
    CustomSerializer<V> valueSerializer = map.getValueSerializer();
    configuration.setClass(inputNamedMapKeySerializerProperty, keySerializer.getClass(), Object.class);
    configuration.setClass(inputNamedMapValueSerializerProperty, valueSerializer.getClass(), Object.class);
    configuration.setInt(SERIALIZATION_MODE, map.getSerializationMode().ordinal());
    if (keySerializer.getObjectClass() != null) {
        configuration.setClass(inputNamedMapKeyProperty, keySerializer.getObjectClass(), Object.class);
    }
    if (valueSerializer.getObjectClass() != null) {
        configuration.setClass(inputNamedMapValueProperty, valueSerializer.getObjectClass(), Object.class);
    }
}
From source file:com.scaleoutsoftware.soss.hserver.Test_MapToMapCopyMapred.java
License:Apache License
public int run(String[] args) throws Exception {
    final NamedMap<IntWritable, Text> inputMap = NamedMapFactory.getMap("mapr-i",
            new WritableSerializer(IntWritable.class), new WritableSerializer(Text.class));
    final NamedMap<IntWritable, Text> outputMap = NamedMapFactory.getMap("mapr-o",
            new WritableSerializer(IntWritable.class), new WritableSerializer(Text.class));
    inputMap.clear();
    outputMap.clear();
    Thread.sleep(15000);

    BulkLoader<IntWritable, Text> put = inputMap.getBulkLoader();
    String content = "xcccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
    Text contentW = new Text(content);
    IntWritable count = new IntWritable();
    int expectedSize = 10000;
    for (int i = 0; i < expectedSize; i++) {
        count.set(i);
        put.put(count, contentW);
    }
    put.close();

    InvocationGrid grid = HServerJob.getInvocationGridBuilder("MyGrid" + System.currentTimeMillis())
            .addClass(Test_MapToMapCopyMapred.class).load();

    JobConf configuration = new JobConf(getConf(), Test_MapToMapCopyMapred.class);
    configuration.setInt("mapred.hserver.setting.reducer.usememorymappedfiles", 0);
    configuration.setMapOutputKeyClass(IntWritable.class);
    configuration.setMapOutputValueClass(Text.class);
    configuration.setOutputKeyClass(IntWritable.class);
    configuration.setOutputValueClass(Text.class);
    configuration.setInputFormat(NamedMapInputFormatMapred.class);
    configuration.setOutputFormat(NamedMapOutputFormatMapred.class);
    NamedMapInputFormatMapred.setNamedMap(configuration, inputMap);
    NamedMapOutputFormatMapred.setNamedMap(configuration, outputMap);

    assertEquals(inputMap.size(), outputMap.size() + expectedSize); // should be 0 + expected
    HServerJobClient.runJob(configuration, false, grid);
    assertEquals(inputMap.size(), outputMap.size());

    inputMap.clear();
    outputMap.clear();
    grid.unload();
    return 1;
}
From source file:com.TCG.Nutch_DNS.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:com.tomslabs.grid.avro.TextTypedBytesToAvroOutputFormat.java
License:Apache License
/** Enable output compression using the deflate codec and specify its level. */
public static void setDeflateLevel(JobConf job, int level) {
    FileOutputFormat.setCompressOutput(job, true);
    job.setInt(DEFLATE_LEVEL_KEY, level);
}
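A driver might call this helper while assembling its job configuration; a minimal sketch, where the deflate level 6 is an arbitrary illustration and MyAvroJob is a hypothetical driver class:

JobConf job = new JobConf(MyAvroJob.class);
// Turn on compressed output and request deflate level 6 via the helper above.
TextTypedBytesToAvroOutputFormat.setDeflateLevel(job, 6);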
From source file:com.uber.hoodie.hadoop.InputFormatTestUtil.java
License:Apache License
public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) {
    String modePropertyName = String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN,
            HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE);

    String startCommitTimestampName = String.format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN,
            HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.set(startCommitTimestampName, startCommit);

    String maxCommitPulls = String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN,
            HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.setInt(maxCommitPulls, numberOfCommitsToPull);
}
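In a test, the helper above might be wired up roughly as follows; the commit timestamp "20180101000000" and the pull count of 2 are made-up values for illustration:

JobConf jobConf = new JobConf();
// Switch the Hoodie input format to incremental scan mode, starting after the given commit
// and pulling at most 2 newer commits.
InputFormatTestUtil.setupIncremental(jobConf, "20180101000000", 2);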
From source file:com.yolodata.tbana.cascading.csv.CSVLine.java
License:Open Source License
@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    if (hasZippedFiles(FileInputFormat.getInputPaths(conf)))
        throw new IllegalStateException(
                "cannot read zip files: " + Arrays.toString(FileInputFormat.getInputPaths(conf)));

    conf.set(CSVLineRecordReader.FORMAT_DELIMITER, CSVLineRecordReader.DEFAULT_DELIMITER);
    conf.set(CSVLineRecordReader.FORMAT_SEPARATOR, CSVLineRecordReader.DEFAULT_SEPARATOR);
    conf.setBoolean(CSVLineRecordReader.IS_ZIPFILE, false);
    conf.setInt(CSVNLineInputFormat.LINES_PER_MAP, 40000);
    conf.setInputFormat(CSVNLineInputFormat.class);
}
From source file:contrail.stages.TestValidateGraph.java
License:Open Source License
@Test
public void testMapper() {
    ArrayList<MapperTestCase> test_cases = new ArrayList<MapperTestCase>();
    test_cases.add(createMapTest());

    ValidateGraphMapper mapper = new ValidateGraphMapper();
    JobConf job = new JobConf(ValidateGraphMapper.class);

    ReporterMock reporter_mock = new ReporterMock();
    Reporter reporter = reporter_mock;

    for (MapperTestCase test_case : test_cases) {
        job.setInt("K", test_case.K);
        mapper.configure(job);

        // We need a new collector for each invocation because the
        // collector stores the outputs of the mapper.
        AvroCollectorMock<Pair<CharSequence, ValidateMessage>> collector_mock =
                new AvroCollectorMock<Pair<CharSequence, ValidateMessage>>();
        try {
            mapper.map(test_case.input, collector_mock, reporter);
        } catch (IOException exception) {
            fail("IOException occured in map: " + exception.getMessage());
        }

        assertMapperOutput(test_case, collector_mock);
    }
}
From source file:edu.iu.benchmark.JobLauncher.java
License:Apache License
private Job configureBenchmarkJob(String cmd, int bytesPerPartition, int numPartitions, int numMappers,
        int numIterations, Path inputDirPath, Path outputDirPath) throws IOException, URISyntaxException {
    Job job = Job.getInstance(getConf(), "benchmark_job");
    FileInputFormat.setInputPaths(job, inputDirPath);
    FileOutputFormat.setOutputPath(job, outputDirPath);
    job.setInputFormatClass(SingleFileInputFormat.class);
    job.setJarByClass(JobLauncher.class);
    job.setMapperClass(BenchmarkMapper.class);
    org.apache.hadoop.mapred.JobConf jobConf = (JobConf) job.getConfiguration();
    jobConf.set("mapreduce.framework.name", "map-collective");
    jobConf.setNumMapTasks(numMappers);
    job.setNumReduceTasks(0);
    jobConf.set(Constants.BENCHMARK_CMD, cmd);
    jobConf.setInt(Constants.BYTES_PER_PARTITION, bytesPerPartition);
    jobConf.setInt(Constants.NUM_PARTITIONS, numPartitions);
    jobConf.setInt(Constants.NUM_MAPPERS, numMappers);
    jobConf.setInt(Constants.NUM_ITERATIONS, numIterations);
    return job;
}