List of usage examples for org.apache.hadoop.mapred JobConf setOutputFormat
public void setOutputFormat(Class<? extends OutputFormat> theClass)
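Before the project-specific examples below, here is a minimal, self-contained sketch of where setOutputFormat fits in a classic mapred job: it selects the OutputFormat implementation that writes the job's final key/value pairs. This sketch is not taken from any of the source files listed on this page; the job name and the /tmp paths are placeholders chosen for illustration, and the default identity mapper and reducer are used.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetOutputFormatExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SetOutputFormatExample.class);
        job.setJobName("set-output-format-example");

        // Input side: TextInputFormat produces <LongWritable offset, Text line> records,
        // which the default identity mapper and reducer pass through unchanged.
        job.setInputFormat(TextInputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Output side: setOutputFormat chooses how the final records are written;
        // TextOutputFormat writes them as tab-separated lines in part-* files.
        job.setOutputFormat(TextOutputFormat.class);

        // Placeholder paths for illustration only.
        FileInputFormat.setInputPaths(job, new Path("/tmp/example/in"));
        FileOutputFormat.setOutputPath(job, new Path("/tmp/example/out"));

        JobClient.runJob(job);
    }
}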
From source file:com.ricemap.spateDB.operations.RecordCount.java
License:Apache License
/**
 * Counts the exact number of lines in a file by issuing a MapReduce job
 * that does the counting.
 * @param fs
 * @param file
 * @return
 * @throws IOException
 */
public static long recordCountMapReduce(FileSystem fs, Path file) throws IOException {
    JobConf job = new JobConf(RecordCount.class);

    Path outputPath = new Path(file.toUri().getPath() + ".linecount");
    FileSystem outFs = outputPath.getFileSystem(job);
    outFs.delete(outputPath, true);

    job.setJobName("LineCount");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setCombinerClass(Reduce.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(1);

    job.setInputFormat(ShapeLineInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    ShapeLineInputFormat.setInputPaths(job, file);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    JobClient.runJob(job);

    // Read job result
    long lineCount = 0;
    FileStatus[] results = outFs.listStatus(outputPath);
    for (FileStatus fileStatus : results) {
        if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) {
            LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath()));
            Text text = new Text();
            if (lineReader.readLine(text) > 0) {
                lineCount = Long.parseLong(text.toString());
            }
            lineReader.close();
        }
    }

    outFs.delete(outputPath, true);
    return lineCount;
}
From source file:com.ricemap.spateDB.operations.Repartition.java
License:Apache License
/**
 * Repartitions an input file according to the given list of cells.
 * @param inFile
 * @param outPath
 * @param stockShape
 * @param blockSize
 * @param cellInfos
 * @param sindex
 * @param overwrite
 * @param columnar
 * @throws IOException
 */
public static void repartitionMapReduce(Path inFile, Path outPath, Shape stockShape, long blockSize,
        CellInfo[] cellInfos, String sindex, boolean overwrite, boolean columnar) throws IOException {
    JobConf job = new JobConf(Repartition.class);
    job.setJobName("Repartition");
    FileSystem outFs = outPath.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(outPath)) {
        if (overwrite)
            outFs.delete(outPath, true);
        else
            throw new RuntimeException(
                    "Output file '" + outPath + "' already exists and overwrite flag is not set");
    }

    // Decide which map function to use depending on the type of global index
    if (sindex.equals("rtree")) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    ShapeInputFormat.setInputPaths(job, inFile);
    job.setInputFormat(ShapeInputFormat.class);
    boolean pack = sindex.equals("r+tree");
    boolean expand = sindex.equals("rtree");
    job.setBoolean(SpatialSite.PACK_CELLS, pack);
    job.setBoolean(SpatialSite.EXPAND_CELLS, expand);
    job.setStrings(SpatialSite.STORAGE_MODE, columnar ? "columnar" : "normal");

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    // Set default parameters for reading input file
    SpatialSite.setShapeClass(job, stockShape.getClass());

    FileOutputFormat.setOutputPath(job, outPath);
    if (sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    // Copy block size from source file if it's globally indexed
    FileSystem inFs = inFile.getFileSystem(job);
    if (blockSize == 0) {
        GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
        if (globalIndex != null) {
            blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename))
                    .getBlockSize();
            LOG.info("Automatically setting block size to " + blockSize);
        }
    }
    if (blockSize != 0)
        job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blockSize);

    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    JobClient.runJob(job);
}
From source file:com.ricemap.spateDB.operations.Sampler.java
License:Apache License
/**
 * Sample a ratio of the file through a MapReduce job
 * @param fs
 * @param files
 * @param ratio
 * @param threshold - Maximum number of elements to be sampled
 * @param seed
 * @param output
 * @param inObj
 * @param outObj
 * @return
 * @throws IOException
 */
public static <T extends TextSerializable, O extends TextSerializable> int sampleMapReduceWithRatio(
        FileSystem fs, Path[] files, double ratio, long threshold, long seed, final ResultCollector<O> output,
        T inObj, O outObj) throws IOException {
    JobConf job = new JobConf(FileMBR.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(files[0].toUri().getPath() + ".sample_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("Sample");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setClass(InClass, inObj.getClass(), TextSerializable.class);
    job.setClass(OutClass, outObj.getClass(), TextSerializable.class);

    job.setMapperClass(Map.class);
    job.setLong(RANDOM_SEED, seed);
    job.setFloat(SAMPLE_RATIO, (float) ratio);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(0);

    job.setInputFormat(ShapeLineInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    ShapeLineInputFormat.setInputPaths(job, files);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    RunningJob run_job = JobClient.runJob(job);

    Counters counters = run_job.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    Counter inputBytesCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES);
    Sampler.sizeOfLastProcessedFile = inputBytesCounter.getValue();

    // Ratio of records to return from output based on the threshold
    // Note that any number greater than or equal to one will cause all
    // elements to be returned
    final double selectRatio = (double) threshold / resultCount;

    // Read job result
    int result_size = 0;
    if (output != null) {
        Text line = new Text();
        FileStatus[] results = outFs.listStatus(outputPath);

        for (FileStatus fileStatus : results) {
            if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) {
                LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath()));
                try {
                    while (lineReader.readLine(line) > 0) {
                        if (Math.random() < selectRatio) {
                            if (output != null) {
                                outObj.fromText(line);
                                output.collect(outObj);
                            }
                            result_size++;
                        }
                    }
                } catch (RuntimeException e) {
                    e.printStackTrace();
                }
                lineReader.close();
            }
        }
    }

    outFs.delete(outputPath, true);
    return result_size;
}
From source file:com.ricemap.spateDB.util.RandomSpatialGenerator.java
License:Apache License
public static void generateMapReduce(Path file, Prism mbr, long size, long blocksize, Shape shape,
        String sindex, long seed, int rectsize, RandomShapeGenerator.DistributionType type, boolean overwrite)
        throws IOException {
    JobConf job = new JobConf(RandomSpatialGenerator.class);

    job.setJobName("Generator");
    FileSystem outFs = file.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(file)) {
        if (overwrite)
            outFs.delete(file, true);
        else
            throw new RuntimeException(
                    "Output file '" + file + "' already exists and overwrite flag is not set");
    }

    // Set generation parameters in job
    job.setLong(RandomShapeGenerator.GenerationSize, size);
    SpatialSite.setPrism(job, RandomShapeGenerator.GenerationMBR, mbr);
    if (seed != 0)
        job.setLong(RandomShapeGenerator.GenerationSeed, seed);
    if (rectsize != 0)
        job.setInt(RandomShapeGenerator.GenerationRectSize, rectsize);
    if (type != null)
        job.set(RandomShapeGenerator.GenerationType, type.toString());

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();

    // Set input format and map class
    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Repartition.RepartitionMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(shape.getClass());
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    SpatialSite.setShapeClass(job, shape.getClass());

    if (blocksize != 0) {
        job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blocksize);
    }

    CellInfo[] cells;
    if (sindex == null) {
        cells = new CellInfo[] { new CellInfo(1, mbr) };
    } else if (sindex.equals("grid")) {
        GridInfo gridInfo = new GridInfo(mbr.t1, mbr.x1, mbr.y1, mbr.t2, mbr.x2, mbr.y2);
        FileSystem fs = file.getFileSystem(job);
        if (blocksize == 0) {
            blocksize = fs.getDefaultBlockSize(file);
        }
        int numOfCells = Repartition.calculateNumberOfPartitions(job, size, fs, file, blocksize);
        gridInfo.calculateCellDimensions(numOfCells);
        cells = gridInfo.getAllCells();
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cells);

    // Do not set a reduce function. Use the default identity reduce function
    if (cells.length == 1) {
        // All objects are in one partition. No need for a reduce phase
        job.setNumReduceTasks(0);
    } else {
        // More than one partition. Need a reduce phase to group shapes of the
        // same partition together
        job.setReducerClass(RepartitionReduce.class);
        job.setNumReduceTasks(
                Math.max(1, Math.min(cells.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, file);
    if (sindex == null || sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    JobClient.runJob(job);

    // Concatenate all master files into one file
    FileStatus[] resultFiles = outFs.listStatus(file, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("_master");
        }
    });
    String ext = resultFiles[0].getPath().getName()
            .substring(resultFiles[0].getPath().getName().lastIndexOf('.'));
    Path masterPath = new Path(file, "_master" + ext);
    OutputStream destOut = outFs.create(masterPath);
    byte[] buffer = new byte[4096];
    for (FileStatus f : resultFiles) {
        InputStream in = outFs.open(f.getPath());
        int bytes_read;
        do {
            bytes_read = in.read(buffer);
            if (bytes_read > 0)
                destOut.write(buffer, 0, bytes_read);
        } while (bytes_read > 0);
        in.close();
        outFs.delete(f.getPath(), false);
    }
    destOut.close();

    // Plot an image for the partitions used in file
    Path imagePath = new Path(file, "_partitions.png");
    int imageSize = (int) (Math.sqrt(cells.length) * 300);
    Plot.plotLocal(masterPath, imagePath, new Partition(), imageSize, imageSize, Color.BLACK, false, false,
            false);
}
From source file:com.scaleoutsoftware.soss.hserver.Test_MapToMapCopyMapred.java
License:Apache License
public int run(String[] args) throws Exception {
    final NamedMap<IntWritable, Text> inputMap = NamedMapFactory.getMap("mapr-i",
            new WritableSerializer(IntWritable.class), new WritableSerializer(Text.class));
    final NamedMap<IntWritable, Text> outputMap = NamedMapFactory.getMap("mapr-o",
            new WritableSerializer(IntWritable.class), new WritableSerializer(Text.class));
    inputMap.clear();
    outputMap.clear();
    Thread.sleep(15000);
    BulkLoader<IntWritable, Text> put = inputMap.getBulkLoader();
    String content = "xcccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
    Text contentW = new Text(content);
    IntWritable count = new IntWritable();
    int expectedSize = 10000;
    for (int i = 0; i < expectedSize; i++) {
        count.set(i);
        put.put(count, contentW);
    }
    put.close();

    InvocationGrid grid = HServerJob.getInvocationGridBuilder("MyGrid" + System.currentTimeMillis())
            .addClass(Test_MapToMapCopyMapred.class).load();

    JobConf configuration = new JobConf(getConf(), Test_MapToMapCopyMapred.class);
    configuration.setInt("mapred.hserver.setting.reducer.usememorymappedfiles", 0);
    configuration.setMapOutputKeyClass(IntWritable.class);
    configuration.setMapOutputValueClass(Text.class);
    configuration.setOutputKeyClass(IntWritable.class);
    configuration.setOutputValueClass(Text.class);
    configuration.setInputFormat(NamedMapInputFormatMapred.class);
    configuration.setOutputFormat(NamedMapOutputFormatMapred.class);
    NamedMapInputFormatMapred.setNamedMap(configuration, inputMap);
    NamedMapOutputFormatMapred.setNamedMap(configuration, outputMap);

    assertEquals(inputMap.size(), outputMap.size() + expectedSize); // should be 0 + expected

    HServerJobClient.runJob(configuration, false, grid);

    assertEquals(inputMap.size(), outputMap.size());

    inputMap.clear();
    outputMap.clear();
    grid.unload();
    return 1;
}
From source file:com.spotify.hdfs2cass.BulkLoader.java
License:Apache License
public int run(String[] args) throws Exception {
    CommandLine cmdLine = parseOptions(args);

    String[] inputPaths = cmdLine.getOptionValues('i');
    String seedNodeHost = cmdLine.getOptionValue('h');
    String seedNodePort = cmdLine.getOptionValue('p', "9160");
    String keyspace = cmdLine.getOptionValue('k');
    String colfamily = cmdLine.getOptionValue('c');
    int mappers = Integer.parseInt(cmdLine.getOptionValue('m', "0"));
    Integer copiers = Integer.parseInt(cmdLine.getOptionValue('P', "0"));
    String poolName = cmdLine.getOptionValue("pool");

    ClusterInfo clusterInfo = new ClusterInfo(seedNodeHost, seedNodePort);
    clusterInfo.init(keyspace);

    final String partitionerClass = clusterInfo.getPartitionerClass();
    final int reducers = adjustReducers(Integer.parseInt(cmdLine.getOptionValue('r', "0")),
            clusterInfo.getNumClusterNodes());

    Configuration conf = new Configuration();
    ConfigHelper.setOutputColumnFamily(conf, keyspace, colfamily);
    ConfigHelper.setOutputInitialAddress(conf, seedNodeHost);
    ConfigHelper.setOutputRpcPort(conf, seedNodePort);
    ConfigHelper.setOutputPartitioner(conf, partitionerClass);

    if (cmdLine.hasOption('s')) {
        conf.set("mapreduce.output.bulkoutputformat.buffersize", cmdLine.getOptionValue('s', "32"));
    }

    if (cmdLine.hasOption('M')) {
        conf.set("mapreduce.output.bulkoutputformat.streamthrottlembits", cmdLine.getOptionValue('M'));
    }

    if (cmdLine.hasOption('C')) {
        ConfigHelper.setOutputCompressionClass(conf, cmdLine.getOptionValue('C'));
    }

    if (cmdLine.hasOption('b')) {
        conf.setBoolean("com.spotify.hdfs2cass.base64", true);
    }

    JobConf job = new JobConf(conf);

    if (mappers > 0)
        job.setNumMapTasks(mappers);
    if (reducers > 0)
        job.setNumReduceTasks(reducers);
    if (copiers > 0)
        job.set("mapred.reduce.parallel.copies", copiers.toString());

    if (poolName != null)
        job.set("mapred.fairscheduler.pool", poolName);

    // set the nodes as a param for the other hadoop nodes
    clusterInfo.setConf(job);

    String jobName = "bulkloader-hdfs-to-cassandra";
    if (cmdLine.hasOption('n'))
        jobName += "-" + cmdLine.getOptionValue('n');
    job.setJobName(jobName);
    job.setJarByClass(BulkLoader.class);

    job.setInputFormat(AvroAsTextInputFormat.class);

    for (String inputPath : inputPaths) {
        FileInputFormat.addInputPath(job, new Path(inputPath));
    }

    // map just outputs text, reduce sends to cassandra
    job.setMapperClass(MapToText.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setPartitionerClass(CassandraPartitioner.class);
    job.setReducerClass(ReduceTextToCassandra.class);
    job.setOutputKeyClass(ByteBuffer.class);
    job.setOutputValueClass(List.class);

    if (cmdLine.hasOption('s'))
        job.setOutputFormat(BulkOutputFormat.class);
    else
        job.setOutputFormat(ColumnFamilyOutputFormat.class);

    JobClient.runJob(job);
    return 0;
}
From source file:com.TCG.Nutch_DNS.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:com.TCG.Nutch_DNS.HostDb.java
License:Apache License
public static JobConf createJob(Configuration config, Path crawlDb) throws IOException {
    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("crawldb " + crawlDb);

    Path current = new Path(crawlDb, CURRENT_NAME);
    if (FileSystem.get(job).exists(current)) {
        FileInputFormat.addInputPath(job, current);
    }

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(HostDbFilter.class);
    job.setReducerClass(HostDbReducer.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    // https://issues.apache.org/jira/browse/NUTCH-1110
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    return job;
}
From source file:com.TCG.Nutch_DNS.Injector.java
License:Apache License
public void inject(Path hostDb, Path crawlDb) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: starting at " + sdf.format(start));
        LOG.info("Injector: hostDb: " + hostDb);
        LOG.info("Injector: crawlDb: " + crawlDb);
    }

    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: Converting injected host to host db entries.");
    }

    FileSystem fs = FileSystem.get(getConf());
    // determine if the crawldb already exists
    boolean dbExists = fs.exists(hostDb);

    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + hostDb);
    FileInputFormat.addInputPath(sortJob, crawlDb);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    if (dbExists) {
        sortJob.setOutputFormat(SequenceFileOutputFormat.class);
        // HostReducer, host
        sortJob.setReducerClass(ExitHostReducer.class);
    } else {
        sortJob.setOutputFormat(MapFileOutputFormat.class);
        // HostReducer, host
        sortJob.setReducerClass(NotExitHostReducer.class);
        sortJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
    }
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());

    RunningJob mapJob = null;
    try {
        mapJob = JobClient.runJob(sortJob);
    } catch (IOException e) {
        fs.delete(tempDir, true);
        throw e;
    }

    if (dbExists) {
        // merge with existing host db
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Merging injected hostDb into old hostDb.");
        }
        JobConf mergeJob = HostDb.createJob(getConf(), hostDb);
        FileInputFormat.addInputPath(mergeJob, tempDir);
        // HostDb.createJob sets the reducer to HostDbReducer; override it here
        mergeJob.setReducerClass(InjectReducer.class);
        try {
            RunningJob merge = JobClient.runJob(mergeJob);
        } catch (IOException e) {
            fs.delete(tempDir, true);
            throw e;
        }
        HostDb.install(mergeJob, hostDb);
    } else {
        HostDb.install(sortJob, hostDb);
    }

    // clean up
    fs.delete(tempDir, true);
    long end = System.currentTimeMillis();
    LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:com.test.hadoop.JhhSort.java
License:Apache License
/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *           When there is communication problems with the job tracker.
 */
@SuppressWarnings({ "rawtypes" })
public int run(String[] args) throws Exception {

    JobConf jobConf = new JobConf(getConf(), JhhSort.class);
    jobConf.setJobName("sorter");
    jobConf.set("mapred.job.tracker", "192.168.12.200:9001");
    jobConf.set("fs.default.name", "hdfs://192.168.12.200:9000");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.5);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = TextInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = TextOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = LongWritable.class;
    Class<? extends Writable> outputValueClass = LongWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}