List of usage examples for org.apache.hadoop.mapred.JobConf.get
public String get(String name)
Parameter: name - the name of the property to look up.
Returns: the value of the name property, or null if no such property exists.
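Before the collected examples, a minimal sketch of the lookup pattern they all rely on: call get(name) and guard against the null returned when the property is unset. The property name my.example.setting, the fallback string, and the class name JobConfGetExample are made-up placeholders for illustration, not keys or types from the projects below.

import org.apache.hadoop.mapred.JobConf;

public class JobConfGetExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // get(name) returns the raw String value, or null when the key has not been set
        String value = conf.get("my.example.setting");
        if (value == null) {
            value = "fallback-value"; // caller decides what a missing property means
        }
        System.out.println("my.example.setting = " + value);

        // the two-argument overload folds the null check into the lookup
        System.out.println(conf.get("my.example.setting", "fallback-value"));
    }
}

Most of the examples below use the single-argument form, treating a null return as "not configured" and either bailing out early or substituting a default.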
From source file:com.taobao.data.hive.hook.automapjoin.AutoMapJoinPreJobHook.java
License:Apache License
@Override
public void run(SessionState session, QueryPlan queryPlan, JobConf job, Integer taskId) {
    if (!job.getBoolean(HiveConf.ConfVars.HIVECONVERTJOIN.varname, false)
            || null == job.get(AUTO_JOIN_STATICS_DIR) || null == queryPlan) {
        return;
    }
    List<Task<? extends Serializable>> rootTasks = queryPlan.getRootTasks();
    String taskIdStr = "Stage-" + taskId.toString();
    dfsTaskTag(rootTasks, taskIdStr); // dfs, fill the member variable 'taskTag'
    StringBuilder sb = new StringBuilder();
    sb.append(queryPlan.getQueryId());
    sb.append('\u0001');
    sb.append(taskIdStr);
    sb.append('\u0001');
    switch (taskTag) {
    case Task.NO_TAG:
        return;
    case Task.BACKUP_COMMON_JOIN:
        sb.append("Backup Common Join");
        break;
    case Task.COMMON_JOIN:
        sb.append("Common Join");
        break;
    case Task.CONVERTED_LOCAL_MAPJOIN:
        sb.append("Converted Local Map Join");
        break;
    case Task.CONVERTED_MAPJOIN:
        sb.append("Converted Map Join");
        break;
    case Task.LOCAL_MAPJOIN:
        sb.append("Local Map Join");
        break;
    }
    String sessionId = session.getSessionId();
    File dir = new File(job.get(AUTO_JOIN_STATICS_DIR));
    if (!dir.exists()) {
        dir.mkdir();
    }
    File file = new File(dir, sessionId + ".stat");
    synchronized (lock) {
        FileOutputStream fout = null;
        PrintStream pout = null;
        try {
            if (!file.exists()) {
                file.createNewFile();
            }
            fout = new FileOutputStream(file, true);
            pout = new PrintStream(fout);
            pout.println(sb.toString());
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (null != pout) {
                pout.close();
            }
        }
    }
    // reset dfs results
    taskTag = Task.NO_TAG;
    found = false;
}
From source file:com.TCG.Nutch_DNS.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);

        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:com.test.hadoop.JhhSort.java
License:Apache License
/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *           When there is communication problems with the job tracker.
 */
@SuppressWarnings({ "rawtypes" })
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf(), JhhSort.class);
    jobConf.setJobName("sorter");
    jobConf.set("mapred.job.tracker", "192.168.12.200:9001");
    jobConf.set("fs.default.name", "hdfs://192.168.12.200:9000");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.5);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = TextInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = TextOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = LongWritable.class;
    Class<? extends Writable> outputValueClass = LongWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);
    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);
    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file:com.test.hadoop.JhhSum.java
License:Apache License
@SuppressWarnings({ "rawtypes" })
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf(), JhhSum.class);
    jobConf.setJobName("sum");
    jobConf.set("mapred.job.tracker", "192.168.12.200:9001");
    jobConf.set("fs.default.name", "hdfs://192.168.12.200:9000");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(LongSumReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.5);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = JhhInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = TextOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = Text.class;
    Class<? extends Writable> outputValueClass = LongWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        otherArgs.add(args[i]);
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);
    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);
    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file:com.tomslabs.grid.avro.JSONTextToAvroRecordReducer.java
License:Apache License
public void configure(JobConf job) {
    this.job = job;
    // read the Avro output schema JSON stored by the job driver under AvroJob.OUTPUT_SCHEMA
    this.schema = Schema.parse(job.get(AvroJob.OUTPUT_SCHEMA));
}
From source file:com.toshiba.mwcloud.gs.hadoop.mapred.GSRowRecordWriter.java
License:Apache License
/**
 * Constructor
 *
 * @param conf Configuration object
 * @throws IOException an exception occurred in GridDB
 */
public GSRowRecordWriter(JobConf conf) throws IOException {
    TaskAttemptContext context = new TaskAttemptContextImpl(conf,
            TaskAttemptID.forName(conf.get("mapred.task.id")));
    writer_ = new GDRecordWriter(context);
}
From source file:com.tuplejump.calliope.hadoop.ColumnFamilyInputFormat.java
License:Apache License
public org.apache.hadoop.mapred.RecordReader<ByteBuffer, SortedMap<ByteBuffer, Column>> getRecordReader(
        org.apache.hadoop.mapred.InputSplit split, JobConf jobConf, final Reporter reporter) throws IOException {
    // rebuild a mapreduce task attempt context from the task id stored in the job configuration
    TaskAttemptContext tac = HadoopCompat.newMapContext(jobConf,
            TaskAttemptID.forName(jobConf.get(MAPRED_TASK_ID)), null, null, null,
            new ReporterWrapper(reporter), null);

    ColumnFamilyRecordReader recordReader = new ColumnFamilyRecordReader(
            jobConf.getInt(CASSANDRA_HADOOP_MAX_KEY_SIZE, CASSANDRA_HADOOP_MAX_KEY_SIZE_DEFAULT));
    recordReader.initialize((org.apache.hadoop.mapreduce.InputSplit) split, tac);

    return recordReader;
}
From source file:com.tuplejump.calliope.hadoop.cql3.CqlInputFormat.java
License:Apache License
public RecordReader<Long, Row> getRecordReader(org.apache.hadoop.mapred.InputSplit split, JobConf jobConf,
        final Reporter reporter) throws IOException {
    TaskAttemptContext tac = HadoopCompat.newMapContext(jobConf,
            TaskAttemptID.forName(jobConf.get(MAPRED_TASK_ID)), null, null, null,
            new ReporterWrapper(reporter), null);

    CqlRecordReader recordReader = new CqlRecordReader();
    recordReader.initialize((org.apache.hadoop.mapreduce.InputSplit) split, tac);

    return recordReader;
}
From source file:com.tuplejump.calliope.hadoop.cql3.CqlPagingInputFormat.java
License:Apache License
public RecordReader<Map<String, ByteBuffer>, Map<String, ByteBuffer>> getRecordReader(InputSplit split,
        JobConf jobConf, final Reporter reporter) throws IOException {
    TaskAttemptContext tac = HadoopCompat.newMapContext(jobConf,
            TaskAttemptID.forName(jobConf.get(MAPRED_TASK_ID)), null, null, null,
            new ReporterWrapper(reporter), null);

    CqlPagingRecordReader recordReader = new CqlPagingRecordReader();
    recordReader.initialize((org.apache.hadoop.mapreduce.InputSplit) split, tac);

    return recordReader;
}
From source file:com.twitter.maple.hbase.mapred.TableInputFormat.java
License:Apache License
public void configure(JobConf job) {
    String tableName = TableInputFormat.getTableName(job);
    // COLUMN_LIST holds a space-separated list of column names set by the job driver
    String colArg = job.get(COLUMN_LIST);
    String[] colNames = colArg.split(" ");
    byte[][] m_cols = new byte[colNames.length][];
    for (int i = 0; i < m_cols.length; i++) {
        m_cols[i] = Bytes.toBytes(colNames[i]);
    }
    setInputColumns(m_cols);
    try {
        setHTable(new HTable(HBaseConfiguration.create(job), tableName));
    } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
    }
}