Usage examples for org.apache.hadoop.mapred.JobConf.get(String name)
public String get(String name)
Gets the value of the name property, or null if no such property exists.
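As a minimal sketch (the property keys below are placeholders invented for illustration, not keys used by the examples that follow), get(String) returns null for an unset key, while the two-argument overload get(String, String) substitutes a default:

import org.apache.hadoop.mapred.JobConf;

public class JobConfGetSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        conf.set("example.key", "value");                            // placeholder key set just for this demo
        System.out.println(conf.get("example.key"));                 // prints: value
        System.out.println(conf.get("example.missing"));             // prints: null (property was never set)
        System.out.println(conf.get("example.missing", "fallback")); // two-argument overload returns the default
    }
}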
From source file:kafka.etl.impl.SimpleKafkaETLJob.java
License:Apache License
public void execute() throws Exception {
    JobConf conf = createJobConf();
    RunningJob runningJob = new JobClient(conf).submitJob(conf);
    String id = runningJob.getJobID();
    System.out.println("Hadoop job id=" + id);
    runningJob.waitForCompletion();
    if (!runningJob.isSuccessful())
        throw new Exception("Hadoop ETL job failed! Please check status on http://"
                + conf.get("mapred.job.tracker") + "/jobdetails.jsp?jobid=" + id);
}
From source file:me.tingri.graphs.gimv.JoinMapper.java
License:Apache License
public void configure(JobConf conf) {
    fieldSeparator = conf.get(CONSTANTS.FIELD_SEPARATOR);
    vectorIndicator = conf.get(CONSTANTS.VECTOR_INDICATOR);
    makeSymmetric = (FLAGS.YES.getValue() == Integer.parseInt(conf.get(CONSTANTS.MAKE_SYMMETRIC)))
            ? FLAGS.YES : FLAGS.NO;
}
From source file:me.tingri.graphs.gimv.JoinReducer.java
License:Apache License
public void configure(JobConf conf) {
    vectorIndicator = conf.get(CONSTANTS.VECTOR_INDICATOR);
}
From source file:me.tingri.graphs.gimv.VectorGeneratorMapper.java
License:Apache License
public void configure(JobConf conf) {
    fieldSeparator = conf.get(CONSTANTS.FIELD_SEPARATOR);
    makeSymmetric = (FLAGS.YES.getValue() == Integer.parseInt(conf.get(CONSTANTS.MAKE_SYMMETRIC)))
            ? FLAGS.YES : FLAGS.NO;
}
From source file:microbench.TeraSortOnHDFSDataLocal.java
License:Apache License
public static void main(String[] args)
        throws IOException, InterruptedException, URISyntaxException, MPIException {
    try {
        if (!TaskAttemptContext.class.isInterface()) {
            throw new IOException("Currently TeraSort benchmark is supported under Hadoop-2.x runtime");
        }
        parseArgs(args);
        HashMap<String, String> conf = new HashMap<String, String>();
        initConf(conf);
        JobConf jobConf = new JobConf(confPath);
        conf.put(FS_DEFALUT_NAME, jobConf.get(FS_DEFALUT_NAME));
        MPI_D.Init(args, MPI_D.Mode.Common, conf);

        if (MPI_D.COMM_BIPARTITE_O != null) {
            // O communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
            if (rank == 0) {
                System.out.println(TeraSortOnHDFSDataLocal.class.getSimpleName() + " O start.");
                DataMPIUtil.printArgs(args);
            }
            HadoopReader<Text, Text> reader = HadoopIOUtil.getReader(jobConf, inDir,
                    TeraInputFormat.class, rank, MPI_D.COMM_BIPARTITE_O);
            Text khead = reader.createKey();
            Text vhead = reader.createValue();
            while (reader.next(khead, vhead)) {
                // send key-value
                MPI_D.Send(khead, vhead);
            }
            reader.close();
        } else if (MPI_D.COMM_BIPARTITE_A != null) {
            // A communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
            if (rank == 0) {
                System.out.println(TeraSortOnHDFSDataLocal.class.getSimpleName() + " A start.");
            }
            HadoopWriter<Text, Text> outrw = HadoopIOUtil.getNewWriter(jobConf, outDir, Text.class,
                    Text.class, TeraOutputFormat.class, null, rank, MPI_D.COMM_BIPARTITE_A);
            // recv key-value.
            Object[] keyValue = MPI_D.Recv();
            while (keyValue != null) {
                outrw.write((Text) keyValue[0], (Text) keyValue[1]);
                keyValue = MPI_D.Recv();
            }
            outrw.close();
        }
        MPI_D.Finalize();
    } catch (MPI_D_Exception e) {
        e.printStackTrace();
    }
}
From source file:net.darkseraphim.webanalytics.hadoop.csv.CSVTextInputFormat.java
License:Apache License
@Override
public RecordReader<LongWritable, List<Text>> getRecordReader(InputSplit split, JobConf conf,
        Reporter reporter) throws IOException {
    String quote = conf.get(CSVLineRecordReader.FORMAT_DELIMITER);
    String separator = conf.get(CSVLineRecordReader.FORMAT_SEPARATOR);
    conf.set(CSVLineRecordReader.FORMAT_DELIMITER, CSVLineRecordReader.DEFAULT_DELIMITER);
    conf.set(CSVLineRecordReader.FORMAT_SEPARATOR, CSVLineRecordReader.DEFAULT_SEPARATOR);
    conf.setBoolean(CSVLineRecordReader.IS_ZIPFILE, false);
    System.out.println("[LOG] Created reader");
    if (split instanceof FileSplit) {
        return reader = new CSVLineRecordReader(split, conf);
    }
    throw new UnsupportedOperationException("Only FileSplits are supported");
}
From source file:net.iponweb.hadoop.streaming.avro.AvroAsJsonOutputFormat.java
License:Apache License
@Override
public RecordWriter<Text, NullWritable> getRecordWriter(FileSystem ignore, JobConf job, String name,
        Progressable prog) throws IOException {
    Schema schema;
    Schema.Parser p = new Schema.Parser();
    String strSchema = job.get("iow.streaming.output.schema");

    if (strSchema == null) {
        String schemaFile = job.get("iow.streaming.output.schema.file", "streaming_output_schema");
        if (job.getBoolean("iow.streaming.schema.use.prefix", false)) {
            // guess schema from file name
            // format is: schema:filename
            // with special keyword default - 'default:filename'
            String str[] = name.split(":");
            if (!str[0].equals("default"))
                schemaFile = str[0];
            name = str[1];
        }
        LOG.info(this.getClass().getSimpleName() + ": Using schema from file: " + schemaFile);
        File f = new File(schemaFile);
        schema = p.parse(f);
    } else {
        LOG.info(this.getClass().getSimpleName() + ": Using schema from jobconf.");
        schema = p.parse(strSchema);
    }

    if (schema == null) {
        throw new IOException("Can't find proper output schema");
    }

    DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(
            new GenericDatumWriter<GenericRecord>());
    configureDataFileWriter(writer, job);

    Path path = FileOutputFormat.getTaskOutputPath(job, name + org.apache.avro.mapred.AvroOutputFormat.EXT);
    writer.create(schema, path.getFileSystem(job).create(path));

    return createRecordWriter(writer, schema);
}
From source file:net.iponweb.hadoop.streaming.parquet.ParquetAsTextOutputFormat.java
License:Apache License
public RecordWriter<Text, Text> getRecordWriter(FileSystem fs, JobConf job, String name,
        Progressable progress) throws IOException {
    // find and load schema
    String writeSchema = job.get("iow.streaming.output.schema");
    MessageType s;

    if (writeSchema == null) {
        String schemaFile = job.get("iow.streaming.output.schema.file", "streaming_output_schema");
        if (job.getBoolean("iow.streaming.schema.use.prefix", false)) {
            // guess schema from file name
            // format is: schema:filename
            // with special keyword default - 'default:filename'
            String str[] = name.split(":");
            if (!str[0].equals("default"))
                schemaFile = str[0];
            name = str[1];
        }
        LOG.info("Using schema: " + schemaFile);
        File f = new File(schemaFile);
        try {
            BufferedReader reader = new BufferedReader(new FileReader(f));
            StringBuilder r = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null)
                r.append(line);
            writeSchema = r.toString();
        } catch (Throwable e) {
            LOG.error("Can't read schema file " + schemaFile);
            Throwables.propagateIfPossible(e, IOException.class);
            throw new RuntimeException(e);
        }
    }

    s = MessageTypeParser.parseMessageType(writeSchema);
    setWriteSupportClass(job, GroupWriteSupport.class);
    GroupWriteSupport.setSchema(s, job);

    CompressionCodecName codec = getCodec(job);
    String extension = codec.getExtension() + ".parquet";
    Path file = getDefaultWorkFile(job, name, extension);

    ParquetRecordWriter<SimpleGroup> realWriter;
    try {
        realWriter = (ParquetRecordWriter<SimpleGroup>) realOutputFormat.getRecordWriter(job, file, codec);
    } catch (InterruptedException e) {
        Thread.interrupted();
        throw new IOException(e);
    }

    return createRecordWriter(realWriter, fs, job, name, progress);
}
From source file:net.peacesoft.nutch.crawl.ReFetcher.java
License:Apache License
public void configure(JobConf job) {
    setConf(job);
    this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
    this.storingContent = isStoringContent(job);
    this.parsing = isParsing(job);
}
From source file:net.peacesoft.nutch.crawl.ReGenerator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or
 * not is read from the crawl.generate.filter property in the configuration
 * files. If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    try {
        Path tempDir = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, force);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ReGenerator: starting at " + sdf.format(start));
        LOG.info("ReGenerator: Selecting best-scoring urls due for fetch.");
        LOG.info("ReGenerator: filtering: " + filter);
        LOG.info("ReGenerator: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("ReGenerator: topN: " + topN);
        }

        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info("ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        // map to inverted subset due for fetch, sort by score
        JobConf job = new NutchJob(getConf());
        job.setJobName("generate: select from " + dbDir);

        if (numLists == -1) { // for politeness make
            numLists = job.getNumMapTasks(); // a partition per fetch task
        }
        if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
            // override
            LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        job.setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        job.setLong(GENERATOR_TOP_N, topN);
        job.setBoolean(GENERATOR_FILTER, filter);
        job.setBoolean(GENERATOR_NORMALISE, norm);
        job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);

        job.setMapperClass(Selector.class);
        job.setPartitionerClass(Selector.class);
        job.setReducerClass(Selector.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(FloatWritable.class);
        job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        job.setOutputValueClass(SelectorEntry.class);
        job.setOutputFormat(GeneratorOutputFormat.class);

        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            throw e;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            LOG.warn("ReGenerator: exception while partitioning segments, exiting ...");
            fs.delete(tempDir, true);
            return null;
        }

        if (generatedSegments.size() == 0) {
            LOG.warn("ReGenerator: 0 records selected for fetching, exiting ...");
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            Path tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = new NutchJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbUpdater.class);
            job.setReducerClass(CrawlDbUpdater.class);
            job.setOutputFormat(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                JobClient.runJob(job);
                CrawlDb.install(job, dbDir);
            } catch (IOException e) {
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir, true);
                fs.delete(tempDir2, true);
                throw e;
            }
            fs.delete(tempDir2, true);
        }

        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));

        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    } catch (Exception ex) {
        LOG.error("ReGenerator generate error: " + ex.toString(), ex);
        return null;
    }
}