List of usage examples for org.apache.hadoop.mapred JobConf set
public void set(String name, String value)
Set the value of the name property.
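Before the collected examples, here is a minimal, self-contained sketch of the call itself; the property name "example.greeting" is only a placeholder and is not taken from any of the source files below.

import org.apache.hadoop.mapred.JobConf;

// Minimal sketch: store a custom string property in the job configuration
// and read it back. "example.greeting" is an arbitrary placeholder name.
public class JobConfSetExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        conf.set("example.greeting", "hello");          // set(name, value)
        String greeting = conf.get("example.greeting"); // returns "hello"
        System.out.println(greeting);
    }
}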
From source file:com.ibm.jaql.io.hbase.TableOutputConfigurator.java
License:Apache License
public void setParallel(JobConf conf) throws Exception {
    conf.set(TableOutputFormat.OUTPUT_TABLE, location);
    conf.setOutputKeyClass(JsonHolderDefault.class);
    conf.setOutputValueClass(JsonHolderDefault.class);
    HadoopSerializationDefault.register(conf);
    conf.setOutputKeyComparatorClass(DefaultJsonComparator.class);
}
From source file:com.ibm.jaql.lang.expr.core.RegisterExceptionHandler.java
License:Apache License
public static void writeConf(String name, JobConf conf) throws Exception {
    ThresholdExceptionHandler handler = (ThresholdExceptionHandler) JaqlUtil.getExceptionHandler();
    BufferedJsonRecord r = new BufferedJsonRecord();
    r.add(ERROR_THRESH_FIELD_NAME, new JsonLong(handler.getMaxExceptions()));
    String s = JsonUtil.printToString(r);
    conf.set(name, s);
}
From source file:com.ibm.jaql.lang.expr.system.RJaqlInterface.java
License:Apache License
/**
 * This method provides the functionality of saving simple R objects into HDFS in one of
 * the formats supported by Jaql so that it can be directly read into Jaql.
 * @param localPath
 * @param hdfsPath
 * @param schemaString
 * @param format
 * @param header
 * @param vector
 * @return
 */
public boolean jaqlSave(String localPath, String hdfsPath, String schemaString, String format, boolean header,
        boolean vector) {
    if (format.equalsIgnoreCase(FORMAT_DELIM)) {
        LOG.info("Format: " + FORMAT_DELIM + ", saving to HDFS loc: " + hdfsPath);
        return RUtil.saveToHDFS(localPath, hdfsPath);
    }
    try {
        JobConf conf = new JobConf();
        int DEFAULT_BUFFER_SIZE = 64 * 1024;
        int bufferSize = conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        BufferedReader reader = new BufferedReader(new FileReader(localPath), bufferSize);
        LongWritable key = new LongWritable(0);
        long count = 0;
        Text value = new Text();
        BufferedJsonRecord options = new BufferedJsonRecord(2);
        BufferedJsonArray headerArray = null;
        if (header) {
            String headerString = reader.readLine();
            String[] headers = splitPattern.split(headerString);
            headerArray = new BufferedJsonArray(headers.length);
            for (int i = 0; i < headers.length; i++) {
                headerArray.set(i, new JsonString(StringUtils.strip(headers[i], "\"")));
            }
            count++;
        }
        Schema schema = null;
        if (schemaString != null) {
            schema = SchemaFactory.parse(schemaString);
        }
        if (headerArray != null) {
            RecordSchema recordSchema = (RecordSchema) schema;
            // construct new matching schema
            List<Field> fields = new LinkedList<Field>();
            for (JsonValue fieldName : headerArray) {
                Field field;
                if (recordSchema == null) {
                    field = new Field((JsonString) fieldName, SchemaFactory.stringSchema(), false);
                } else {
                    field = recordSchema.getField((JsonString) fieldName);
                    if (field == null)
                        throw new NullPointerException("header field not in schema: " + fieldName);
                    // FIXME: schema fields that are not in the header are currently considered OK
                }
                fields.add(field);
            }
            // and set it
            schema = new RecordSchema(fields, null);
        }
        if (schema != null)
            options.add(DelOptionParser.SCHEMA_NAME, new JsonSchema(schema));
        KeyValueImport<LongWritable, Text> converter = null;
        if (vector) {
            converter = new FromLinesConverter();
        } else {
            converter = new FromDelConverter();
        }
        LOG.info("Initializing Converter with options: " + options);
        converter.init(options);
        Schema tmpSchema = converter.getSchema();
        tmpSchema = SchemaTransformation.removeNullability(tmpSchema);
        if (!tmpSchema.is(JsonType.ARRAY, JsonType.RECORD, JsonType.BOOLEAN, JsonType.DECFLOAT, JsonType.DOUBLE,
                JsonType.LONG, JsonType.STRING).always()) {
            throw new IOException("Unrecognized schema type: " + schema.getSchemaType());
        }
        JsonValue outValue = converter.createTarget();
        JsonHolder outKeyHolder;
        JsonHolder outValueHolder;
        if (format.equalsIgnoreCase(FORMAT_DEFAULT)) {
            HadoopSerializationDefault.register(conf);
            outKeyHolder = new JsonHolderDefault();
            outValueHolder = new JsonHolderDefault(outValue);
            LOG.info("Registered serializer for Default format.");
        } else if (format.equalsIgnoreCase(FORMAT_TEMP)) {
            // TODO: There should be a better way of doing this. HadoopSerializationTemp
            // now does it in an ugly way.
            BufferedJsonRecord tmpOptions = new BufferedJsonRecord();
            BufferedJsonRecord outOptions = new BufferedJsonRecord();
            outOptions.add(new JsonString("schema"), new JsonSchema(schema));
            tmpOptions.add(new JsonString("options"), outOptions);
            conf.set(ConfSetter.CONFOUTOPTIONS_NAME, tmpOptions.toString());
            HadoopSerializationTemp.register(conf);
            outKeyHolder = new JsonHolderTempKey(null);
            outValueHolder = new JsonHolderTempValue();
            LOG.info("Registered serializer for HadoopTemp format.");
        } else {
            throw new IOException("Unrecognized serialization format requested: " + format);
        }
        FileSystem fs = FileSystem.get(conf);
        Path outputPath = new Path(hdfsPath);
        Writer writer = SequenceFile.createWriter(fs, conf, outputPath, outKeyHolder.getClass(),
                outValueHolder.getClass());
        String line;
        while ((line = reader.readLine()) != null) {
            key.set(count++);
            value.set(line);
            outValue = converter.convert(key, value, outValue);
            outValueHolder.value = outValue;
            writer.append(outKeyHolder, outValueHolder);
        }
        LOG.info("Transferred " + count + " line(s).");
        reader.close();
        writer.close();
    } catch (IOException e) {
        LOG.info("Error in saving object.", e);
        return false;
    }
    return true;
}
From source file:com.intel.hadoop.graphbuilder.demoapps.wikipedia.WikiPageInputFormat.java
License:Open Source License
@Override
public RecordReader<LongWritable, Text> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
        throws IOException {
    conf.set(XMLInputFormat.START_TAG_KEY, START_TAG);
    conf.set(XMLInputFormat.END_TAG_KEY, END_TAG);
    return new XMLRecordReader((FileSplit) split, conf);
}
From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.HashIdMR.java
License:Open Source License
/**
 * @param inputpath
 *            the path to a unique vertex list. Each line is parsed into (vid,
 *            data) using {@code vidparser} and {@code vdataparser}.
 * @param outputpath
 *            the path of output directory.
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(HashIdMR.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setMapperClass(HashIdMapper.class);
    conf.setReducerClass(HashIdReducer.class);
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputFormat(MultiDirOutputFormat.class);
    conf.setInt("mapred.line.input.format.linespermap", linespermap);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("VdataParser", vdataparser.getClass().getName());
    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));
    LOG.info("====== Job: Create integer Id maps for vertices ==========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.debug("Lines per map = 6000000");
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("VdataParser = " + vdataparser.getClass().getName());
    LOG.info("==========================================================");
    JobClient.runJob(conf);
    LOG.info("=======================Done =====================\n");
}
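The class names stored above with conf.set("GraphParser", ...) and friends are typically recovered inside the mapper's configure(JobConf) method and instantiated via reflection. The following is only an illustrative counterpart, not taken from the graphbuilder sources; the fields are typed as Object so the sketch stays self-contained.

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

// Illustrative sketch only: shows how a task can read back the parser class
// names that HashIdMR.run stored as plain strings via conf.set(...).
public class ParserConfigSketch extends MapReduceBase {
    private Object graphparser;
    private Object vidparser;
    private Object vdataparser;

    @Override
    public void configure(JobConf conf) {
        try {
            // conf.get(...) returns the class names written by the driver above.
            graphparser = Class.forName(conf.get("GraphParser")).newInstance();
            vidparser = Class.forName(conf.get("VidParser")).newInstance();
            vdataparser = Class.forName(conf.get("VdataParser")).newInstance();
        } catch (Exception e) {
            throw new RuntimeException("Failed to instantiate configured parser classes", e);
        }
    }
}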
From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.SortDictMR.java
License:Open Source License
/**
 * @param inputpath
 *            the path to a rawId to newId dictionary.
 * @param outputpath
 *            the path of output directory.
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(SortDictMR.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setMapperClass(SortDictMapper.class);
    conf.setReducerClass(SortDictReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setBoolean("hashRawVid", hashRawVid);
    conf.setInt("numChunks", numChunks);
    conf.set("VidParser", vidparser.getClass().getName());
    String outprefix = "vidhashmap";
    for (int i = 0; i < numChunks; i++) {
        MultipleOutputs.addNamedOutput(conf, outprefix + i, TextOutputFormat.class, Text.class, Text.class);
    }
    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));
    LOG.info("========== Job: Partition the map of rawid -> id ===========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.info("======================================================");
    if (hashRawVid)
        LOG.info("Partition on rawId.");
    else
        LOG.info("Partition on newId");
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    JobClient.runJob(conf);
    LOG.info("======================= Done ==========================\n");
}
From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.SortEdgeMR.java
License:Open Source License
public void run(String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(SortEdgeMR.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(SortEdgeMapper.class);
    conf.setReducerClass(SortEdgeReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setInt("numChunks", numChunks);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("EdataParser", edataparser.getClass().getName());
    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));
    LOG.info("==== Job: Partition the input edges by hash(sourceid) =========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("EdataParser = " + edataparser.getClass().getName());
    LOG.info("===============================================================");
    JobClient.runJob(conf);
    LOG.info("=================== Done ====================================\n");
}
From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.TransEdgeMR.java
License:Open Source License
/**
 * @param inputpath
 *            path of the partitioned edge list
 * @param outputpath
 *            path of the output directory
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(TransEdgeMR.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setMapperClass(TransEdgeMapper.class);
    conf.setReducerClass(TransEdgeReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setInt("numChunks", numChunks);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("EdataParser", edataparser.getClass().getName());
    conf.set("dictionaryPath", dictionaryPath);
    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));
    LOG.info("============= Job: Normalize Ids in Edges ====================");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.info("Dictionary = " + dictionaryPath);
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("EdataParser = " + edataparser.getClass().getName());
    LOG.info("===============================================================");
    JobClient.runJob(conf);
    LOG.info("========================= Done ===============================");
}
From source file:com.kadwa.hadoop.DistExec.java
License:Open Source License
/**
 * Initialize ExecFilesMapper specific job-configuration.
 *
 * @param conf : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args Arguments
 * @return true if it is necessary to launch a job.
 */
private static boolean setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());
    jobConf.set(EXEC_CMD_LABEL, args.execCmd);

    // set boolean values
    jobConf.setBoolean(Options.REDIRECT_ERROR_TO_OUT.propertyname,
            args.flags.contains(Options.REDIRECT_ERROR_TO_OUT));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path stagingArea;
    try {
        stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }
    Path jobDirectory = new Path(stagingArea + NAME + "_" + randomId);
    FsPermission mapredSysPerms = new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
    FileSystem.mkdirs(FileSystem.get(jobDirectory.toUri(), conf), jobDirectory, mapredSysPerms);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    FileSystem dstfs = args.dst.getFileSystem(conf);

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), new Path[] { args.dst }, conf);

    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
        dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
        String filename = "_" + NAME + "_logs_" + randomId;
        if (!dstExists || !dstIsDir) {
            Path parent = args.dst.getParent();
            if (!dstfs.exists(parent)) {
                dstfs.mkdirs(parent);
            }
            logPath = new Path(parent, filename);
        } else {
            logPath = new Path(args.dst, filename);
        }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_" + NAME + "_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist, LongWritable.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_" + NAME + "_dst_files");
    SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class,
            Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_" + NAME + "_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists);
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
        for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
            final Path src = srcItr.next();
            FileSystem srcfs = src.getFileSystem(conf);
            FileStatus srcfilestat = srcfs.getFileStatus(src);
            Path root = special && srcfilestat.isDir() ? src : src.getParent();
            if (srcfilestat.isDir()) {
                ++srcCount;
            }

            Stack<FileStatus> pathstack = new Stack<FileStatus>();
            for (pathstack.push(srcfilestat); !pathstack.empty();) {
                FileStatus cur = pathstack.pop();
                FileStatus[] children = srcfs.listStatus(cur.getPath());
                for (int i = 0; i < children.length; i++) {
                    boolean skipfile = false;
                    final FileStatus child = children[i];
                    final String dst = makeRelative(root, child.getPath());

                    ++srcCount;

                    if (child.isDir()) {
                        pathstack.push(child);
                    } else {
                        if (!skipfile) {
                            ++fileCount;
                            byteCount += child.getLen();

                            if (LOG.isTraceEnabled()) {
                                LOG.trace("adding file " + child.getPath());
                            }

                            ++cnsyncf;
                            cbsyncs += child.getLen();
                            if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
                                src_writer.sync();
                                dst_writer.sync();
                                cnsyncf = 0;
                                cbsyncs = 0L;
                            }
                        }
                    }

                    if (!skipfile) {
                        src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                                new FilePair(child, dst));
                    }

                    dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
                }

                if (cur.isDir()) {
                    String dst = makeRelative(root, cur.getPath());
                    dir_writer.append(new Text(dst), new FilePair(cur, dst));
                    if (++dirsyn > SYNC_FILE_MAX) {
                        dirsyn = 0;
                        dir_writer.sync();
                    }
                }
            }
        }
    } finally {
        checkAndClose(src_writer);
        checkAndClose(dst_writer);
        checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
        dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
        LOG.info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
        if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
            throw new IOException("Failed to create" + args.dst);
        }
    }

    final Path sorted = new Path(jobDirectory, "_" + NAME + "_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    Path tmpDir = new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
            "_" + NAME + "_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());
    LOG.info("sourcePathsCount=" + srcCount);
    LOG.info("filesToExecCount=" + fileCount);
    LOG.info("bytesToExecCount=" + StringUtils.humanReadableInt(byteCount));
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(fileCount, jobConf);
    return fileCount > 0;
}
From source file:com.linkedin.mapred.AbstractAvroJob.java
License:Open Source License
/**
 * Sets up various standard settings in the JobConf. You probably don't want to mess with this.
 *
 * @return A configured JobConf.
 * @throws IOException
 * @throws URISyntaxException
 */
protected JobConf createJobConf() throws IOException, URISyntaxException {
    JobConf conf = new JobConf();
    conf.setJobName(getJobId());
    conf.setInputFormat(AvroInputFormat.class);
    conf.setOutputFormat(AvroOutputFormat.class);

    AvroOutputFormat.setDeflateLevel(conf, 9);

    String hadoop_ugi = _config.getString("hadoop.job.ugi", null);
    if (hadoop_ugi != null) {
        conf.set("hadoop.job.ugi", hadoop_ugi);
    }
    if (_config.getBoolean("is.local", false)) {
        conf.set("mapred.job.tracker", "local");
        conf.set("fs.default.name", "file:///");
        conf.set("mapred.local.dir", "/tmp/map-red");

        _log.info("Running locally, no hadoop jar set.");
    }

    // set JVM options if present
    if (_config.containsKey("mapred.child.java.opts")) {
        conf.set("mapred.child.java.opts", _config.getString("mapred.child.java.opts"));
        _log.info("mapred.child.java.opts set to " + _config.getString("mapred.child.java.opts"));
    }

    if (_config.containsKey(INPUT_PATHS)) {
        List<String> inputPathnames = _config.getStringList(INPUT_PATHS);
        for (String pathname : inputPathnames) {
            AvroUtils.addAllSubPaths(conf, new Path(pathname));
        }
        AvroJob.setInputSchema(conf, AvroUtils.getAvroInputSchema(conf));
    }

    if (_config.containsKey(OUTPUT_PATH)) {
        Path path = new Path(_config.get(OUTPUT_PATH));
        AvroOutputFormat.setOutputPath(conf, path);

        if (_config.getBoolean("force.output.overwrite", false)) {
            FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
            fs.delete(FileOutputFormat.getOutputPath(conf), true);
        }
    }

    // set all hadoop configs
    for (String key : _config.keySet()) {
        String lowerCase = key.toLowerCase();
        if (lowerCase.startsWith(HADOOP_PREFIX)) {
            String newKey = key.substring(HADOOP_PREFIX.length());
            conf.set(newKey, _config.get(key));
        }
    }
    return conf;
}
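The final loop above forwards any job property whose key starts with a designated prefix into the JobConf after stripping that prefix. The following tiny sketch illustrates just that pattern in isolation; the prefix string "hadoop-conf." and the sample keys are placeholders, not necessarily the actual value of HADOOP_PREFIX in this class.

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.mapred.JobConf;

// Illustrative only: mimics the prefix-stripping loop in createJobConf().
// "hadoop-conf." is a placeholder prefix, not necessarily HADOOP_PREFIX.
public class PrefixForwardingExample {
    public static void main(String[] args) {
        String prefix = "hadoop-conf.";
        Map<String, String> jobProps = new HashMap<String, String>();
        jobProps.put("hadoop-conf.mapred.reduce.tasks", "4");
        jobProps.put("some.other.key", "ignored");

        JobConf conf = new JobConf();
        for (Map.Entry<String, String> e : jobProps.entrySet()) {
            if (e.getKey().toLowerCase().startsWith(prefix)) {
                // strip the prefix and forward the remainder into the JobConf
                conf.set(e.getKey().substring(prefix.length()), e.getValue());
            }
        }
        System.out.println(conf.get("mapred.reduce.tasks")); // prints 4
    }
}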