Example usage for org.apache.hadoop.mapred JobConf set

Introduction

This page collects example usages of the org.apache.hadoop.mapred.JobConf.set method.

Prototype

public void set(String name, String value) 

Document

Set the value of the name property.
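
A minimal sketch of the basic pattern, using a made-up property name and value: store a string under a key, then read it back later (for example in a task's configure() method).

JobConf conf = new JobConf();
conf.set("my.custom.property", "some-value");    // store an arbitrary key/value pair in the job configuration
String value = conf.get("my.custom.property");   // read it back later, e.g. on the task side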

Usage

From source file:com.ibm.jaql.io.hbase.TableOutputConfigurator.java

License:Apache License

public void setParallel(JobConf conf) throws Exception {
    conf.set(TableOutputFormat.OUTPUT_TABLE, location);
    conf.setOutputKeyClass(JsonHolderDefault.class);
    conf.setOutputValueClass(JsonHolderDefault.class);
    HadoopSerializationDefault.register(conf);
    conf.setOutputKeyComparatorClass(DefaultJsonComparator.class);
}
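
For context, a hedged sketch of how this property is typically wired up when writing to HBase with the classic mapred API; the driver class and table name below are hypothetical.

JobConf conf = new JobConf(MyHBaseJob.class);            // hypothetical driver class
conf.setOutputFormat(TableOutputFormat.class);           // org.apache.hadoop.hbase.mapred.TableOutputFormat
conf.set(TableOutputFormat.OUTPUT_TABLE, "my_table");    // name of the target HBase table (hypothetical)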

From source file:com.ibm.jaql.lang.expr.core.RegisterExceptionHandler.java

License:Apache License

public static void writeConf(String name, JobConf conf) throws Exception {
    ThresholdExceptionHandler handler = (ThresholdExceptionHandler) JaqlUtil.getExceptionHandler();
    BufferedJsonRecord r = new BufferedJsonRecord();
    r.add(ERROR_THRESH_FIELD_NAME, new JsonLong(handler.getMaxExceptions()));
    String s = JsonUtil.printToString(r);
    conf.set(name, s);
}
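
The value stored here is a serialized JSON record; on the task side it can be read back with the matching getter. A hedged sketch, using a hypothetical property name rather than the one Jaql actually passes in:

public void configure(JobConf conf) {
    // retrieve the threshold record serialized by writeConf();
    // "jaql.error.handler" is a hypothetical property name used only for illustration
    String serialized = conf.get("jaql.error.handler");
    // Jaql-side code would parse this JSON string back into a record
}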

From source file:com.ibm.jaql.lang.expr.system.RJaqlInterface.java

License:Apache License

/**
 * Saves simple R objects to HDFS in one of the formats supported by Jaql so
 * that they can be read directly into Jaql.
 * @param localPath
 * @param hdfsPath
 * @param schemaString
 * @param format
 * @param header
 * @param vector
 * @return true if the object was saved to HDFS successfully, false otherwise
 */
public boolean jaqlSave(String localPath, String hdfsPath, String schemaString, String format, boolean header,
        boolean vector) {
    if (format.equalsIgnoreCase(FORMAT_DELIM)) {
        LOG.info("Format: " + FORMAT_DELIM + ", saving to HDFS loc: " + hdfsPath);
        return RUtil.saveToHDFS(localPath, hdfsPath);
    }
    try {
        JobConf conf = new JobConf();
        int DEFAULT_BUFFER_SIZE = 64 * 1024;
        int bufferSize = conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        BufferedReader reader = new BufferedReader(new FileReader(localPath), bufferSize);
        LongWritable key = new LongWritable(0);
        long count = 0;
        Text value = new Text();
        BufferedJsonRecord options = new BufferedJsonRecord(2);
        BufferedJsonArray headerArray = null;
        if (header) {
            String headerString = reader.readLine();
            String[] headers = splitPattern.split(headerString);
            headerArray = new BufferedJsonArray(headers.length);
            for (int i = 0; i < headers.length; i++) {
                headerArray.set(i, new JsonString(StringUtils.strip(headers[i], "\"")));
            }
            count++;
        }

        Schema schema = null;
        if (schemaString != null) {
            schema = SchemaFactory.parse(schemaString);
        }

        if (headerArray != null) {
            RecordSchema recordSchema = (RecordSchema) schema;

            // construct new matching schema
            List<Field> fields = new LinkedList<Field>();
            for (JsonValue fieldName : headerArray) {
                Field field;
                if (recordSchema == null) {
                    field = new Field((JsonString) fieldName, SchemaFactory.stringSchema(), false);
                } else {
                    field = recordSchema.getField((JsonString) fieldName);
                    if (field == null)
                        throw new NullPointerException("header field not in schema: " + fieldName);
                    // FIXME: schema fields that are not in the header are currently considered OK
                }
                fields.add(field);
            }

            // and set it
            schema = new RecordSchema(fields, null);
        }
        if (schema != null)
            options.add(DelOptionParser.SCHEMA_NAME, new JsonSchema(schema));
        KeyValueImport<LongWritable, Text> converter = null;
        if (vector) {
            converter = new FromLinesConverter();
        } else {
            converter = new FromDelConverter();
        }
        LOG.info("Initializing Converter with options: " + options);
        converter.init(options);
        Schema tmpSchema = converter.getSchema();
        tmpSchema = SchemaTransformation.removeNullability(tmpSchema);
        if (!tmpSchema.is(JsonType.ARRAY, JsonType.RECORD, JsonType.BOOLEAN, JsonType.DECFLOAT, JsonType.DOUBLE,
                JsonType.LONG, JsonType.STRING).always()) {
            throw new IOException("Unrecognized schema type: " + schema.getSchemaType());
        }
        JsonValue outValue = converter.createTarget();
        JsonHolder outKeyHolder;
        JsonHolder outValueHolder;
        if (format.equalsIgnoreCase(FORMAT_DEFAULT)) {
            HadoopSerializationDefault.register(conf);
            outKeyHolder = new JsonHolderDefault();
            outValueHolder = new JsonHolderDefault(outValue);
            LOG.info("Registered serializer for Default format.");
        } else if (format.equalsIgnoreCase(FORMAT_TEMP)) {
            // TODO: There should be a better way of doing this. HadoopSerializationTemp
            // now does it in an ugly way.
            BufferedJsonRecord tmpOptions = new BufferedJsonRecord();
            BufferedJsonRecord outOptions = new BufferedJsonRecord();
            outOptions.add(new JsonString("schema"), new JsonSchema(schema));
            tmpOptions.add(new JsonString("options"), outOptions);
            conf.set(ConfSetter.CONFOUTOPTIONS_NAME, tmpOptions.toString());
            HadoopSerializationTemp.register(conf);
            outKeyHolder = new JsonHolderTempKey(null);
            outValueHolder = new JsonHolderTempValue();
            LOG.info("Registered serializer for HadoopTemp format.");
        } else {
            throw new IOException("Unrecognized serialization format requested: " + format);
        }
        FileSystem fs = FileSystem.get(conf);
        Path outputPath = new Path(hdfsPath);
        Writer writer = SequenceFile.createWriter(fs, conf, outputPath, outKeyHolder.getClass(),
                outValueHolder.getClass());
        String line;
        while ((line = reader.readLine()) != null) {
            key.set(count++);
            value.set(line);
            outValue = converter.convert(key, value, outValue);
            outValueHolder.value = outValue;
            writer.append(outKeyHolder, outValueHolder);
        }
        LOG.info("Transferred " + count + " line(s).");
        reader.close();
        writer.close();
    } catch (IOException e) {
        LOG.info("Error in saving object.", e);
        return false;
    }
    return true;
}

From source file:com.intel.hadoop.graphbuilder.demoapps.wikipedia.WikiPageInputFormat.java

License:Open Source License

@Override
public RecordReader<LongWritable, Text> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
        throws IOException {
    conf.set(XMLInputFormat.START_TAG_KEY, START_TAG);
    conf.set(XMLInputFormat.END_TAG_KEY, END_TAG);
    return new XMLRecordReader((FileSplit) split, conf);
}
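
A hedged sketch of plugging this input format into a job with the classic mapred API; the input path is hypothetical, and the start/end tag keys are set inside getRecordReader() as shown above.

JobConf conf = new JobConf(WikiPageInputFormat.class);
conf.setInputFormat(WikiPageInputFormat.class);
FileInputFormat.setInputPaths(conf, new Path("/data/wiki-dump.xml"));   // hypothetical input path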

From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.HashIdMR.java

License:Open Source License

/**
 * @param inputpath
 *          the path to a unique vertex list. Each line is parsed into (vid,
 *          data) using {@code vidparser} and {@code vdataparser}.
 * @param outputpath
 *          the path of output directory.
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(HashIdMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(HashIdMapper.class);
    conf.setReducerClass(HashIdReducer.class);

    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputFormat(MultiDirOutputFormat.class);

    conf.setInt("mapred.line.input.format.linespermap", linespermap);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("VdataParser", vdataparser.getClass().getName());

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("====== Job: Create integer Id maps for vertices ==========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.debug("Lines per map = 6000000");
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("VdataParser = " + vdataparser.getClass().getName());
    LOG.info("==========================================================");
    JobClient.runJob(conf);
    LOG.info("=======================Done =====================\n");
}
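
On the task side, the class names stored with conf.set are usually read back in configure() and instantiated reflectively. A hedged sketch of that read-back (the GraphParser/FieldParser interface names are assumptions, and this is not the project's actual mapper code):

public void configure(JobConf job) {
    try {
        // instantiate the parsers whose class names were stored by run()
        graphparser = (GraphParser) Class.forName(job.get("GraphParser")).newInstance();
        vidparser = (FieldParser) Class.forName(job.get("VidParser")).newInstance();
        vdataparser = (FieldParser) Class.forName(job.get("VdataParser")).newInstance();
    } catch (Exception e) {
        throw new RuntimeException("Could not instantiate configured parsers", e);
    }
}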

From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.SortDictMR.java

License:Open Source License

/**
 * @param inputpath
 *          the path to a rawId to newId dictionary.
 * @param outputpath
 *          the path of output directory.
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {

    JobConf conf = new JobConf(SortDictMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(SortDictMapper.class);
    conf.setReducerClass(SortDictReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setBoolean("hashRawVid", hashRawVid);
    conf.setInt("numChunks", numChunks);
    conf.set("VidParser", vidparser.getClass().getName());

    String outprefix = "vidhashmap";
    for (int i = 0; i < numChunks; i++) {
        MultipleOutputs.addNamedOutput(conf, outprefix + i, TextOutputFormat.class, Text.class, Text.class);
    }

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("========== Job: Partition the map of rawid -> id ===========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.info("======================================================");
    if (hashRawVid)
        LOG.info("Partition on rawId.");
    else
        LOG.info("Partition on newId");
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    JobClient.runJob(conf);
    LOG.info("======================= Done ==========================\n");
}
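
Because run() registers one named output per chunk ("vidhashmap0", "vidhashmap1", ...), the reduce side typically routes records through MultipleOutputs. A hedged sketch, not the project's actual SortDictReducer:

private MultipleOutputs mos;

public void configure(JobConf job) {
    mos = new MultipleOutputs(job);   // gives access to the named outputs registered in run()
}

public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
    while (values.hasNext()) {
        // write to the chunk-specific named output, e.g. "vidhashmap3" for key 3
        mos.getCollector("vidhashmap" + key.get(), reporter).collect(new Text(key.toString()), values.next());
    }
}

public void close() throws IOException {
    mos.close();   // flush and close all named outputs
}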

From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.SortEdgeMR.java

License:Open Source License

public void run(String inputpath, String outputpath) throws IOException {

    JobConf conf = new JobConf(SortEdgeMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(SortEdgeMapper.class);
    conf.setReducerClass(SortEdgeReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setInt("numChunks", numChunks);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("EdataParser", edataparser.getClass().getName());

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("==== Job: Partition the input edges by hash(sourceid) =========");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("EdataParser = " + edataparser.getClass().getName());
    LOG.info("===============================================================");

    JobClient.runJob(conf);
    LOG.info("=================== Done ====================================\n");
}

From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.TransEdgeMR.java

License:Open Source License

/**
 * @param inputpath
 *          path of the partitioned edge list
 * @param outputpath
 *          path of the output directory
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {

    JobConf conf = new JobConf(TransEdgeMR.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(TransEdgeMapper.class);
    conf.setReducerClass(TransEdgeReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setInt("numChunks", numChunks);
    conf.set("GraphParser", graphparser.getClass().getName());
    conf.set("VidParser", vidparser.getClass().getName());
    conf.set("EdataParser", edataparser.getClass().getName());

    conf.set("dictionaryPath", dictionaryPath);

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    LOG.info("============= Job: Normalize Ids in Edges ====================");
    LOG.info("Input = " + inputpath);
    LOG.info("Output = " + outputpath);
    LOG.info("Dictionary = " + dictionaryPath);
    LOG.debug("numChunks = " + numChunks);
    LOG.debug("GraphParser = " + graphparser.getClass().getName());
    LOG.debug("VidParser = " + vidparser.getClass().getName());
    LOG.debug("EdataParser = " + edataparser.getClass().getName());
    LOG.info("===============================================================");

    JobClient.runJob(conf);

    LOG.info("========================= Done ===============================");
}

From source file:com.kadwa.hadoop.DistExec.java

License:Open Source License

/**
 * Initialize ExecFilesMapper specific job-configuration.
 *
 * @param conf    : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args    Arguments
 * @return true if it is necessary to launch a job.
 */
private static boolean setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());
    jobConf.set(EXEC_CMD_LABEL, args.execCmd);

    //set boolean values
    jobConf.setBoolean(Options.REDIRECT_ERROR_TO_OUT.propertyname,
            args.flags.contains(Options.REDIRECT_ERROR_TO_OUT));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path stagingArea;
    try {
        stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }

    Path jobDirectory = new Path(stagingArea + NAME + "_" + randomId);
    FsPermission mapredSysPerms = new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
    FileSystem.mkdirs(FileSystem.get(jobDirectory.toUri(), conf), jobDirectory, mapredSysPerms);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    FileSystem dstfs = args.dst.getFileSystem(conf);

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), new Path[] { args.dst }, conf);

    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
        dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
        String filename = "_" + NAME + "_logs_" + randomId;
        if (!dstExists || !dstIsDir) {
            Path parent = args.dst.getParent();
            if (!dstfs.exists(parent)) {
                dstfs.mkdirs(parent);
            }
            logPath = new Path(parent, filename);
        } else {
            logPath = new Path(args.dst, filename);
        }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_" + NAME + "_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist, LongWritable.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_" + NAME + "_dst_files");
    SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class,
            Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_" + NAME + "_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists);
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
        for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
            final Path src = srcItr.next();
            FileSystem srcfs = src.getFileSystem(conf);
            FileStatus srcfilestat = srcfs.getFileStatus(src);
            Path root = special && srcfilestat.isDir() ? src : src.getParent();
            if (srcfilestat.isDir()) {
                ++srcCount;
            }

            Stack<FileStatus> pathstack = new Stack<FileStatus>();
            for (pathstack.push(srcfilestat); !pathstack.empty();) {
                FileStatus cur = pathstack.pop();
                FileStatus[] children = srcfs.listStatus(cur.getPath());
                for (int i = 0; i < children.length; i++) {
                    boolean skipfile = false;
                    final FileStatus child = children[i];
                    final String dst = makeRelative(root, child.getPath());
                    ++srcCount;

                    if (child.isDir()) {
                        pathstack.push(child);
                    } else {

                        if (!skipfile) {
                            ++fileCount;
                            byteCount += child.getLen();

                            if (LOG.isTraceEnabled()) {
                                LOG.trace("adding file " + child.getPath());
                            }

                            ++cnsyncf;
                            cbsyncs += child.getLen();
                            if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
                                src_writer.sync();
                                dst_writer.sync();
                                cnsyncf = 0;
                                cbsyncs = 0L;
                            }
                        }
                    }

                    if (!skipfile) {
                        src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                                new FilePair(child, dst));
                    }

                    dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
                }

                if (cur.isDir()) {
                    String dst = makeRelative(root, cur.getPath());
                    dir_writer.append(new Text(dst), new FilePair(cur, dst));
                    if (++dirsyn > SYNC_FILE_MAX) {
                        dirsyn = 0;
                        dir_writer.sync();
                    }
                }
            }
        }
    } finally {
        checkAndClose(src_writer);
        checkAndClose(dst_writer);
        checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
        dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
        LOG.info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
        if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
            throw new IOException("Failed to create" + args.dst);
        }
    }

    final Path sorted = new Path(jobDirectory, "_" + NAME + "_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    Path tmpDir = new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
            "_" + NAME + "_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());
    LOG.info("sourcePathsCount=" + srcCount);
    LOG.info("filesToExecCount=" + fileCount);
    LOG.info("bytesToExecCount=" + StringUtils.humanReadableInt(byteCount));
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(fileCount, jobConf);
    return fileCount > 0;
}
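
The labels written by setup() (DST_DIR_LABEL, SRC_LIST_LABEL, TMP_DIR_LABEL, EXEC_CMD_LABEL) are read back inside the map tasks. A hedged sketch of that read-back, not the actual ExecFilesMapper:

public void configure(JobConf job) {
    // recover the values that setup() stored in the job configuration
    Path dstDir = new Path(job.get(DST_DIR_LABEL));
    Path srcList = new Path(job.get(SRC_LIST_LABEL));
    Path tmpDir = new Path(job.get(TMP_DIR_LABEL));
    String execCmd = job.get(EXEC_CMD_LABEL);
}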

From source file:com.linkedin.mapred.AbstractAvroJob.java

License:Open Source License

/**
 * Sets up various standard settings in the JobConf. You probably don't want to mess with this.
 *
 * @return A configured JobConf.
 * @throws IOException
 * @throws URISyntaxException 
 */
protected JobConf createJobConf() throws IOException, URISyntaxException {
    JobConf conf = new JobConf();

    conf.setJobName(getJobId());
    conf.setInputFormat(AvroInputFormat.class);
    conf.setOutputFormat(AvroOutputFormat.class);

    AvroOutputFormat.setDeflateLevel(conf, 9);

    String hadoop_ugi = _config.getString("hadoop.job.ugi", null);
    if (hadoop_ugi != null) {
        conf.set("hadoop.job.ugi", hadoop_ugi);
    }
    if (_config.getBoolean("is.local", false)) {
        conf.set("mapred.job.tracker", "local");
        conf.set("fs.default.name", "file:///");
        conf.set("mapred.local.dir", "/tmp/map-red");

        _log.info("Running locally, no hadoop jar set.");
    }

    // set JVM options if present
    if (_config.containsKey("mapred.child.java.opts")) {
        conf.set("mapred.child.java.opts", _config.getString("mapred.child.java.opts"));
        _log.info("mapred.child.java.opts set to " + _config.getString("mapred.child.java.opts"));
    }

    if (_config.containsKey(INPUT_PATHS)) {
        List<String> inputPathnames = _config.getStringList(INPUT_PATHS);
        for (String pathname : inputPathnames) {
            AvroUtils.addAllSubPaths(conf, new Path(pathname));
        }
        AvroJob.setInputSchema(conf, AvroUtils.getAvroInputSchema(conf));
    }

    if (_config.containsKey(OUTPUT_PATH)) {
        Path path = new Path(_config.get(OUTPUT_PATH));
        AvroOutputFormat.setOutputPath(conf, path);

        if (_config.getBoolean("force.output.overwrite", false)) {
            FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
            fs.delete(FileOutputFormat.getOutputPath(conf), true);
        }
    }
    // set all hadoop configs
    for (String key : _config.keySet()) {
        String lowerCase = key.toLowerCase();
        if (lowerCase.startsWith(HADOOP_PREFIX)) {
            String newKey = key.substring(HADOOP_PREFIX.length());
            conf.set(newKey, _config.get(key));
        }
    }
    return conf;
}
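
A hedged usage sketch: a subclass would typically obtain the configured JobConf from createJobConf(), finish the job-specific wiring, and submit it with the classic mapred API. This is not the project's actual run() method.

JobConf conf = createJobConf();
// ... job-specific setup (mapper/reducer classes, Avro schemas) would go here ...
JobClient.runJob(conf);   // submit the job and block until it completes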