Example usage for org.apache.hadoop.mapred JobConf set

List of usage examples for org.apache.hadoop.mapred JobConf set

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf set.

Prototype

public void set(String name, String value) 

Document

Set the value of the name property.
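
A quick, self-contained sketch of the set/get round trip (the property name "my.app.greeting" is an arbitrary example, not a Hadoop-defined key):

import org.apache.hadoop.mapred.JobConf;

public class JobConfSetExample {
    public static void main(String[] args) {
        // store a custom property on the JobConf ...
        JobConf conf = new JobConf();
        conf.set("my.app.greeting", "hello");

        // ... and read it back, with a fallback default
        System.out.println(conf.get("my.app.greeting", "default-value")); // prints "hello"
    }
}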

Usage

From source file:com.ebay.erl.mobius.core.JobSetup.java

License:Apache License

/**
 * specify the columns that a mapper needs to emit.
 */
public static void setupProjections(JobConf job, Dataset dataset, byte datasetID, Column... projections) {
    StringBuffer sortedColumns = new StringBuffer();

    // dedupe the projection input column names, then sort them.

    Set<String> uniqueColumnNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);

    for (Column aProjection : projections) {
        uniqueColumnNames.add(aProjection.getInputColumnName());
    }

    Iterator<String> it = uniqueColumnNames.iterator();
    while (it.hasNext()) {
        sortedColumns.append(it.next());
        if (it.hasNext())
            sortedColumns.append(",");
    }
    job.set(datasetID + ".value.columns", sortedColumns.toString());

    // for Mapper only task
    StringBuffer originalOrder = new StringBuffer();
    for (int i = 0; i < projections.length; i++) {
        originalOrder.append(projections[i].getInputColumnName());
        if (i < projections.length - 1)
            originalOrder.append(",");
    }
    job.set(datasetID + ".columns.in.original.order", originalOrder.toString());
}
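
The properties written by setupProjections are later read back with JobConf#get. A rough sketch of that read-back, assuming a dataset ID of 0 and a made-up column list (not taken from the Mobius source):

// hypothetical read-back of the property written above; the dataset ID (0)
// and the column list "a,b,c" are assumptions for illustration only.
byte datasetID = 0;
JobConf job = new JobConf();
job.set(datasetID + ".value.columns", "a,b,c");
String[] sortedColumns = job.get(datasetID + ".value.columns").split(",");
// sortedColumns == {"a", "b", "c"}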

From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java

License:Apache License

@Override
protected synchronized void submit() {
    JobConf jobConf = this.getJobConf();
    boolean isLocalHadoop = jobConf.get("mapred.job.tracker", "local").equals("local");

    // the default partitioner is {@link com.ebay.erl.mobius.core.datajoin.DataJoinKeyPartitioner},
    // which is hash based.
    //
    // If the user chooses the even partitioner, Mobius will use
    // {@link com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner}, a
    // sampling-based partitioner that attempts to balance the load
    // across the reducers.
    String partitioner = jobConf.get("mobius.partitioner", "default");

    if (!isLocalHadoop && jobConf.getNumReduceTasks() != 0 && partitioner.equals("even")) {
        // this job needs reducers; sample the keys so that the load
        // on the reducers is distributed almost evenly.

        double freq = jobConf.getFloat("mobius.sampler.freq", 0.1F);
        int numSamples = jobConf.getInt("mobius.sampler.num.samples", 50000);
        int maxSplits = jobConf.getInt("mobius.sampler.max.slipts.sampled", 5);

        // log the sampling parameters so the user knows what was used.
        LOGGER.info("Sampling parameters { " + "mobius.sampler.freq:" + format.format(freq) + ", "
                + "mobius.sampler.num.samples:" + numSamples + ", " + "mobius.sampler.max.slipts.sampled:"
                + maxSplits + "}");

        InputSampler.Sampler<?, ?> sampler = new MobiusInputSampler(freq, numSamples, maxSplits);

        writePartitionFile(jobConf, sampler);

        // add to distributed cache
        try {
            URI partitionUri = new URI(TotalOrderPartitioner.getPartitionFile(jobConf) + "#_partitions");
            LOGGER.info("Adding partition uri to distributed cache:" + partitionUri.toString());

            DistributedCache.addCacheFile(partitionUri, jobConf);
            DistributedCache.createSymlink(jobConf);
            jobConf.setPartitionerClass(EvenlyPartitioner.class);

            LOGGER.info("Using " + EvenlyPartitioner.class.getCanonicalName()
                    + " to partiton the keys evenly among reducers.");
        } catch (URISyntaxException e) {
            LOGGER.error(e.getMessage(), e);
            throw new RuntimeException(e);
        }

        // adding -XX:-UseParallelOldGC will automatically set -XX:-UseParallelGC
        // as well, according to Oracle's specification
        String jvmOpts = jobConf.get("mapred.child.java.opts", "");
        if (jvmOpts.isEmpty()) {
            jvmOpts = "-XX:-UseParallelOldGC";
        } else {
            if (jvmOpts.indexOf("-XX:-UseParallelOldGC") < 0) {
                // strip any double quotes before appending the flag
                jvmOpts = jvmOpts.replaceAll("\"", "");
                jvmOpts = jvmOpts.concat(" -XX:-UseParallelOldGC");
            }
        }
        jobConf.set("mapred.child.java.opts", jvmOpts);

        this.setJobConf(jobConf);
    }
    LOGGER.info("Submiting job:" + jobConf.getJobName());
    super.submit();
}

From source file:com.ebay.erl.mobius.core.mapred.MobiusInputSampler.java

License:Apache License

@Override
public Object[] getSample(InputFormat inf, JobConf job) throws IOException {
    // the following code is copied from {@link InputSampler#RandomSampler},
    // with some modifications.

    InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
    ArrayList<DataJoinKey> samples = new ArrayList<DataJoinKey>(this.numSamples);
    int splitsToSample = Math.min(this.maxSplitsSampled, splits.length);

    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    // get Sorters
    Sorter[] sorters = null;
    if (job.get(ConfigureConstants.SORTERS, null) != null) {
        // total sort job
        sorters = (Sorter[]) SerializableUtil.deserializeFromBase64(job.get(ConfigureConstants.SORTERS), job);
    } else {
        // there is no sorter; this should be a reducer/join job
        Column[] keys = (Column[]) SerializableUtil
                .deserializeFromBase64(job.get(ConfigureConstants.ALL_GROUP_KEY_COLUMNS), job);
        sorters = new Sorter[keys.length];
        for (int i = 0; i < keys.length; i++) {
            sorters[i] = new Sorter(keys[i].getInputColumnName(), Ordering.ASC);
        }
    }

    long proportion = 10L;
    while ((int) (this.freq * proportion) == 0) {
        proportion = proportion * 10;
    }
    proportion = 5L * proportion;

    // shuffle splits
    for (int i = 0; i < splits.length; ++i) {
        InputSplit tmp = splits[i];
        int j = r.nextInt(splits.length);
        splits[i] = splits[j];
        splits[j] = tmp;
    }

    SamplingOutputCollector collector = new SamplingOutputCollector();
    for (int i = 0; i < splitsToSample || (i < splits.length && samples.size() < numSamples); i++) {
        LOGGER.info("Sampling from split #" + (i + 1) + ", collected samples:" + samples.size());

        RecordReader<WritableComparable, WritableComparable> reader = inf.getRecordReader(splits[i], job,
                Reporter.NULL);
        WritableComparable key = reader.createKey();
        WritableComparable value = reader.createValue();

        if (!(inf instanceof MobiusDelegatingInputFormat)) {
            // not the Mobius delegating input format, so CURRENT_DATASET_ID
            // will not be set by inf#getRecordReader; we set it here.
            //
            // set the current dataset id, as the AbstractMobiusMapper#configure
            // method needs this property.
            job.set(ConfigureConstants.CURRENT_DATASET_ID, job.get(ConfigureConstants.ALL_DATASET_IDS));
        }

        Byte datasetID = Byte.valueOf(job.get(ConfigureConstants.CURRENT_DATASET_ID));
        LOGGER.info("Samples coming from dataset: " + datasetID.toString());
        AbstractMobiusMapper mapper = this.getMapper(inf, splits[i], job);
        mapper.configure(job);

        // reading elements from one split
        long readElement = 0;
        while (reader.next(key, value)) {
            collector.clear();
            Tuple tuple = mapper.parse(key, value);

            readElement++;
            if (readElement > (((long) numSamples) * ((long) proportion))) {
                // a split might be very big (e.g. a large gz file),
                // so cap the number of records read from any single split.
                break;
            }

            if (r.nextDouble() <= freq) {
                if (samples.size() < numSamples) {
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    // joinmap function might generate more than one output key
                    // per <code>key</code> input. 
                    for (Tuple t : collector.getOutKey()) {
                        Tuple mt = Tuple.merge(tuple, t);
                        DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                        samples.add(nkey);
                    }
                } else {
                    // When exceeding the maximum number of samples, replace
                    // a random element with this one, then adjust the
                    // frequency to reflect the possibility of existing 
                    // elements being pushed out

                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    for (Tuple t : collector.getOutKey()) {
                        int ind = r.nextInt(numSamples);
                        if (ind != numSamples) {
                            Tuple mt = Tuple.merge(tuple, t);
                            DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                            samples.set(ind, nkey);
                        }
                    }

                    freq *= (numSamples - collector.getOutKey().size()) / (double) numSamples;
                }
                key = reader.createKey();
                value = reader.createValue();
            }
        }
        reader.close();
    }
    LOGGER.info("Samples have been collected, return.");
    return samples.toArray();
}

From source file:com.ebay.erl.mobius.core.mapred.MobiusMultiInputs.java

License:Apache License

public static void addInputPath(JobConf conf, Path anInput, Class<? extends InputFormat> inputFormatClass,
        Class<? extends AbstractMobiusMapper> mapperClass, byte datasetID, FileSystem fs) throws IOException {
    MultipleInputs.addInputPath(conf, anInput, inputFormatClass, mapperClass);

    // override the {@link InputFormat} class set by {@link MultipleInputs},
    // as Mobius needs to set the current dataset id per input split.
    conf.setInputFormat(MobiusDelegatingInputFormat.class);

    // MobiusDelegatingInputFormat extends DelegatingInputFormat, which always
    // calls FileInput#setInputs within DelegatingInputFormat#getInputs,
    // regardless of the actual type of <code>inputFormatClass</code>.

    /////////////////////////////////////////////////////
    // start to build the path to dataset ID mapping
    /////////////////////////////////////////////////////
    MultiInputsHelper helper = MultiInputsHelpersRepository.getInstance(conf).getHelper(inputFormatClass);
    URI uri = helper.getUniquePathByInputFormat(conf, anInput);
    String aPath = uri.toString();

    if (aPath.indexOf(";") >= 0)
        throw new IllegalArgumentException(aPath + " cannot contain a semicolon");

    // set the input path to datasetID mapping in the Hadoop configuration.
    if (conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING, "").isEmpty()) {
        conf.set(ConfigureConstants.INPUT_TO_DATASET_MAPPING, datasetID + ";" + aPath);
    } else {
        String previous = conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING);
        conf.set(ConfigureConstants.INPUT_TO_DATASET_MAPPING, datasetID + ";" + aPath + "," + previous);
    }

    //LOGGER.debug(conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING, ""));
}

From source file:com.ebay.erl.mobius.core.mapred.MultiInputsHelpersRepository.java

License:Apache License

public void writeToConf(JobConf conf) {
    Iterator<Entry<Class<? extends InputFormat>, MultiInputsHelper>> entries = this.mapping.entrySet()
            .iterator();

    while (entries.hasNext()) {
        Entry<Class<? extends InputFormat>, MultiInputsHelper> anEntry = entries.next();
        Class<? extends InputFormat> inputFormat = anEntry.getKey();
        Class<? extends MultiInputsHelper> helper = anEntry.getValue().getClass();

        if (!conf.get("mobius.multi.inputs.helpers", "").isEmpty()) {
            String others = conf.get("mobius.multi.inputs.helpers");
            conf.set("mobius.multi.inputs.helpers",
                    others + "," + inputFormat.getCanonicalName() + ":" + helper.getCanonicalName());
        } else {
            conf.set("mobius.multi.inputs.helpers",
                    inputFormat.getCanonicalName() + ":" + helper.getCanonicalName());
        }
    }
}
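
The comma-separated value accumulated above can be split back into class-name pairs on the reading side. A rough sketch, assuming every entry is a well-formed "inputFormatClass:helperClass" pair (this parsing code is illustrative, not part of the Mobius source):

// hypothetical parse of the "mobius.multi.inputs.helpers" value written above
Map<String, String> helpers = new HashMap<String, String>();
String raw = conf.get("mobius.multi.inputs.helpers", "");
if (!raw.isEmpty()) {
    for (String entry : raw.split(",")) {
        String[] pair = entry.split(":");
        helpers.put(pair[0], pair[1]); // input format class name -> helper class name
    }
}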

From source file:com.ebay.erl.mobius.core.MobiusJob.java

License:Apache License

/**
 * Select the <code>columns</code> from the <code>dataset</code> and store
 * the result in <code>outputFolder</code> using the given <code>outputFormat</code>.
 * <p>
 * 
 * Here is an example:
 * <pre>
 * <code>
 * public MyJob extends MobiusJob
 * {
 *    public void run(String[] args)
 *    {
 *       Dataset students = ...;
 *       
 *       // save the result to $OUTPUT in SequenceFileOutputFormat,
 *       // the key will be NullWritable, and the value is a Tuple 
 *       // which contains 3 columns, id, f_name and l_name.
 *       this.list(students,
 *          new Path("$OUTPUT"),
 *          SequenceFileOutputFormat.class,
 *          new Column(students, "id"),
 *          new Column(students, "f_name"),
 *          new Column(students, "l_name")
 *       ); 
 *    }
 *    
 *    public static void main(String[] args) throws Exception
 *    {
 *       System.exit(MobiusJobRunner.run(new MyJob(), args));
 *    }
 * }
 * </code>
 * </pre>
 */
public Dataset list(Dataset dataset, Path outputFolder, Class<? extends FileOutputFormat> outputFormat,
        Column... columns) throws IOException {
    byte datasetID = 0; // set to 0 as there is only one dataset to be operated on.

    JobConf job = dataset.createJobConf(datasetID);

    job.set("mapred.job.name", "Listing " + dataset.getName());
    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(0); // list is a map-only job
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Tuple.class);
    job.setJobName("List " + dataset.getName());

    JobSetup.validateColumns(dataset, columns);
    JobSetup.setupInputs(job, dataset, datasetID);
    JobSetup.setupProjections(job, dataset, datasetID, columns);
    JobSetup.setupOutputs(job, outputFolder, outputFormat);

    this.addToExecQueue(job);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(this).getBuilder(outputFormat,
            "Dataset_" + outputFolder.getName());
    return builder.buildFromPreviousJob(job, outputFormat, Column.toSchemaArray(columns));
}

From source file:com.facebook.hive.orc.TestInputOutputFormat.java

License:Apache License

@Test
public void testMROutput2() throws Exception {
    JobConf job = new JobConf(conf);
    // Test that you can set the output directory using this config
    job.set("mapred.work.output.dir", testFilePath.getParent().toString());
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, job, testFilePath.getName(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new StringRow("a"), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "col");
    properties.setProperty("columns.types", "string");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    reader.next(key, value);
    assertEquals("a", ((StringObjectInspector) fields.get(0).getFieldObjectInspector())
            .getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    reader.close();

}

From source file:com.facebook.presto.hive.AbstractTestHiveFileFormats.java

License:Apache License

public FileSplit createTestFile(String filePath, HiveOutputFormat<?, ?> outputFormat,
        @SuppressWarnings("deprecation") SerDe serDe, String compressionCodec) throws Exception {
    JobConf jobConf = new JobConf();
    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", COLUMN_NAMES_STRING);
    tableProperties.setProperty("columns.types", COLUMN_TYPES);
    serDe.initialize(new Configuration(), tableProperties);

    if (compressionCodec != null) {
        CompressionCodec codec = new CompressionCodecFactory(new Configuration())
                .getCodecByName(compressionCodec);
        jobConf.set(COMPRESS_CODEC, codec.getClass().getName());
        jobConf.set(COMPRESS_TYPE, SequenceFile.CompressionType.BLOCK.toString());
    }

    RecordWriter recordWriter = outputFormat.getHiveRecordWriter(jobConf, new Path(filePath), Text.class,
            compressionCodec != null, tableProperties, new Progressable() {
                @Override
                public void progress() {
                }
            });

    try {
        serDe.initialize(new Configuration(), tableProperties);

        SettableStructObjectInspector objectInspector = getStandardStructObjectInspector(COLUMN_NAMES,
                FIELD_INSPECTORS);
        Object row = objectInspector.create();

        List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());

        for (int rowNumber = 0; rowNumber < NUM_ROWS; rowNumber++) {
            for (int i = 0; i < TEST_VALUES.size(); i++) {
                Object key = TEST_VALUES.get(i).getKey();
                if (key instanceof Slice) {
                    key = ((Slice) key).getBytes();
                }
                objectInspector.setStructFieldData(row, fields.get(i), key);
            }

            Writable record = serDe.serialize(row, objectInspector);
            recordWriter.write(record);
        }
    } finally {
        recordWriter.close(false);
    }

    Path path = new Path(filePath);
    path.getFileSystem(new Configuration()).setVerifyChecksum(true);
    File file = new File(filePath);
    return new FileSplit(path, 0, file.length(), new String[0]);
}

From source file:com.facebook.presto.hive.BenchmarkHiveFileFormats.java

License:Apache License

public static RecordWriter createRecordWriter(List<? extends TpchColumn<?>> columns, File outputFile,
        HiveOutputFormat<?, ?> outputFormat, CompressionType compressionCodec) throws Exception {
    JobConf jobConf = new JobConf();
    ReaderWriterProfiler.setProfilerOptions(jobConf);
    if (compressionCodec != CompressionType.none) {
        CompressionCodec codec = new CompressionCodecFactory(new Configuration())
                .getCodecByName(compressionCodec.toString());
        jobConf.set(COMPRESS_CODEC, codec.getClass().getName());
        jobConf.set(COMPRESS_TYPE, org.apache.hadoop.io.SequenceFile.CompressionType.BLOCK.toString());
        jobConf.set("parquet.compression", compressionCodec.toString());
        jobConf.set("parquet.enable.dictionary", "true");
        switch (compressionCodec) {
        case gzip:
            jobConf.set("hive.exec.orc.default.compress", "ZLIB");
            jobConf.set("hive.exec.orc.compress", "ZLIB");
            break;
        case snappy:
            jobConf.set("hive.exec.orc.default.compress", "SNAPPY");
            jobConf.set("hive.exec.orc.compress", "SNAPPY");
            break;
        default:
            throw new IllegalArgumentException("Unsupported compression codec: " + compressionCodec);
        }
    } else {
        jobConf.set("parquet.enable.dictionary", "true");
        jobConf.set("hive.exec.orc.default.compress", "NONE");
        jobConf.set("hive.exec.orc.compress", "NONE");
    }

    RecordWriter recordWriter = outputFormat.getHiveRecordWriter(jobConf, new Path(outputFile.toURI()),
            Text.class, compressionCodec != CompressionType.none, createTableProperties(columns),
            new Progressable() {
                @Override
                public void progress() {
                }
            });

    return recordWriter;
}

From source file:com.facebook.presto.hive.HiveRecordSet.java

License:Apache License

private static RecordReader<?, ?> createRecordReader(HiveSplit split, Configuration configuration,
        Path wrappedPath) {
    final InputFormat<?, ?> inputFormat = getInputFormat(configuration, split.getSchema(), true);
    final JobConf jobConf = new JobConf(configuration);
    final FileSplit fileSplit = createFileSplit(wrappedPath, split.getStart(), split.getLength());

    // propagate serialization configuration to getRecordReader
    for (String name : split.getSchema().stringPropertyNames()) {
        if (name.startsWith("serialization.")) {
            jobConf.set(name, split.getSchema().getProperty(name));
        }
    }

    try {
        return retry().stopOnIllegalExceptions().run("createRecordReader", new Callable<RecordReader<?, ?>>() {
            @Override
            public RecordReader<?, ?> call() throws IOException {
                return inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
            }
        });
    } catch (Exception e) {
        throw new PrestoException(HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT.toErrorCode(),
                String.format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s",
                        split.getPath(), split.getStart(), split.getLength(),
                        getInputFormatName(split.getSchema()), e.getMessage()),
                e);
    }
}