List of usage examples for org.apache.hadoop.mapred JobConf set
public void set(String name, String value)
Set the value of the name property.
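Before the real-world examples below, here is a minimal, self-contained sketch of the call itself; the property name and value are made up for illustration:

import org.apache.hadoop.mapred.JobConf;

public class JobConfSetExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // store a string-valued property (hypothetical key and value)
        conf.set("my.app.greeting", "hello");
        // read it back, falling back to a default if it was never set
        System.out.println(conf.get("my.app.greeting", "not set"));
    }
}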
From source file:com.ebay.erl.mobius.core.JobSetup.java
License:Apache License
/**
 * specify the columns that a mapper needs to emit.
 */
public static void setupProjections(JobConf job, Dataset dataset, byte datasetID, Column... projections) {
    StringBuffer sortedColumns = new StringBuffer();

    // dedupe the projection input column name and then sort it.
    Set<String> uniqueColumnNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
    for (Column aProjection : projections) {
        uniqueColumnNames.add(aProjection.getInputColumnName());
    }

    Iterator<String> it = uniqueColumnNames.iterator();
    while (it.hasNext()) {
        sortedColumns.append(it.next());
        if (it.hasNext())
            sortedColumns.append(",");
    }
    job.set(datasetID + ".value.columns", sortedColumns.toString());

    // for Mapper only task
    StringBuffer originalOrder = new StringBuffer();
    for (int i = 0; i < projections.length; i++) {
        originalOrder.append(projections[i].getInputColumnName());
        if (i < projections.length - 1)
            originalOrder.append(",");
    }
    job.set(datasetID + ".columns.in.original.order", originalOrder.toString());
}
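The two properties written above are plain comma-joined strings, so any task-side code can recover them with the matching JobConf#get. The snippet below is only an illustration of that read-back (it is not Mobius's actual mapper code; the class name is invented):

import org.apache.hadoop.mapred.JobConf;

public class ProjectionLookup {
    // Reads back the column list written by setupProjections() for one dataset.
    public static String[] projectedColumns(JobConf job, byte datasetID) {
        // same key as job.set(datasetID + ".value.columns", ...) above
        String joined = job.get(datasetID + ".value.columns", "");
        return joined.isEmpty() ? new String[0] : joined.split(",");
    }
}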
From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java
License:Apache License
@Override
protected synchronized void submit() {
    JobConf jobConf = this.getJobConf();
    boolean isLocalHadoop = jobConf.get("mapred.job.tracker", "local").equals("local");

    // the default partitioner is {@link com.ebay.erl.mobius.core.datajoin.DataJoinKeyPartitioner},
    // which is hash based.
    //
    // If the user chooses the even partitioner, Mobius will use
    // {@link com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner}, a
    // sampling-based partitioner that attempts to balance the load
    // across reducers.
    String partitioner = jobConf.get("mobius.partitioner", "default");

    if (!isLocalHadoop && jobConf.getNumReduceTasks() != 0 && partitioner.equals("even")) {
        // this job needs reducers; sample the keys so that the load
        // on the reducers is distributed almost evenly.
        double freq = jobConf.getFloat("mobius.sampler.freq", 0.1F);
        int numSamples = jobConf.getInt("mobius.sampler.num.samples", 50000);
        int maxSplits = jobConf.getInt("mobius.sampler.max.slipts.sampled", 5);

        // log sampling parameters so that the user knows.
        LOGGER.info("Sampling parameters { " + "mobius.sampler.freq:" + format.format(freq) + ", "
                + "mobius.sampler.num.samples:" + numSamples + ", "
                + "mobius.sampler.max.slipts.sampled:" + maxSplits + "}");

        InputSampler.Sampler<?, ?> sampler = new MobiusInputSampler(freq, numSamples, maxSplits);

        writePartitionFile(jobConf, sampler);

        // add to distributed cache
        try {
            URI partitionUri = new URI(TotalOrderPartitioner.getPartitionFile(jobConf) + "#_partitions");
            LOGGER.info("Adding partition uri to distributed cache:" + partitionUri.toString());

            DistributedCache.addCacheFile(partitionUri, jobConf);
            DistributedCache.createSymlink(jobConf);
            jobConf.setPartitionerClass(EvenlyPartitioner.class);

            LOGGER.info("Using " + EvenlyPartitioner.class.getCanonicalName()
                    + " to partition the keys evenly among reducers.");
        } catch (URISyntaxException e) {
            LOGGER.error(e.getMessage(), e);
            throw new RuntimeException(e);
        }

        // adding -XX:-UseParallelOldGC, this will automatically set -XX:-UseParallelGC
        // according to Oracle's specification
        String jvmOpts = jobConf.get("mapred.child.java.opts", "");
        if (jvmOpts.isEmpty()) {
            jvmOpts = "-XX:-UseParallelOldGC";
        } else {
            if (jvmOpts.indexOf("-XX:-UseParallelOldGC") < 0) {
                // remove double quotes
                jvmOpts = jvmOpts.replaceAll("\"", "");
                jvmOpts = jvmOpts.concat(" -XX:-UseParallelOldGC");
            }
        }
        jobConf.set("mapred.child.java.opts", jvmOpts);

        this.setJobConf(jobConf);
    }
    LOGGER.info("Submitting job:" + jobConf.getJobName());
    super.submit();
}
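The sampling branch above only runs when a job opts in through the configuration. A rough sketch of that opt-in, using the property names read by submit() (the helper class itself is hypothetical):

import org.apache.hadoop.mapred.JobConf;

public class EvenPartitionerOptIn {
    public static void configure(JobConf jobConf) {
        // switch from the default hash-based partitioner to the sampling-based one
        jobConf.set("mobius.partitioner", "even");
        // optional sampling knobs; the values shown are the defaults used in submit()
        jobConf.setFloat("mobius.sampler.freq", 0.1f);
        jobConf.setInt("mobius.sampler.num.samples", 50000);
        jobConf.setInt("mobius.sampler.max.slipts.sampled", 5); // key spelled as in the source above
    }
}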
From source file:com.ebay.erl.mobius.core.mapred.MobiusInputSampler.java
License:Apache License
@Override
public Object[] getSample(InputFormat inf, JobConf job) throws IOException {
    // the following code is copied from {@link InputSampler#RandomSampler},
    // but requires some modifications.
    InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
    ArrayList<DataJoinKey> samples = new ArrayList<DataJoinKey>(this.numSamples);
    int splitsToSample = Math.min(this.maxSplitsSampled, splits.length);

    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    // get Sorters
    Sorter[] sorters = null;
    if (job.get(ConfigureConstants.SORTERS, null) != null) {
        // total sort job
        sorters = (Sorter[]) SerializableUtil.deserializeFromBase64(job.get(ConfigureConstants.SORTERS), job);
    } else {
        // there is no sorter, should be a reducer/join job
        Column[] keys = (Column[]) SerializableUtil
                .deserializeFromBase64(job.get(ConfigureConstants.ALL_GROUP_KEY_COLUMNS), job);
        sorters = new Sorter[keys.length];
        for (int i = 0; i < keys.length; i++) {
            sorters[i] = new Sorter(keys[i].getInputColumnName(), Ordering.ASC);
        }
    }

    long proportion = 10L;
    while ((int) (this.freq * proportion) == 0) {
        proportion = proportion * 10;
    }
    proportion = 5L * proportion;

    // shuffle splits
    for (int i = 0; i < splits.length; ++i) {
        InputSplit tmp = splits[i];
        int j = r.nextInt(splits.length);
        splits[i] = splits[j];
        splits[j] = tmp;
    }

    SamplingOutputCollector collector = new SamplingOutputCollector();
    for (int i = 0; i < splitsToSample || (i < splits.length && samples.size() < numSamples); i++) {
        LOGGER.info("Sampling from split #" + (i + 1) + ", collected samples:" + samples.size());
        RecordReader<WritableComparable, WritableComparable> reader = inf.getRecordReader(splits[i], job,
                Reporter.NULL);
        WritableComparable key = reader.createKey();
        WritableComparable value = reader.createValue();

        if (!(inf instanceof MobiusDelegatingInputFormat)) {
            // not the Mobius delegating input format, so the CURRENT_DATASET_ID
            // will not be set by inf#getRecordReader; we set it here.
            //
            // set the current dataset id, as the AbstractMobiusMapper#configure
            // method needs this property.
            job.set(ConfigureConstants.CURRENT_DATASET_ID, job.get(ConfigureConstants.ALL_DATASET_IDS));
        }
        Byte datasetID = Byte.valueOf(job.get(ConfigureConstants.CURRENT_DATASET_ID));
        LOGGER.info("Samples coming from dataset: " + datasetID.toString());

        AbstractMobiusMapper mapper = this.getMapper(inf, splits[i], job);
        mapper.configure(job);

        // reading elements from one split
        long readElement = 0;
        while (reader.next(key, value)) {
            collector.clear();
            Tuple tuple = mapper.parse(key, value);

            readElement++;
            if (readElement > (((long) numSamples) * ((long) proportion))) {
                // a split might be very big (ex: a large gz file),
                // so we read at most (numSamples * proportion) records from it.
                break;
            }

            if (r.nextDouble() <= freq) {
                if (samples.size() < numSamples) {
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    // joinmap function might generate more than one output key
                    // per <code>key</code> input.
                    for (Tuple t : collector.getOutKey()) {
                        Tuple mt = Tuple.merge(tuple, t);
                        DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                        samples.add(nkey);
                    }
                } else {
                    // When exceeding the maximum number of samples, replace
                    // a random element with this one, then adjust the
                    // frequency to reflect the possibility of existing
                    // elements being pushed out
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    for (Tuple t : collector.getOutKey()) {
                        int ind = r.nextInt(numSamples);
                        if (ind != numSamples) {
                            Tuple mt = Tuple.merge(tuple, t);
                            DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                            samples.set(ind, nkey);
                        }
                    }
                    freq *= (numSamples - collector.getOutKey().size()) / (double) numSamples;
                }
                key = reader.createKey();
                value = reader.createValue();
            }
        }
        reader.close();
    }
    LOGGER.info("Samples have been collected, return.");
    return samples.toArray();
}
From source file:com.ebay.erl.mobius.core.mapred.MobiusMultiInputs.java
License:Apache License
public static void addInputPath(JobConf conf, Path anInput, Class<? extends InputFormat> inputFormatClass,
        Class<? extends AbstractMobiusMapper> mapperClass, byte datasetID, FileSystem fs) throws IOException {
    MultipleInputs.addInputPath(conf, anInput, inputFormatClass, mapperClass);

    // override the {@link InputFormat} class set by {@link MultipleInputs},
    // as Mobius needs to set the current dataset id per input split.
    conf.setInputFormat(MobiusDelegatingInputFormat.class);
    // MobiusDelegatingInputFormat extends DelegatingInputFormat, which always
    // calls FileInputFormat#setInputPaths within DelegatingInputFormat#getSplits
    // regardless of the actual type of <code>inputFormatClass</code>.

    /////////////////////////////////////////////////////
    // start to build the path to dataset ID mapping
    /////////////////////////////////////////////////////
    MultiInputsHelper helper = MultiInputsHelpersRepository.getInstance(conf).getHelper(inputFormatClass);
    URI uri = helper.getUniquePathByInputFormat(conf, anInput);

    String aPath = uri.toString();
    if (aPath.indexOf(";") >= 0)
        throw new IllegalArgumentException(aPath + " cannot contain a semicolon");

    // set the input path to datasetID mapping in the Hadoop configuration.
    if (conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING, "").isEmpty()) {
        conf.set(ConfigureConstants.INPUT_TO_DATASET_MAPPING, datasetID + ";" + aPath);
    } else {
        String previous = conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING);
        conf.set(ConfigureConstants.INPUT_TO_DATASET_MAPPING, datasetID + ";" + aPath + "," + previous);
    }

    //LOGGER.debug(conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING, ""));
}
From source file:com.ebay.erl.mobius.core.mapred.MultiInputsHelpersRepository.java
License:Apache License
public void writeToConf(JobConf conf) {
    Iterator<Entry<Class<? extends InputFormat>, MultiInputsHelper>> entries = this.mapping.entrySet()
            .iterator();

    while (entries.hasNext()) {
        Entry<Class<? extends InputFormat>, MultiInputsHelper> anEntry = entries.next();

        Class<? extends InputFormat> inputFormat = anEntry.getKey();
        Class<? extends MultiInputsHelper> helper = anEntry.getValue().getClass();

        if (!conf.get("mobius.multi.inputs.helpers", "").isEmpty()) {
            String others = conf.get("mobius.multi.inputs.helpers");
            conf.set("mobius.multi.inputs.helpers",
                    others + "," + inputFormat.getCanonicalName() + ":" + helper.getCanonicalName());
        } else {
            conf.set("mobius.multi.inputs.helpers",
                    inputFormat.getCanonicalName() + ":" + helper.getCanonicalName());
        }
    }
}
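Since writeToConf() serializes the mapping as comma-separated inputFormat:helper class-name pairs, reading it back is a simple string split. The parser below is only an illustrative sketch (class loading and instantiation are left out; the class name is invented):

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.mapred.JobConf;

public class HelperMappingParser {
    // Parses the "mobius.multi.inputs.helpers" value written by writeToConf().
    public static Map<String, String> parse(JobConf conf) {
        Map<String, String> inputFormatToHelper = new HashMap<String, String>();
        String raw = conf.get("mobius.multi.inputs.helpers", "");
        if (raw.isEmpty()) {
            return inputFormatToHelper;
        }
        for (String pair : raw.split(",")) {
            String[] parts = pair.split(":");
            inputFormatToHelper.put(parts[0], parts[1]); // inputFormat -> helper class name
        }
        return inputFormatToHelper;
    }
}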
From source file:com.ebay.erl.mobius.core.MobiusJob.java
License:Apache License
/**
 * Select the <code>columns</code> from the <code>dataset</code> and store
 * them in <code>outputFolder</code> using the given <code>outputFormat</code>.
 * <p>
 *
 * Here is an example:
 * <pre>
 * <code>
 * public class MyJob extends MobiusJob
 * {
 *     public void run(String[] args)
 *     {
 *         Dataset students = ...;
 *
 *         // save the result to $OUTPUT in SequenceFileOutputFormat,
 *         // the key will be NullWritable, and the value is a Tuple
 *         // which contains 3 columns, id, f_name and l_name.
 *         this.list(students,
 *             new Path("$OUTPUT"),
 *             SequenceFileOutputFormat.class,
 *             new Column(students, "id"),
 *             new Column(students, "f_name"),
 *             new Column(students, "l_name")
 *         );
 *     }
 *
 *     public static void main(String[] args) throws Exception
 *     {
 *         System.exit(MobiusJobRunner.run(new MyJob(), args));
 *     }
 * }
 * </code>
 * </pre>
 */
public Dataset list(Dataset dataset, Path outputFolder, Class<? extends FileOutputFormat> outputFormat,
        Column... columns) throws IOException {
    byte datasetID = 0; // set to 0 as there is only one dataset to be operated on.

    JobConf job = dataset.createJobConf(datasetID);

    job.set("mapred.job.name", "Listing " + dataset.getName());
    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(0); // list is a map-only job
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Tuple.class);
    job.setJobName("List " + dataset.getName());

    JobSetup.validateColumns(dataset, columns);
    JobSetup.setupInputs(job, dataset, datasetID);
    JobSetup.setupProjections(job, dataset, datasetID, columns);
    JobSetup.setupOutputs(job, outputFolder, outputFormat);

    this.addToExecQueue(job);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(this).getBuilder(outputFormat,
            "Dataset_" + outputFolder.getName());
    return builder.buildFromPreviousJob(job, outputFormat, Column.toSchemaArray(columns));
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testMROutput2() throws Exception {
    JobConf job = new JobConf(conf);
    // Test that you can set the output directory using this config
    job.set("mapred.work.output.dir", testFilePath.getParent().toString());
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, job, testFilePath.getName(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new StringRow("a"), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "col");
    properties.setProperty("columns.types", "string");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    reader.next(key, value);
    assertEquals("a", ((StringObjectInspector) fields.get(0).getFieldObjectInspector())
            .getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    reader.close();
}
From source file:com.facebook.presto.hive.AbstractTestHiveFileFormats.java
License:Apache License
public FileSplit createTestFile(String filePath, HiveOutputFormat<?, ?> outputFormat,
        @SuppressWarnings("deprecation") SerDe serDe, String compressionCodec) throws Exception {
    JobConf jobConf = new JobConf();
    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", COLUMN_NAMES_STRING);
    tableProperties.setProperty("columns.types", COLUMN_TYPES);
    serDe.initialize(new Configuration(), tableProperties);

    if (compressionCodec != null) {
        CompressionCodec codec = new CompressionCodecFactory(new Configuration())
                .getCodecByName(compressionCodec);
        jobConf.set(COMPRESS_CODEC, codec.getClass().getName());
        jobConf.set(COMPRESS_TYPE, SequenceFile.CompressionType.BLOCK.toString());
    }

    RecordWriter recordWriter = outputFormat.getHiveRecordWriter(jobConf, new Path(filePath), Text.class,
            compressionCodec != null, tableProperties, new Progressable() {
                @Override
                public void progress() {
                }
            });

    try {
        serDe.initialize(new Configuration(), tableProperties);

        SettableStructObjectInspector objectInspector = getStandardStructObjectInspector(COLUMN_NAMES,
                FIELD_INSPECTORS);
        Object row = objectInspector.create();

        List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());

        for (int rowNumber = 0; rowNumber < NUM_ROWS; rowNumber++) {
            for (int i = 0; i < TEST_VALUES.size(); i++) {
                Object key = TEST_VALUES.get(i).getKey();
                if (key instanceof Slice) {
                    key = ((Slice) key).getBytes();
                }
                objectInspector.setStructFieldData(row, fields.get(i), key);
            }

            Writable record = serDe.serialize(row, objectInspector);
            recordWriter.write(record);
        }
    } finally {
        recordWriter.close(false);
    }

    Path path = new Path(filePath);
    path.getFileSystem(new Configuration()).setVerifyChecksum(true);
    File file = new File(filePath);
    return new FileSplit(path, 0, file.length(), new String[0]);
}
From source file:com.facebook.presto.hive.BenchmarkHiveFileFormats.java
License:Apache License
public static RecordWriter createRecordWriter(List<? extends TpchColumn<?>> columns, File outputFile,
        HiveOutputFormat<?, ?> outputFormat, CompressionType compressionCodec) throws Exception {
    JobConf jobConf = new JobConf();
    ReaderWriterProfiler.setProfilerOptions(jobConf);
    if (compressionCodec != CompressionType.none) {
        CompressionCodec codec = new CompressionCodecFactory(new Configuration())
                .getCodecByName(compressionCodec.toString());
        jobConf.set(COMPRESS_CODEC, codec.getClass().getName());
        jobConf.set(COMPRESS_TYPE, org.apache.hadoop.io.SequenceFile.CompressionType.BLOCK.toString());
        jobConf.set("parquet.compression", compressionCodec.toString());
        jobConf.set("parquet.enable.dictionary", "true");
        switch (compressionCodec) {
            case gzip:
                jobConf.set("hive.exec.orc.default.compress", "ZLIB");
                jobConf.set("hive.exec.orc.compress", "ZLIB");
                break;
            case snappy:
                jobConf.set("hive.exec.orc.default.compress", "SNAPPY");
                jobConf.set("hive.exec.orc.compress", "SNAPPY");
                break;
            default:
                throw new IllegalArgumentException("Unsupported compression codec: " + compressionCodec);
        }
    } else {
        jobConf.set("parquet.enable.dictionary", "true");
        jobConf.set("hive.exec.orc.default.compress", "NONE");
        jobConf.set("hive.exec.orc.compress", "NONE");
    }

    RecordWriter recordWriter = outputFormat.getHiveRecordWriter(jobConf, new Path(outputFile.toURI()),
            Text.class, compressionCodec != CompressionType.none, createTableProperties(columns),
            new Progressable() {
                @Override
                public void progress() {
                }
            });

    return recordWriter;
}
From source file:com.facebook.presto.hive.HiveRecordSet.java
License:Apache License
private static RecordReader<?, ?> createRecordReader(HiveSplit split, Configuration configuration,
        Path wrappedPath) {
    final InputFormat<?, ?> inputFormat = getInputFormat(configuration, split.getSchema(), true);
    final JobConf jobConf = new JobConf(configuration);
    final FileSplit fileSplit = createFileSplit(wrappedPath, split.getStart(), split.getLength());

    // propagate serialization configuration to getRecordReader
    for (String name : split.getSchema().stringPropertyNames()) {
        if (name.startsWith("serialization.")) {
            jobConf.set(name, split.getSchema().getProperty(name));
        }
    }

    try {
        return retry().stopOnIllegalExceptions().run("createRecordReader", new Callable<RecordReader<?, ?>>() {
            @Override
            public RecordReader<?, ?> call() throws IOException {
                return inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
            }
        });
    } catch (Exception e) {
        throw new PrestoException(HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT.toErrorCode(),
                String.format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s",
                        split.getPath(), split.getStart(), split.getLength(),
                        getInputFormatName(split.getSchema()), e.getMessage()),
                e);
    }
}