List of usage examples for org.apache.hadoop.mapred.OutputFormat.getRecordWriter
RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress)
throws IOException;
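Before the collected examples, here is a minimal sketch of what an implementation of this method typically looks like. The LineOutputFormat class, its tab-separated output, and the "mapred.output.dir" lookup below are illustrative assumptions for this sketch, not code from any of the projects listed here.

import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;

// Hypothetical OutputFormat that writes each key/value pair as a tab-separated line.
public class LineOutputFormat implements OutputFormat<Text, Text> {

    @Override
    public RecordWriter<Text, Text> getRecordWriter(FileSystem ignored, JobConf job, String name,
            Progressable progress) throws IOException {
        // Resolve the target file under the configured output directory; real FileOutputFormat
        // subclasses do this through task-attempt work paths, which this sketch skips.
        Path file = new Path(job.get("mapred.output.dir"), name);
        FileSystem fs = file.getFileSystem(job);
        final FSDataOutputStream out = fs.create(file, progress);

        return new RecordWriter<Text, Text>() {
            @Override
            public void write(Text key, Text value) throws IOException {
                out.writeBytes(key + "\t" + value + "\n");
            }

            @Override
            public void close(Reporter reporter) throws IOException {
                out.close();
            }
        };
    }

    @Override
    public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException {
        // No validation in this sketch; real formats typically verify the output path here.
    }
}

As the examples below show, callers usually obtain the format from the JobConf and pass either a FileSystem instance or null for the first argument, since FileOutputFormat-derived classes ignore it.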
From source file:cascading.hbase.HBaseTapCollector.java
License:Apache License
private void initialize() throws IOException {
    tap.sinkConfInit(hadoopFlowProcess, conf);
    OutputFormat outputFormat = conf.getOutputFormat();
    LOG.info("Output format class is: " + outputFormat.getClass().toString());
    writer = outputFormat.getRecordWriter(null, conf, tap.getIdentifier(), Reporter.NULL);
    sinkCall.setOutput(this);
}
From source file:cascading.tap.hadoop.io.TapOutputCollector.java
License:Open Source License
protected void initialize() throws IOException {
    tap.sinkConfInit(flowProcess, conf);

    OutputFormat outputFormat = asJobConfInstance(conf).getOutputFormat();

    // todo: use OutputCommitter class
    isFileOutputFormat = outputFormat instanceof FileOutputFormat;

    if (isFileOutputFormat) {
        Hadoop18TapUtil.setupJob(conf);
        Hadoop18TapUtil.setupTask(conf);

        int partition = conf.getInt("mapred.task.partition", conf.getInt("mapreduce.task.partition", 0));

        if (prefix != null)
            filename = String.format(filenamePattern, prefix, "/", partition, sequence);
        else
            filename = String.format(filenamePattern, "", "", partition, sequence);
    }

    LOG.info("creating path: {}", filename);

    writer = outputFormat.getRecordWriter(null, asJobConfInstance(conf), filename, getReporter());
}
From source file:cascading.tap.hadoop.TapCollector.java
License:Open Source License
private void initalize() throws IOException {
    tap.sinkInit(conf); // tap should not delete if called within a task

    OutputFormat outputFormat = conf.getOutputFormat();

    isFileOutputFormat = outputFormat instanceof FileOutputFormat;

    if (isFileOutputFormat) {
        Hadoop18TapUtil.setupJob(conf);

        if (prefix != null)
            filename = String.format(filenamePattern, prefix, "/", conf.getInt("mapred.task.partition", 0));
        else
            filename = String.format(filenamePattern, "", "", conf.getInt("mapred.task.partition", 0));

        Hadoop18TapUtil.setupTask(conf);
    }

    writer = outputFormat.getRecordWriter(null, conf, filename, Reporter.NULL);
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testMROutput() throws Exception {
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(1, 2, 3), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(4, 5, 6), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(7, 8, 9), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "z,r");
    properties.setProperty("columns.types", "int:struct<x:int,y:int>");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    conf.set("hive.io.file.readcolumn.ids", "1");
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    StructObjectInspector inner = (StructObjectInspector) fields.get(1).getFieldObjectInspector();
    List<? extends StructField> inFields = inner.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) inFields.get(0).getFieldObjectInspector();
    while (reader.next(key, value)) {
        assertEquals(null, inspector.getStructFieldData(value, fields.get(0)));
        Object sub = inspector.getStructFieldData(value, fields.get(1));
        assertEquals(3 * rowNum + 1, intInspector.get(inner.getStructFieldData(sub, inFields.get(0))));
        assertEquals(3 * rowNum + 2, intInspector.get(inner.getStructFieldData(sub, inFields.get(1))));
        rowNum += 1;
    }
    assertEquals(3, rowNum);
    reader.close();
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testMROutput2() throws Exception {
    JobConf job = new JobConf(conf);
    // Test that you can set the output directory using this config
    job.set("mapred.work.output.dir", testFilePath.getParent().toString());
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, job, testFilePath.getName(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new StringRow("a"), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "col");
    properties.setProperty("columns.types", "string");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    reader.next(key, value);
    assertEquals("a", ((StringObjectInspector) fields.get(0).getFieldObjectInspector())
            .getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    reader.close();
}
From source file:com.hdfs.concat.crush.CrushReducer.java
License:Apache License
/**
 * Returns a record writer that creates files in the task attempt work directory. Path must be relative!
 */
@SuppressWarnings("unchecked")
private RecordWriter<Object, Object> createRecordWriter(int idx, String path) throws IOException {
    Class<? extends OutputFormat<?, ?>> cls = (Class<? extends OutputFormat<?, ?>>) outFormatClsList.get(idx);

    try {
        OutputFormat<Object, Object> format = (OutputFormat<Object, Object>) cls.newInstance();
        return format.getRecordWriter(fs, job, path, null);
    } catch (RuntimeException e) {
        throw e;
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:com.m6d.filecrush.crush.CrushReducer.java
License:Apache License
/**
 * Returns a record writer that creates files in the task attempt work directory. Path must be relative!
 */
@SuppressWarnings("unchecked")
private RecordWriter<Object, Object> createRecordWriter(int idx, String path) throws IOException {
    Class<? extends OutputFormat<?, ?>> cls = (Class<? extends OutputFormat<?, ?>>) getOutputFormatClass(idx);

    try {
        OutputFormat<Object, Object> format = (OutputFormat<Object, Object>) cls.newInstance();
        return format.getRecordWriter(fs, job, path, null);
    } catch (RuntimeException e) {
        throw e;
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapred.java
License:Apache License
/**
 * Runs mapper for the single split.
 *
 * @param mapOutputAccumulator mapOutputAccumulator to use
 * @param split split to run on
 */
@Override
@SuppressWarnings("unchecked")
public void runSplit(final MapOutputAccumulator<OUTKEY, OUTVALUE> mapOutputAccumulator, Object split, int splitIndex)
        throws IOException, ClassNotFoundException, InterruptedException {
    JobConf jobConf = new JobConf(this.jobConf); //Clone JobConf to prevent unexpected task interaction

    TaskAttemptID taskAttemptID = TaskAttemptID
            .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobId, true, splitIndex));

    ReducerWrapperMapred.updateJobConf(jobConf, taskAttemptID, splitIndex);
    updateJobWithSplit(jobConf, split);

    InputFormat inputFormat = jobConf.getInputFormat();

    Reporter reporter = Reporter.NULL;

    //Create RecordReader
    org.apache.hadoop.mapred.RecordReader<INKEY, INVALUE> recordReader = inputFormat
            .getRecordReader((InputSplit) split, jobConf, reporter);

    //Make a mapper
    org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper;
    try {
        mapper = (org.apache.hadoop.mapred.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) mapperConstructor
                .newInstance();
        mapper.configure(jobConf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot instantiate mapper " + mapperConstructor.getDeclaringClass(), e);
    }

    //These are to support map only jobs which write output directly to HDFS.
    final RecordWriter outputRecordWriter;
    OutputCommitter outputCommitter = null;
    TaskAttemptContext taskAttemptContext = null;

    if (mapOnlyJob) {
        taskAttemptContext = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID);
        OutputFormat outputFormat = jobConf.getOutputFormat();
        FileSystem fs = FileSystem.get(jobConf);
        outputRecordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat
                .getRecordWriter(fs, jobConf, ReducerWrapperMapred.getOutputName(splitIndex), Reporter.NULL);
        outputCommitter = jobConf.getOutputCommitter();

        //Create task object so it can handle file format initialization
        //The MapTask is private in the Hadoop 1.x so we have to go through reflection.
        try {
            Class reduceTask = Class.forName("org.apache.hadoop.mapred.MapTask");
            Constructor reduceTaskConstructor = reduceTask.getDeclaredConstructor(String.class, TaskAttemptID.class,
                    int.class, JobSplit.TaskSplitIndex.class, int.class);
            reduceTaskConstructor.setAccessible(true);
            Task task = (Task) reduceTaskConstructor.newInstance(null, taskAttemptID, splitIndex,
                    new JobSplit.TaskSplitIndex(), 0);
            task.setConf(jobConf);
            task.initialize(jobConf, jobId, Reporter.NULL, false);
        } catch (Exception e) {
            throw new IOException("Cannot initialize MapTask", e);
        }
        outputCommitter.setupTask(taskAttemptContext);
    } else {
        outputRecordWriter = null;
    }

    OutputCollector<OUTKEY, OUTVALUE> outputCollector;

    if (!mapOnlyJob) {
        outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
            @Override
            public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
                try {
                    mapOutputAccumulator.combine(outkey, outvalue);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        };
    } else {
        outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
            @Override
            public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
                outputRecordWriter.write(outkey, outvalue);
            }
        };
    }

    INKEY key = recordReader.createKey();
    INVALUE value = recordReader.createValue();

    while (recordReader.next(key, value)) {
        mapper.map(key, value, outputCollector, reporter);
    }

    mapper.close();
    recordReader.close();

    if (mapOnlyJob) {
        outputRecordWriter.close(Reporter.NULL);
        outputCommitter.commitTask(taskAttemptContext);
    }
}
From source file:com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapred.java
License:Apache License
public ReducerWrapperMapred(HServerInvocationParameters invocationParameters, int hadoopPartition, int appId,
        int region, boolean sort) throws IOException, ClassNotFoundException, InterruptedException {
    this.invocationParameters = invocationParameters;
    JobConf jobConf = new JobConf((Configuration) invocationParameters.getConfiguration()); //Clone JobConf, so the temporary settings do not pollute other tasks

    LOG.info("Starting reducer:" + HadoopInvocationParameters.dumpConfiguration(jobConf));

    JobID jobID = (JobID) invocationParameters.getJobId();
    this.hadoopPartition = hadoopPartition;
    hadoopVersionSpecificCode = HadoopVersionSpecificCode.getInstance(invocationParameters.getHadoopVersion(),
            jobConf);

    TaskAttemptID taskAttemptID = TaskAttemptID
            .downgrade(hadoopVersionSpecificCode.createTaskAttemptId(jobID, false, hadoopPartition));

    updateJobConf(jobConf, taskAttemptID, region);

    context = hadoopVersionSpecificCode.createTaskAttemptContextMapred(jobConf, taskAttemptID);

    reducer = (org.apache.hadoop.mapred.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(jobConf.getReducerClass(), jobConf);

    reducer.configure(jobConf);

    OutputFormat outputFormat = jobConf.getOutputFormat();
    FileSystem fs = FileSystem.get(jobConf);
    recordWriter = (org.apache.hadoop.mapred.RecordWriter<OUTKEY, OUTVALUE>) outputFormat.getRecordWriter(fs,
            jobConf, getOutputName(hadoopPartition), Reporter.NULL);

    committer = jobConf.getOutputCommitter();

    //Create task object so it can handle file format initialization
    //The ReduceTask is private in the Hadoop 1.x so we have to go through reflection.
    try {
        Class reduceTask = Class.forName("org.apache.hadoop.mapred.ReduceTask");
        Constructor reduceTaskConstructor = reduceTask.getDeclaredConstructor(String.class, TaskAttemptID.class,
                int.class, int.class, int.class);
        reduceTaskConstructor.setAccessible(true);
        Task task = (Task) reduceTaskConstructor.newInstance(null, taskAttemptID, hadoopPartition, 0, 0);
        task.setConf(jobConf);
        task.initialize(jobConf, jobID, Reporter.NULL, false);
    } catch (Exception e) {
        throw new IOException("Cannot initialize ReduceTask", e);
    }

    committer.setupTask(context);

    Class<INKEY> keyClass = (Class<INKEY>) jobConf.getMapOutputKeyClass();
    WritableSerializerDeserializer<INKEY> firstKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    WritableSerializerDeserializer<INKEY> secondKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    Class<INVALUE> valueClass = (Class<INVALUE>) jobConf.getMapOutputValueClass();
    WritableSerializerDeserializer<INVALUE> valueSerializer = new WritableSerializerDeserializer<INVALUE>(
            valueClass, null);

    DataGridReaderParameters<INKEY, INVALUE> params = new DataGridReaderParameters<INKEY, INVALUE>(region,
            appId, HServerParameters.getSetting(REDUCE_USEMEMORYMAPPEDFILES, jobConf) > 0, firstKeySerializer,
            valueSerializer, invocationParameters.getSerializationMode(), secondKeySerializer, keyClass,
            valueClass, sort, HServerParameters.getSetting(REDUCE_CHUNKSTOREADAHEAD, jobConf),
            1024 * HServerParameters.getSetting(REDUCE_INPUTCHUNKSIZE_KB, jobConf),
            HServerParameters.getSetting(REDUCE_CHUNKREADTIMEOUT, jobConf));
    transport = DataGridChunkedCollectionReader.getGridReader(params);

    outputCollector = new OutputCollector<OUTKEY, OUTVALUE>() {
        @Override
        public void collect(OUTKEY outkey, OUTVALUE outvalue) throws IOException {
            recordWriter.write(outkey, outvalue);
        }
    };
}
From source file:com.twitter.maple.jdbc.JDBCTapCollector.java
License:Apache License
private void initialize() throws IOException {
    tap.sinkConfInit(hadoopFlowProcess, conf);
    OutputFormat outputFormat = conf.getOutputFormat();
    LOG.info("Output format class is: " + outputFormat.getClass().toString());
    writer = outputFormat.getRecordWriter(null, conf, tap.getIdentifier(), Reporter.NULL);
    sinkCall.setOutput(this);
}