List of usage examples for org.apache.hadoop.mapred RecordWriter write
void write(K key, V value) throws IOException;
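Before the collected examples, here is a minimal, self-contained sketch of the write(key, value) contract. It is not taken from any of the sources below; the file path, key/value types, and the trivial local-file writer are illustrative assumptions only.

import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;

public class RecordWriterSketch {

    // A trivial RecordWriter that appends "key<TAB>value" lines to a local file.
    static class LocalTextRecordWriter implements RecordWriter<LongWritable, Text> {
        private final BufferedWriter out;

        LocalTextRecordWriter(String path) throws IOException {
            this.out = Files.newBufferedWriter(Paths.get(path), StandardCharsets.UTF_8);
        }

        @Override
        public void write(LongWritable key, Text value) throws IOException {
            out.write(key.toString());
            out.write('\t');
            out.write(value.toString());
            out.newLine();
        }

        @Override
        public void close(Reporter reporter) throws IOException {
            out.close();
        }
    }

    public static void main(String[] args) throws IOException {
        // "/tmp/records.txt" is an assumed output location for the sketch.
        RecordWriter<LongWritable, Text> writer = new LocalTextRecordWriter("/tmp/records.txt");
        writer.write(new LongWritable(1L), new Text("first record"));
        writer.write(new LongWritable(2L), new Text("second record"));
        writer.close(Reporter.NULL);
    }
}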
From source file:HiveKeyIgnoringBAMOutputFormat.java
License:Open Source License
@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf job, Path finalOutPath,
        final Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
        Progressable progress) throws IOException {
    setSAMHeaderFrom(job);

    final FakeTaskAttemptContext ctx = new FakeTaskAttemptContext(job);
    final org.apache.hadoop.mapreduce.RecordWriter<Writable, SAMRecordWritable> wrappedRecordWriter =
            wrappedOutputFormat.getRecordWriter(ctx, finalOutPath);

    return new FileSinkOperator.RecordWriter() {
        @Override
        public void write(Writable rec) throws IOException {
            try {
                wrappedRecordWriter.write(null, (SAMRecordWritable) rec);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public void close(boolean abort) throws IOException {
            try {
                wrappedRecordWriter.close(ctx);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    };
}
From source file:HiveKeyIgnoringBAMOutputFormat.java
License:Open Source License
@Override
public RecordWriter<Writable, SAMRecordWritable> getRecordWriter(FileSystem fs, JobConf job, String name,
        Progressable progress) throws IOException {
    setSAMHeaderFrom(job);

    final FakeTaskAttemptContext ctx = new FakeTaskAttemptContext(job);
    final org.apache.hadoop.mapreduce.RecordWriter<Writable, SAMRecordWritable> wrappedRecordWriter =
            wrappedOutputFormat.getRecordWriter(ctx, FileOutputFormat.getTaskOutputPath(job, name));

    return new RecordWriter<Writable, SAMRecordWritable>() {
        @Override
        public void write(Writable ignored, SAMRecordWritable rec) throws IOException {
            try {
                wrappedRecordWriter.write(ignored, rec);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public void close(Reporter reporter) throws IOException {
            try {
                wrappedRecordWriter.close(ctx);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    };
}
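Both HiveKeyIgnoringBAMOutputFormat examples above (and the TableStoreOutputFormat example further down) share the same pattern: wrap a new-API org.apache.hadoop.mapreduce.RecordWriter inside an old-API org.apache.hadoop.mapred.RecordWriter, translating the InterruptedException that only the new API declares. The generic adapter below is a hypothetical sketch of that pattern, not code from any of the sources; it assumes a TaskAttemptContext is available to pass to the new API's close().

import java.io.IOException;

import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class MapredRecordWriterAdapter<K, V> implements org.apache.hadoop.mapred.RecordWriter<K, V> {

    private final org.apache.hadoop.mapreduce.RecordWriter<K, V> delegate;
    private final TaskAttemptContext context; // the new API's close() needs a context

    public MapredRecordWriterAdapter(org.apache.hadoop.mapreduce.RecordWriter<K, V> delegate,
            TaskAttemptContext context) {
        this.delegate = delegate;
        this.context = context;
    }

    @Override
    public void write(K key, V value) throws IOException {
        try {
            delegate.write(key, value);
        } catch (InterruptedException e) {
            // Re-assert the interrupt and surface it as an IOException, since the old API cannot throw it.
            Thread.currentThread().interrupt();
            throw new IOException(e);
        }
    }

    @Override
    public void close(Reporter reporter) throws IOException {
        try {
            delegate.close(context);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IOException(e);
        }
    }
}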
From source file:cn.spark.Case.MyMultipleOutputFormat.java
License:Apache License
/**
 * Create a composite record writer that can write key/value data to
 * different output files.
 *
 * @param fs   the file system to use
 * @param job  the job conf for the job
 * @param name the leaf file name for the output file (such as "part-00000")
 * @param arg3 a progressable for reporting progress
 * @return a composite record writer
 * @throws IOException
 */
public RecordWriter<K, V> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable arg3)
        throws IOException {
    final FileSystem myFS = fs;
    final String myName = generateLeafFileName(name);
    final JobConf myJob = job;
    final Progressable myProgressable = arg3;

    return new RecordWriter<K, V>() {

        // a cache storing the record writers for different output files.
        TreeMap<String, RecordWriter<K, V>> recordWriters = new TreeMap<String, RecordWriter<K, V>>();

        public void write(K key, V value) throws IOException {
            // get the file name based on the key
            String keyBasedPath = generateFileNameForKeyValue(key, value, myName);

            // get the file name based on the input file name
            String finalPath = getInputFileBasedOutputFileName(myJob, keyBasedPath);

            // get the actual key and value to emit
            K actualKey = generateActualKey(null, value);
            V actualValue = generateActualValue(key, value);

            RecordWriter<K, V> rw = this.recordWriters.get(finalPath);
            if (rw == null) {
                // if we don't have the record writer yet for the final path,
                // create one and add it to the cache
                rw = getBaseRecordWriter(myFS, myJob, finalPath, myProgressable);
                this.recordWriters.put(finalPath, rw);
            }
            rw.write(actualKey, actualValue);
        }

        public void close(Reporter reporter) throws IOException {
            Iterator<String> keys = this.recordWriters.keySet().iterator();
            while (keys.hasNext()) {
                RecordWriter<K, V> rw = this.recordWriters.get(keys.next());
                rw.close(reporter);
            }
            this.recordWriters.clear();
        }
    };
}
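This composite writer mirrors Hadoop's stock org.apache.hadoop.mapred.lib.MultipleOutputFormat, where subclasses control routing by overriding generateFileNameForKeyValue. As a hypothetical usage sketch (not part of the source above), a subclass of the stock MultipleTextOutputFormat could route each record into a directory named after its key:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;

public class KeyedTextOutputFormat extends MultipleTextOutputFormat<Text, Text> {
    @Override
    protected String generateFileNameForKeyValue(Text key, Text value, String name) {
        // e.g. key "2024-01-01" with leaf name "part-00000" -> "2024-01-01/part-00000"
        return key.toString() + "/" + name;
    }
}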
From source file:com.aliyun.openservices.tablestore.hive.TableStoreOutputFormat.java
License:Apache License
@Override
public RecordWriter<Writable, BatchWriteWritable> getRecordWriter(FileSystem ignored, JobConf job, String name,
        Progressable progress) throws IOException {
    String table = job.get(TableStoreConsts.TABLE_NAME);
    Configuration conf = translateConfig(job);
    SyncClientInterface ots = TableStore.newOtsClient(conf);
    final org.apache.hadoop.mapreduce.RecordWriter<Writable, BatchWriteWritable> writer =
            new TableStoreRecordWriter(ots, table);

    return new org.apache.hadoop.mapred.RecordWriter<Writable, BatchWriteWritable>() {
        @Override
        public void write(Writable any, BatchWriteWritable rows) throws IOException {
            try {
                writer.write(any, rows);
            } catch (InterruptedException ex) {
                throw new IOException("interrupted");
            }
        }

        @Override
        public void close(Reporter reporter) throws IOException {
            try {
                writer.close(null);
            } catch (InterruptedException ex) {
                throw new IOException("interrupted");
            }
        }
    };
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testMROutput() throws Exception {
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(1, 2, 3), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(4, 5, 6), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(7, 8, 9), inspector));
    writer.close(Reporter.NULL);

    serde = new OrcSerde();
    properties.setProperty("columns", "z,r");
    properties.setProperty("columns.types", "int:struct<x:int,y:int>");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    conf.set("hive.io.file.readcolumn.ids", "1");
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    StructObjectInspector inner = (StructObjectInspector) fields.get(1).getFieldObjectInspector();
    List<? extends StructField> inFields = inner.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) inFields.get(0).getFieldObjectInspector();
    while (reader.next(key, value)) {
        assertEquals(null, inspector.getStructFieldData(value, fields.get(0)));
        Object sub = inspector.getStructFieldData(value, fields.get(1));
        assertEquals(3 * rowNum + 1, intInspector.get(inner.getStructFieldData(sub, inFields.get(0))));
        assertEquals(3 * rowNum + 2, intInspector.get(inner.getStructFieldData(sub, inFields.get(1))));
        rowNum += 1;
    }
    assertEquals(3, rowNum);
    reader.close();
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testMROutput2() throws Exception {
    JobConf job = new JobConf(conf);
    // Test that you can set the output directory using this config
    job.set("mapred.work.output.dir", testFilePath.getParent().toString());
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, job, testFilePath.getName(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new StringRow("a"), inspector));
    writer.close(Reporter.NULL);

    serde = new OrcSerde();
    properties.setProperty("columns", "col");
    properties.setProperty("columns.types", "string");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    reader.next(key, value);
    assertEquals("a", ((StringObjectInspector) fields.get(0).getFieldObjectInspector())
            .getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    reader.close();
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
/**
 * Tests that passing null as the file system to getRecordWriter works; this is
 * to be compatible with the way Sequence and RC file tolerate nulls.
 * @throws Exception
 */
@Test
public void testNullFileSystem() throws Exception {
    conf.set("mapred.work.output.dir", testFilePath.getParent().toString());
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    OrcSerde serde = new OrcSerde();
    OrcOutputFormat outFormat = new OrcOutputFormat();
    RecordWriter<NullWritable, OrcSerdeRow> writer = outFormat.getRecordWriter(null, conf,
            testFilePath.getName(), Reporter.NULL);
    writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("a"), inspector));
    writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("b"), inspector));
    writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("c"), inspector));
    writer.close(Reporter.NULL);

    serde = new OrcSerde();
    properties.setProperty("columns", "str,str2");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    OrcInputFormat in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // read the whole file
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcLazyRow> reader = in.getRecordReader(splits[0], conf,
            Reporter.NULL);
    NullWritable key = reader.createKey();
    OrcLazyRow value = (OrcLazyRow) reader.createValue();
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    StringObjectInspector strInspector = (StringObjectInspector) fields.get(0).getFieldObjectInspector();
    assertEquals(true, reader.next(key, value));
    assertEquals("a", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("b", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("c", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(false, reader.next(key, value));
    reader.close();
}
From source file:com.flaptor.hounder.crawler.Nutch9Fetcher.java
License:Apache License
/**
 * Create a nutch fetchlist segment from the provided list of pages.
 * @param fetchlist the list of pages from which to build the segment.
 */
private String buildSegment(FetchList fetchlist) throws IOException {
    // create the segment dir
    String segmentDir = getNewSegmentDir();
    Path output = new Path(segmentDir, CrawlDatum.GENERATE_DIR_NAME);
    JobConf job = new JobConf();
    job.setOutputPath(output);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    // job.setOutputFormat(SequenceFileOutputFormat.class);
    // job.setOutputKeyComparatorClass(HashComparator.class);
    RecordWriter writer = new SequenceFileOutputFormat().getRecordWriter(null, job, "fetcher", new NoProgress());
    for (com.flaptor.hounder.crawler.pagedb.Page page : fetchlist) {
        Text key = new Text(page.getUrl());
        CrawlDatum value = new CrawlDatum(); // TODO: try taking this line outside of the loop
        writer.write(key, value);
    }
    writer.close(null);
    return segmentDir;
}
From source file:com.hdfs.concat.crush.CrushReducer.java
License:Apache License
@Override
public void reduce(Text bucketId, Iterator<Text> values, OutputCollector<Text, Text> collector, Reporter reporter)
        throws IOException {
    String bucket = bucketId.toString();
    String dirName = bucket.substring(0, bucket.lastIndexOf('-'));
    int idx = findMatcher(dirName);
    String outputFileName = calculateOutputFile(idx, dirName);

    /*
     * Don't need to separate the paths because the output file name is already absolute.
     */
    valueOut.set(outDirPath + outputFileName);

    LOG.info(format("Crushing bucket '%s' to file '%s'", bucket, outputFileName));

    /*
     * Strip the leading slash to make the path relative. The output format will relativize it to the task
     * attempt work dir.
     */
    RecordWriter<Object, Object> sink = null;
    Exception rootCause = null;

    Object key = null;
    Object value = null;

    try {
        while (null == rootCause && values.hasNext()) {
            Text srcFile = values.next();
            Path inputPath = new Path(srcFile.toString());
            RecordReader<Object, Object> reader = createRecordReader(idx, inputPath, reporter);

            try {
                if (null == key) {
                    key = reader.createKey();
                    value = reader.createValue();

                    /*
                     * Set the key and value class in the conf, which the output format uses to get type information.
                     */
                    job.setOutputKeyClass(key.getClass());
                    job.setOutputValueClass(value.getClass());

                    /*
                     * Output file name is absolute so we can just add it to the crush prefix.
                     */
                    sink = createRecordWriter(idx, "crush" + outputFileName);
                } else {
                    Class<?> other = reader.createKey().getClass();
                    if (!(key.getClass().equals(other))) {
                        throw new IllegalArgumentException(format("Heterogeneous keys detected in %s: %s != %s",
                                inputPath, key.getClass(), other));
                    }
                    other = reader.createValue().getClass();
                    if (!value.getClass().equals(other)) {
                        throw new IllegalArgumentException(format("Heterogeneous values detected in %s: %s != %s",
                                inputPath, value.getClass(), other));
                    }
                }

                while (reader.next(key, value)) {
                    sink.write(key, value);
                    reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1);
                }
            } catch (Exception e) {
                rootCause = e;
            } finally {
                try {
                    reader.close();
                } catch (Exception e) {
                    if (null == rootCause) {
                        rootCause = e;
                    } else {
                        LOG.debug("Swallowing exception on close of " + inputPath, e);
                    }
                }
            }

            /*
             * Output of the reducer is the source file => crushed file (in the final output dir, not the task
             * attempt work dir).
             */
            collector.collect(srcFile, valueOut);
            reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1);

            recordNumber++;
            if (reportRecordNumber == recordNumber) {
                reportRecordNumber += reportRecordNumber;
                reporter.setStatus(format("Processed %,d files %s : %s", recordNumber, bucket, inputPath));
            }
        }
    } catch (Exception e) {
        rootCause = e;
    } finally {
        if (null != sink) {
            try {
                sink.close(reporter);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }

        /*
         * Let the exception bubble up with a minimum of wrapping.
         */
        if (null != rootCause) {
            if (rootCause instanceof RuntimeException) {
                throw (RuntimeException) rootCause;
            }
            if (rootCause instanceof IOException) {
                throw (IOException) rootCause;
            }
            throw new RuntimeException(rootCause);
        }
    }
}
From source file:com.hotels.corc.mapred.CorcOutputFormatTest.java
License:Apache License
@Test
public void writer() throws IOException {
    File root = temporaryFolder.getRoot();
    conf.set("mapreduce.output.fileoutputformat.outputdir", root.getCanonicalPath());
    conf.set("mapreduce.task.attempt.id", "attempt_x_0001_m_000001_1");

    String name = "name";

    RecordWriter<NullWritable, Corc> writer = outputFormat.getRecordWriter(fileSystem, conf, name, progress);

    StructTypeInfo typeInfo = new StructTypeInfoBuilder().add("a", TypeInfoFactory.stringTypeInfo).build();
    Corc corc = new Corc(typeInfo, new DefaultConverterFactory());
    corc.set("a", "value");

    writer.write(NullWritable.get(), corc);
    writer.close(reporter);

    Path path = new Path(root.getCanonicalPath() + "/_temporary/0/_temporary/attempt_x_0001_m_000001_1/name");
    try (OrcReader reader = new OrcReader(conf, path)) {
        List<Object> next = reader.next();
        assertThat(next.size(), is(1));
        assertThat(next.get(0), is((Object) "value"));
        assertFalse(reader.hasNext());
    }
}