Example usage for org.apache.hadoop.mapred.RecordWriter.write

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.RecordWriter.write.

Prototype

void write(K key, V value) throws IOException;

Document

Writes a key/value pair.
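
Before the project examples below, here is a minimal sketch of obtaining a RecordWriter directly from an old-API (org.apache.hadoop.mapred) output format and calling write on it. The output directory, task attempt id, and key/value types are illustrative assumptions; the two configuration properties mirror the setup used in the CorcOutputFormatTest example further down this page.

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;

public class RecordWriterWriteSketch {
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        // Illustrative settings: outside a running task, an output directory and a
        // task attempt id must be configured so the format can resolve its task output path.
        job.set("mapreduce.output.fileoutputformat.outputdir", "/tmp/recordwriter-demo");
        job.set("mapreduce.task.attempt.id", "attempt_x_0001_m_000001_1");

        FileSystem fs = FileSystem.get(job);
        TextOutputFormat<Text, IntWritable> outputFormat = new TextOutputFormat<Text, IntWritable>();
        RecordWriter<Text, IntWritable> writer = outputFormat.getRecordWriter(fs, job, "part-00000",
                Reporter.NULL);
        try {
            // write(K key, V value) emits one key/value pair per call.
            writer.write(new Text("alpha"), new IntWritable(1));
            writer.write(new Text("beta"), new IntWritable(2));
        } finally {
            // close(Reporter) flushes buffered output and releases the underlying stream.
            writer.close(Reporter.NULL);
        }
    }
}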

Usage

From source file: HiveKeyIgnoringBAMOutputFormat.java

License: Open Source License

@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf job, Path finalOutPath,
        final Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
        Progressable progress) throws IOException {
    setSAMHeaderFrom(job);

    final FakeTaskAttemptContext ctx = new FakeTaskAttemptContext(job);

    final org.apache.hadoop.mapreduce.RecordWriter<Writable, SAMRecordWritable> wrappedRecordWriter = wrappedOutputFormat
            .getRecordWriter(ctx, finalOutPath);

    return new FileSinkOperator.RecordWriter() {
        @Override
        public void write(Writable rec) throws IOException {
            try {
                wrappedRecordWriter.write(null, (SAMRecordWritable) rec);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public void close(boolean abort) throws IOException {
            try {
                wrappedRecordWriter.close(ctx);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    };
}

From source file: HiveKeyIgnoringBAMOutputFormat.java

License: Open Source License

@Override
public RecordWriter<Writable, SAMRecordWritable> getRecordWriter(FileSystem fs, JobConf job, String name,
        Progressable progress) throws IOException {
    setSAMHeaderFrom(job);

    final FakeTaskAttemptContext ctx = new FakeTaskAttemptContext(job);

    final org.apache.hadoop.mapreduce.RecordWriter<Writable, SAMRecordWritable> wrappedRecordWriter = wrappedOutputFormat
            .getRecordWriter(ctx, FileOutputFormat.getTaskOutputPath(job, name));

    return new RecordWriter<Writable, SAMRecordWritable>() {
        @Override
        public void write(Writable ignored, SAMRecordWritable rec) throws IOException {
            try {
                wrappedRecordWriter.write(ignored, rec);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public void close(Reporter reporter) throws IOException {
            try {
                wrappedRecordWriter.close(ctx);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    };
}

From source file: cn.spark.Case.MyMultipleOutputFormat.java

License: Apache License

/**
 * Create a composite record writer that can write key/value data to
 * different output files
 * 
 * @param fs
 *            the file system to use
 * @param job
 *            the job conf for the job
 * @param name
 *            the leaf file name for the output file (such as "part-00000")
 * @param arg3
 *            a progressable for reporting progress.
 * @return a composite record writer
 * @throws IOException
 */
public RecordWriter<K, V> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable arg3)
        throws IOException {

    final FileSystem myFS = fs;
    final String myName = generateLeafFileName(name);
    final JobConf myJob = job;
    final Progressable myProgressable = arg3;

    return new RecordWriter<K, V>() {

        // a cache storing the record writers for different output files.
        TreeMap<String, RecordWriter<K, V>> recordWriters = new TreeMap<String, RecordWriter<K, V>>();

        public void write(K key, V value) throws IOException {

            // get the file name based on the key
            String keyBasedPath = generateFileNameForKeyValue(key, value, myName);

            // get the file name based on the input file name
            String finalPath = getInputFileBasedOutputFileName(myJob, keyBasedPath);

            // get the actual key (this implementation passes null, so the original key is dropped)
            K actualKey = generateActualKey(null, value);
            V actualValue = generateActualValue(key, value);

            RecordWriter<K, V> rw = this.recordWriters.get(finalPath);
            if (rw == null) {
                // if we don't yet have a record writer for the final path,
                // create one and add it to the cache
                rw = getBaseRecordWriter(myFS, myJob, finalPath, myProgressable);
                this.recordWriters.put(finalPath, rw);
            }
            rw.write(actualKey, actualValue);
        };

        public void close(Reporter reporter) throws IOException {
            Iterator<String> keys = this.recordWriters.keySet().iterator();
            while (keys.hasNext()) {
                RecordWriter<K, V> rw = this.recordWriters.get(keys.next());
                rw.close(reporter);
            }
            this.recordWriters.clear();
        };
    };
}
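
A hedged usage sketch, not taken from the cn.spark.Case project: assuming MyMultipleOutputFormat above is (or is completed as) a concrete class implementing getBaseRecordWriter, a composite output format like this would typically be wired into a job as shown below. Paths, the input format, and key/value classes are placeholders for illustration.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;

public class MultipleOutputDemo {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        job.setJobName("multiple-output-demo");
        // Tab-separated Text key/value input, passed through the default identity mapper/reducer.
        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // The composite format shown above: each record is routed to a per-key output file.
        job.setOutputFormat(MyMultipleOutputFormat.class);
        // Placeholder paths for this sketch.
        FileInputFormat.setInputPaths(job, new Path("/tmp/in"));
        FileOutputFormat.setOutputPath(job, new Path("/tmp/out"));
        JobClient.runJob(job);
    }
}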

From source file: com.aliyun.openservices.tablestore.hive.TableStoreOutputFormat.java

License: Apache License

@Override
public RecordWriter<Writable, BatchWriteWritable> getRecordWriter(FileSystem ignored, JobConf job, String name,
        Progressable progress) throws IOException {
    String table = job.get(TableStoreConsts.TABLE_NAME);
    Configuration conf = translateConfig(job);
    SyncClientInterface ots = TableStore.newOtsClient(conf);
    final org.apache.hadoop.mapreduce.RecordWriter<Writable, BatchWriteWritable> writer = new TableStoreRecordWriter(
            ots, table);
    return new org.apache.hadoop.mapred.RecordWriter<Writable, BatchWriteWritable>() {
        @Override
        public void write(Writable any, BatchWriteWritable rows) throws IOException {
            try {
                writer.write(any, rows);
            } catch (InterruptedException ex) {
                throw new IOException("interrupted");
            }
        }

        @Override
        public void close(Reporter reporter) throws IOException {
            try {
                writer.close(null);
            } catch (InterruptedException ex) {
                throw new IOException("interrupted");
            }
        }
    };
}

From source file: com.facebook.hive.orc.TestInputOutputFormat.java

License: Apache License

@Test
public void testMROutput() throws Exception {
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(1, 2, 3), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(4, 5, 6), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(7, 8, 9), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "z,r");
    properties.setProperty("columns.types", "int:struct<x:int,y:int>");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    conf.set("hive.io.file.readcolumn.ids", "1");
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    StructObjectInspector inner = (StructObjectInspector) fields.get(1).getFieldObjectInspector();
    List<? extends StructField> inFields = inner.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) inFields.get(0).getFieldObjectInspector();
    while (reader.next(key, value)) {
        assertEquals(null, inspector.getStructFieldData(value, fields.get(0)));
        Object sub = inspector.getStructFieldData(value, fields.get(1));
        assertEquals(3 * rowNum + 1, intInspector.get(inner.getStructFieldData(sub, inFields.get(0))));
        assertEquals(3 * rowNum + 2, intInspector.get(inner.getStructFieldData(sub, inFields.get(1))));
        rowNum += 1;
    }
    assertEquals(3, rowNum);
    reader.close();
}

From source file: com.facebook.hive.orc.TestInputOutputFormat.java

License: Apache License

@Test
public void testMROutput2() throws Exception {
    JobConf job = new JobConf(conf);
    // Test that you can set the output directory using this config
    job.set("mapred.work.output.dir", testFilePath.getParent().toString());
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, job, testFilePath.getName(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new StringRow("a"), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "col");
    properties.setProperty("columns.types", "string");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    reader.next(key, value);
    assertEquals("a", ((StringObjectInspector) fields.get(0).getFieldObjectInspector())
            .getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    reader.close();

}

From source file: com.facebook.hive.orc.TestInputOutputFormat.java

License: Apache License

/**
 * Tests that passing null as the file system to getRecordWriter works; this is
 * to be compatible with the way SequenceFile and RCFile tolerate nulls.
 * @throws Exception
 */
@Test
public void testNullFileSystem() throws Exception {
    conf.set("mapred.work.output.dir", testFilePath.getParent().toString());
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    OrcSerde serde = new OrcSerde();
    OrcOutputFormat outFormat = new OrcOutputFormat();
    RecordWriter<NullWritable, OrcSerdeRow> writer = outFormat.getRecordWriter(null, conf,
            testFilePath.getName(), Reporter.NULL);

    writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("a"), inspector));
    writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("b"), inspector));
    writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("c"), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "str,str2");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    OrcInputFormat in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // read the whole file
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcLazyRow> reader = in.getRecordReader(splits[0], conf,
            Reporter.NULL);
    NullWritable key = reader.createKey();
    OrcLazyRow value = (OrcLazyRow) reader.createValue();
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    StringObjectInspector strInspector = (StringObjectInspector) fields.get(0).getFieldObjectInspector();
    assertEquals(true, reader.next(key, value));
    assertEquals("a", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("b", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("c", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(false, reader.next(key, value));
    reader.close();
}

From source file: com.flaptor.hounder.crawler.Nutch9Fetcher.java

License: Apache License

/**
 * Create a nutch fetchlist segment from the provided list of pages.
 * @param fetchlist the list of pages from which to build the segment.
 */
private String buildSegment(FetchList fetchlist) throws IOException {
    // create the segment dir
    String segmentDir = getNewSegmentDir();
    Path output = new Path(segmentDir, CrawlDatum.GENERATE_DIR_NAME);
    JobConf job = new JobConf();
    job.setOutputPath(output);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    // job.setOutputFormat(SequenceFileOutputFormat.class);
    // job.setOutputKeyComparatorClass(HashComparator.class);
    RecordWriter writer = new SequenceFileOutputFormat().getRecordWriter(null, job, "fetcher",
            new NoProgress());
    for (com.flaptor.hounder.crawler.pagedb.Page page : fetchlist) {
        Text key = new Text(page.getUrl());
        CrawlDatum value = new CrawlDatum(); // TODO: try taking this line outside of the loop
        writer.write(key, value);
    }
    writer.close(null);
    return segmentDir;
}

From source file: com.hdfs.concat.crush.CrushReducer.java

License: Apache License

@Override
public void reduce(Text bucketId, Iterator<Text> values, OutputCollector<Text, Text> collector,
        Reporter reporter) throws IOException {
    String bucket = bucketId.toString();

    String dirName = bucket.substring(0, bucket.lastIndexOf('-'));

    int idx = findMatcher(dirName);

    String outputFileName = calculateOutputFile(idx, dirName);

    /*
     * Don't need to separate the paths because the output file name is already absolute.
     */
    valueOut.set(outDirPath + outputFileName);

    LOG.info(format("Crushing bucket '%s' to file '%s'", bucket, outputFileName));

    /*
     * Strip the leading slash to make the path relative. The output format will relativize it to the task attempt work dir.
     */
    RecordWriter<Object, Object> sink = null;
    Exception rootCause = null;

    Object key = null;
    Object value = null;

    try {
        while (null == rootCause && values.hasNext()) {
            Text srcFile = values.next();
            Path inputPath = new Path(srcFile.toString());

            RecordReader<Object, Object> reader = createRecordReader(idx, inputPath, reporter);

            try {
                if (null == key) {
                    key = reader.createKey();
                    value = reader.createValue();

                    /*
                     * Set the key and value class in the conf, which the output format uses to get type information.
                     */
                    job.setOutputKeyClass(key.getClass());
                    job.setOutputValueClass(value.getClass());

                    /*
                     * Output file name is absolute so we can just add it to the crush prefix.
                     */
                    sink = createRecordWriter(idx, "crush" + outputFileName);
                } else {

                    Class<?> other = reader.createKey().getClass();

                    if (!(key.getClass().equals(other))) {
                        throw new IllegalArgumentException(format("Heterogeneous keys detected in %s: %s != %s",
                                inputPath, key.getClass(), other));
                    }

                    other = reader.createValue().getClass();

                    if (!value.getClass().equals(other)) {
                        throw new IllegalArgumentException(
                                format("Heterogeneous values detected in %s: %s !- %s", inputPath,
                                        value.getClass(), other));
                    }
                }

                while (reader.next(key, value)) {
                    sink.write(key, value);
                    reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1);
                }
            } catch (Exception e) {
                rootCause = e;
            } finally {
                try {
                    reader.close();
                } catch (Exception e) {
                    if (null == rootCause) {
                        rootCause = e;
                    } else {
                        LOG.debug("Swallowing exception on close of " + inputPath, e);
                    }
                }
            }

            /*
             * Output of the reducer is the source file => crushed file (in the final output dir, not the task attempt work dir).
             */
            collector.collect(srcFile, valueOut);
            reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1);

            recordNumber++;

            if (reportRecordNumber == recordNumber) {
                reportRecordNumber += reportRecordNumber;

                reporter.setStatus(format("Processed %,d files %s : %s", recordNumber, bucket, inputPath));
            }
        }
    } catch (Exception e) {
        rootCause = e;
    } finally {
        if (null != sink) {
            try {
                sink.close(reporter);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }

        /*
         * Let the exception bubble up with a minimum of wrapping.
         */
        if (null != rootCause) {
            if (rootCause instanceof RuntimeException) {
                throw (RuntimeException) rootCause;
            }

            if (rootCause instanceof IOException) {
                throw (IOException) rootCause;
            }

            throw new RuntimeException(rootCause);
        }
    }
}

From source file: com.hotels.corc.mapred.CorcOutputFormatTest.java

License: Apache License

@Test
public void writer() throws IOException {
    File root = temporaryFolder.getRoot();
    conf.set("mapreduce.output.fileoutputformat.outputdir", root.getCanonicalPath());
    conf.set("mapreduce.task.attempt.id", "attempt_x_0001_m_000001_1");

    String name = "name";
    RecordWriter<NullWritable, Corc> writer = outputFormat.getRecordWriter(fileSystem, conf, name, progress);

    StructTypeInfo typeInfo = new StructTypeInfoBuilder().add("a", TypeInfoFactory.stringTypeInfo).build();

    Corc corc = new Corc(typeInfo, new DefaultConverterFactory());
    corc.set("a", "value");

    writer.write(NullWritable.get(), corc);

    writer.close(reporter);

    Path path = new Path(root.getCanonicalPath() + "/_temporary/0/_temporary/attempt_x_0001_m_000001_1/name");
    try (OrcReader reader = new OrcReader(conf, path)) {
        List<Object> next = reader.next();
        assertThat(next.size(), is(1));
        assertThat(next.get(0), is((Object) "value"));
        assertFalse(reader.hasNext());
    }
}