Example usage for org.apache.hadoop.mapred.RecordWriter.write

Introduction

On this page you can find example usage of org.apache.hadoop.mapred.RecordWriter.write.

Prototype

void write(K key, V value) throws IOException;

Document

Writes a key/value pair.
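
Before the project examples below, here is a minimal sketch of obtaining a RecordWriter directly from an old-API (org.apache.hadoop.mapred) output format and calling write on it. The output directory, task attempt id, and key/value types are illustrative assumptions; the two configuration properties mirror the setup used in the CorcOutputFormatTest example further down this page.

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;

public class RecordWriterWriteSketch {
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        // Illustrative settings: outside a running task, an output directory and a
        // task attempt id must be configured so the format can resolve its task output path.
        job.set("mapreduce.output.fileoutputformat.outputdir", "/tmp/recordwriter-demo");
        job.set("mapreduce.task.attempt.id", "attempt_x_0001_m_000001_1");

        FileSystem fs = FileSystem.get(job);
        TextOutputFormat<Text, IntWritable> outputFormat = new TextOutputFormat<Text, IntWritable>();
        RecordWriter<Text, IntWritable> writer = outputFormat.getRecordWriter(fs, job, "part-00000",
                Reporter.NULL);
        try {
            // write(K key, V value) emits one key/value pair per call.
            writer.write(new Text("alpha"), new IntWritable(1));
            writer.write(new Text("beta"), new IntWritable(2));
        } finally {
            // close(Reporter) flushes buffered output and releases the underlying stream.
            writer.close(Reporter.NULL);
        }
    }
}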

Usage

From source file: HiveKeyIgnoringBAMOutputFormat.java

License: Open Source License

@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf job, Path finalOutPath,
        final Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
        Progressable progress) throws IOException {
    setSAMHeaderFrom(job);

    final FakeTaskAttemptContext ctx = new FakeTaskAttemptContext(job);

    final org.apache.hadoop.mapreduce.RecordWriter<Writable, SAMRecordWritable> wrappedRecordWriter = wrappedOutputFormat
            .getRecordWriter(ctx, finalOutPath);

    return new FileSinkOperator.RecordWriter() {
        @Override
        public void write(Writable rec) throws IOException {
            try {
                wrappedRecordWriter.write(null, (SAMRecordWritable) rec);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public void close(boolean abort) throws IOException {
            try {
                wrappedRecordWriter.close(ctx);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    };
}

From source file: HiveKeyIgnoringBAMOutputFormat.java

License: Open Source License

@Override
public RecordWriter<Writable, SAMRecordWritable> getRecordWriter(FileSystem fs, JobConf job, String name,
        Progressable progress) throws IOException {
    setSAMHeaderFrom(job);

    final FakeTaskAttemptContext ctx = new FakeTaskAttemptContext(job);

    final org.apache.hadoop.mapreduce.RecordWriter<Writable, SAMRecordWritable> wrappedRecordWriter = wrappedOutputFormat
            .getRecordWriter(ctx, FileOutputFormat.getTaskOutputPath(job, name));

    return new RecordWriter<Writable, SAMRecordWritable>() {
        @Override
        public void write(Writable ignored, SAMRecordWritable rec) throws IOException {
            try {
                wrappedRecordWriter.write(ignored, rec);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public void close(Reporter reporter) throws IOException {
            try {
                wrappedRecordWriter.close(ctx);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    };
}

From source file: cn.spark.Case.MyMultipleOutputFormat.java

License: Apache License

/**
 * Create a composite record writer that can write key/value data to
 * different output files
 * 
 * @param fs
 *            the file system to use
 * @param job
 *            the job conf for the job
 * @param name
 *            the leaf file name for the output file (such as "part-00000")
 * @param arg3
 *            a progressable for reporting progress.
 * @return a composite record writer
 * @throws IOException
 */
public RecordWriter<K, V> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable arg3)
        throws IOException {

    final FileSystem myFS = fs;
    final String myName = generateLeafFileName(name);
    final JobConf myJob = job;
    final Progressable myProgressable = arg3;

    return new RecordWriter<K, V>() {

        // a cache storing the record writers for different output files.
        TreeMap<String, RecordWriter<K, V>> recordWriters = new TreeMap<String, RecordWriter<K, V>>();

        public void write(K key, V value) throws IOException {

            // get the file name based on the key
            String keyBasedPath = generateFileNameForKeyValue(key, value, myName);

            // get the file name based on the input file name
            String finalPath = getInputFileBasedOutputFileName(myJob, keyBasedPath);

            // get the actual key (this implementation passes null, so the original key is dropped)
            K actualKey = generateActualKey(null, value);
            V actualValue = generateActualValue(key, value);

            RecordWriter<K, V> rw = this.recordWriters.get(finalPath);
            if (rw == null) {
                // if we don't yet have a record writer for the final path,
                // create one and add it to the cache
                rw = getBaseRecordWriter(myFS, myJob, finalPath, myProgressable);
                this.recordWriters.put(finalPath, rw);
            }
            rw.write(actualKey, actualValue);
        };

        public void close(Reporter reporter) throws IOException {
            Iterator<String> keys = this.recordWriters.keySet().iterator();
            while (keys.hasNext()) {
                RecordWriter<K, V> rw = this.recordWriters.get(keys.next());
                rw.close(reporter);
            }
            this.recordWriters.clear();
        };
    };
}
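
A hedged usage sketch, not taken from the cn.spark.Case project: assuming MyMultipleOutputFormat above is (or is completed as) a concrete class implementing getBaseRecordWriter, a composite output format like this would typically be wired into a job as shown below. Paths, the input format, and key/value classes are placeholders for illustration.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;

public class MultipleOutputDemo {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        job.setJobName("multiple-output-demo");
        // Tab-separated Text key/value input, passed through the default identity mapper/reducer.
        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // The composite format shown above: each record is routed to a per-key output file.
        job.setOutputFormat(MyMultipleOutputFormat.class);
        // Placeholder paths for this sketch.
        FileInputFormat.setInputPaths(job, new Path("/tmp/in"));
        FileOutputFormat.setOutputPath(job, new Path("/tmp/out"));
        JobClient.runJob(job);
    }
}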

From source file: com.aliyun.openservices.tablestore.hive.TableStoreOutputFormat.java

License: Apache License

@Override
public RecordWriter<Writable, BatchWriteWritable> getRecordWriter(FileSystem ignored, JobConf job, String name,
        Progressable progress) throws IOException {
    String table = job.get(TableStoreConsts.TABLE_NAME);
    Configuration conf = translateConfig(job);
    SyncClientInterface ots = TableStore.newOtsClient(conf);
    final org.apache.hadoop.mapreduce.RecordWriter<Writable, BatchWriteWritable> writer = new TableStoreRecordWriter(
            ots, table);
    return new org.apache.hadoop.mapred.RecordWriter<Writable, BatchWriteWritable>() {
        @Override
        public void write(Writable any, BatchWriteWritable rows) throws IOException {
            try {
                writer.write(any, rows);
            } catch (InterruptedException ex) {
                throw new IOException("interrupted");
            }
        }

        @Override
        public void close(Reporter reporter) throws IOException {
            try {
                writer.close(null);
            } catch (InterruptedException ex) {
                throw new IOException("interrupted");
            }
        }
    };
}

From source file: com.facebook.hive.orc.TestInputOutputFormat.java

License: Apache License

@Test
public void testMROutput() throws Exception {
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(1, 2, 3), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(4, 5, 6), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(7, 8, 9), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "z,r");
    properties.setProperty("columns.types", "int:struct<x:int,y:int>");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    conf.set("hive.io.file.readcolumn.ids", "1");
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    StructObjectInspector inner = (StructObjectInspector) fields.get(1).getFieldObjectInspector();
    List<? extends StructField> inFields = inner.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) inFields.get(0).getFieldObjectInspector();
    while (reader.next(key, value)) {
        assertEquals(null, inspector.getStructFieldData(value, fields.get(0)));
        Object sub = inspector.getStructFieldData(value, fields.get(1));
        assertEquals(3 * rowNum + 1, intInspector.get(inner.getStructFieldData(sub, inFields.get(0))));
        assertEquals(3 * rowNum + 2, intInspector.get(inner.getStructFieldData(sub, inFields.get(1))));
        rowNum += 1;
    }
    assertEquals(3, rowNum);
    reader.close();
}

From source file: com.facebook.hive.orc.TestInputOutputFormat.java

License: Apache License

@Test
public void testMROutput2() throws Exception {
    JobConf job = new JobConf(conf);
    // Test that you can set the output directory using this config
    job.set("mapred.work.output.dir", testFilePath.getParent().toString());
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    RecordWriter writer = outFormat.getRecordWriter(fs, job, testFilePath.getName(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new StringRow("a"), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "col");
    properties.setProperty("columns.types", "string");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    reader.next(key, value);
    assertEquals("a", ((StringObjectInspector) fields.get(0).getFieldObjectInspector())
            .getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    reader.close();

}

From source file: com.facebook.hive.orc.TestInputOutputFormat.java

License: Apache License

/**
 * Tests that passing null as the file system to getRecordWriter works; this is
 * to be compatible with the way SequenceFile and RCFile tolerate nulls.
 * @throws Exception
 */
@Test
public void testNullFileSystem() throws Exception {
    conf.set("mapred.work.output.dir", testFilePath.getParent().toString());
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    OrcSerde serde = new OrcSerde();
    OrcOutputFormat outFormat = new OrcOutputFormat();
    RecordWriter<NullWritable, OrcSerdeRow> writer = outFormat.getRecordWriter(null, conf,
            testFilePath.getName(), Reporter.NULL);

    writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("a"), inspector));
    writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("b"), inspector));
    writer.write(NullWritable.get(), (OrcSerdeRow) serde.serialize(new StringRow("c"), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    properties.setProperty("columns", "str,str2");
    serde.initialize(conf, properties);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    OrcInputFormat in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // read the whole file
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcLazyRow> reader = in.getRecordReader(splits[0], conf,
            Reporter.NULL);
    NullWritable key = reader.createKey();
    OrcLazyRow value = (OrcLazyRow) reader.createValue();
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    StringObjectInspector strInspector = (StringObjectInspector) fields.get(0).getFieldObjectInspector();
    assertEquals(true, reader.next(key, value));
    assertEquals("a", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("b", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(true, reader.next(key, value));
    assertEquals("c", strInspector.getPrimitiveJavaObject(inspector.getStructFieldData(value, fields.get(0))));
    assertEquals(false, reader.next(key, value));
    reader.close();
}

From source file: com.flaptor.hounder.crawler.Nutch9Fetcher.java

License: Apache License

/**
 * Create a nutch fetchlist segment from the provided list of pages.
 * @param fetchlist the list of pages from which to build the segment.
 */
private String buildSegment(FetchList fetchlist) throws IOException {
    // create the segment dir
    String segmentDir = getNewSegmentDir();
    Path output = new Path(segmentDir, CrawlDatum.GENERATE_DIR_NAME);
    JobConf job = new JobConf();
    job.setOutputPath(output);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    // job.setOutputFormat(SequenceFileOutputFormat.class);
    // job.setOutputKeyComparatorClass(HashComparator.class);
    RecordWriter writer = new SequenceFileOutputFormat().getRecordWriter(null, job, "fetcher",
            new NoProgress());
    for (com.flaptor.hounder.crawler.pagedb.Page page : fetchlist) {
        Text key = new Text(page.getUrl());
        CrawlDatum value = new CrawlDatum(); // TODO: try taking this line outside of the loop
        writer.write(key, value);
    }
    writer.close(null);
    return segmentDir;
}

From source file: com.hdfs.concat.crush.CrushReducer.java

License: Apache License

@Override
public void reduce(Text bucketId, Iterator<Text> values, OutputCollector<Text, Text> collector,
        Reporter reporter) throws IOException {
    String bucket = bucketId.toString();

    String dirName = bucket.substring(0, bucket.lastIndexOf('-'));

    int idx = findMatcher(dirName);

    String outputFileName = calculateOutputFile(idx, dirName);

    /*
     * Don't need to separate the paths because the output file name is already absolute.
     */
    valueOut.set(outDirPath + outputFileName);

    LOG.info(format("Crushing bucket '%s' to file '%s'", bucket, outputFileName));

    /*
     * Strip the leading slash to make the path relative. The output format will relativize it to the task attempt work dir.
     */
    RecordWriter<Object, Object> sink = null;
    Exception rootCause = null;

    Object key = null;
    Object value = null;

    try {
        while (null == rootCause && values.hasNext()) {
            Text srcFile = values.next();
            Path inputPath = new Path(srcFile.toString());

            RecordReader<Object, Object> reader = createRecordReader(idx, inputPath, reporter);

            try {
                if (null == key) {
                    key = reader.createKey();
                    value = reader.createValue();

                    /*
                     * Set the key and value class in the conf, which the output format uses to get type information.
                     */
                    job.setOutputKeyClass(key.getClass());
                    job.setOutputValueClass(value.getClass());

                    /*
                     * Output file name is absolute so we can just add it to the crush prefix.
                     */
                    sink = createRecordWriter(idx, "crush" + outputFileName);
                } else {

                    Class<?> other = reader.createKey().getClass();

                    if (!(key.getClass().equals(other))) {
                        throw new IllegalArgumentException(format("Heterogeneous keys detected in %s: %s != %s",
                                inputPath, key.getClass(), other));
                    }

                    other = reader.createValue().getClass();

                    if (!value.getClass().equals(other)) {
                        throw new IllegalArgumentException(
                                format("Heterogeneous values detected in %s: %s !- %s", inputPath,
                                        value.getClass(), other));
                    }
                }

                while (reader.next(key, value)) {
                    sink.write(key, value);
                    reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1);
                }
            } catch (Exception e) {
                rootCause = e;
            } finally {
                try {
                    reader.close();
                } catch (Exception e) {
                    if (null == rootCause) {
                        rootCause = e;
                    } else {
                        LOG.debug("Swallowing exception on close of " + inputPath, e);
                    }
                }
            }

            /*
             * Output of the reducer is the source file => crushed file (in the final output dir, not the task attempt work dir).
             */
            collector.collect(srcFile, valueOut);
            reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1);

            recordNumber++;

            if (reportRecordNumber == recordNumber) {
                reportRecordNumber += reportRecordNumber;

                reporter.setStatus(format("Processed %,d files %s : %s", recordNumber, bucket, inputPath));
            }
        }
    } catch (Exception e) {
        rootCause = e;
    } finally {
        if (null != sink) {
            try {
                sink.close(reporter);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }

        /*
         * Let the exception bubble up with a minimum of wrapping.
         */
        if (null != rootCause) {
            if (rootCause instanceof RuntimeException) {
                throw (RuntimeException) rootCause;
            }

            if (rootCause instanceof IOException) {
                throw (IOException) rootCause;
            }

            throw new RuntimeException(rootCause);
        }
    }
}

From source file: com.hotels.corc.mapred.CorcOutputFormatTest.java

License: Apache License

@Test
public void writer() throws IOException {
    File root = temporaryFolder.getRoot();
    conf.set("mapreduce.output.fileoutputformat.outputdir", root.getCanonicalPath());
    conf.set("mapreduce.task.attempt.id", "attempt_x_0001_m_000001_1");

    String name = "name";
    RecordWriter<NullWritable, Corc> writer = outputFormat.getRecordWriter(fileSystem, conf, name, progress);

    StructTypeInfo typeInfo = new StructTypeInfoBuilder().add("a", TypeInfoFactory.stringTypeInfo).build();

    Corc corc = new Corc(typeInfo, new DefaultConverterFactory());
    corc.set("a", "value");

    writer.write(NullWritable.get(), corc);

    writer.close(reporter);

    Path path = new Path(root.getCanonicalPath() + "/_temporary/0/_temporary/attempt_x_0001_m_000001_1/name");
    try (OrcReader reader = new OrcReader(conf, path)) {
        List<Object> next = reader.next();
        assertThat(next.size(), is(1));
        assertThat(next.get(0), is((Object) "value"));
        assertFalse(reader.hasNext());
    }
}