List of usage examples for org.apache.hadoop.mapreduce.RecordWriter#close
public abstract void close(TaskAttemptContext context) throws IOException, InterruptedException;
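For orientation before the examples: close is called once per writer, with the task's TaskAttemptContext, and the writer must not be used afterwards. A minimal conforming implementation might look like the following sketch (the PrintStreamRecordWriter class is hypothetical, not taken from any of the source files below):

    import java.io.IOException;
    import java.io.PrintStream;

    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;

    // A minimal sketch, assuming output goes to a PrintStream supplied by the caller.
    public class PrintStreamRecordWriter<K, V> extends RecordWriter<K, V> {

        private final PrintStream out;

        public PrintStreamRecordWriter(PrintStream out) {
            this.out = out;
        }

        @Override
        public void write(K key, V value) throws IOException, InterruptedException {
            out.println(key + "\t" + value);
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            // Flush and release the underlying stream; the writer is closed to future operations.
            out.close();
        }
    }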
Closes the RecordWriter to future operations.

From source file: cz.seznam.euphoria.hadoop.output.TestDataSinkOutputFormat.java
License: Apache License
/** Test that {@code ListDataSink} can be used in place of hadoop {@code OutputFormat}. */
@Test
@SuppressWarnings("unchecked")
public void testDataSink() throws Exception {
    DummySink sink = new DummySink();
    Configuration conf = new Configuration();
    DataSinkOutputFormat.configure(conf, sink);

    // mock the instances we will need
    TaskAttemptContext first = mockContext(conf, 0);
    TaskAttemptContext second = mockContext(conf, 1);

    // instantiate the output format
    DataSinkOutputFormat<Long> format = DataSinkOutputFormat.class.newInstance();

    // validate
    format.checkOutputSpecs(first);

    // create record writer for the first partition
    RecordWriter<NullWritable, Long> writer = format.getRecordWriter(first);
    writer.write(NullWritable.get(), 2L);
    writer.close(first);
    format.getOutputCommitter(first).commitTask(first);

    // now the second partition, we need to create new instance of output format
    format = DataSinkOutputFormat.class.newInstance();

    // validate
    format.checkOutputSpecs(second);

    // create record writer for the second partition
    writer = format.getRecordWriter(second);
    writer.write(NullWritable.get(), 4L);
    writer.close(second);
    OutputCommitter committer = format.getOutputCommitter(second);
    committer.commitTask(second);

    // and now validate what was written
    assertFalse(DummySink.isCommitted);
    committer.commitJob(second);
    assertTrue(DummySink.isCommitted);
    assertTrue(DummySink.outputs.isEmpty());
    assertEquals(2, DummySink.committed.size());
    assertEquals(Arrays.asList(2L), DummySink.committed.get(0));
    assertEquals(Arrays.asList(4L), DummySink.committed.get(1));
}
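Note the ordering the test enforces: each RecordWriter is closed before commitTask is called, and the sink reports data as committed only after commitJob. The examples below follow the same close-then-commit sequence.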
From source file: eu.stratosphere.addons.hbase.GenericTableOutputFormat.java
License: Apache License
@Override
public void close() throws IOException {
    final RecordWriter<ImmutableBytesWritable, KeyValue> writer = this.writer;
    this.writer = null;
    if (writer != null) {
        try {
            writer.close(this.context);
        } catch (InterruptedException iex) {
            throw new IOException("Closing was interrupted.", iex);
        }
    }
}
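The field is nulled out before closing so a second call cannot touch a half-closed writer, and the checked InterruptedException is converted into the IOException this close() signature allows. A generic form of that idiom, as a sketch (the RecordWriters helper class and closeWrappingInterrupt method are hypothetical, not part of the source above):

    import java.io.IOException;

    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;

    public final class RecordWriters {

        private RecordWriters() {}

        // Close a writer, converting InterruptedException into IOException so the
        // caller can expose a plain java.io.Closeable-style close().
        public static <K, V> void closeWrappingInterrupt(RecordWriter<K, V> writer,
                TaskAttemptContext context) throws IOException {
            if (writer == null) {
                return;
            }
            try {
                writer.close(context);
            } catch (InterruptedException iex) {
                Thread.currentThread().interrupt(); // preserve the interrupt flag
                throw new IOException("Closing was interrupted.", iex);
            }
        }
    }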
From source file: gr.ntua.h2rdf.inputFormat.MultiHFileOutputFormat.java
License: Open Source License
public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context)
        throws IOException, InterruptedException {
    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            for (RecordWriter<ImmutableBytesWritable, KeyValue> writer : writers.values()) {
                writer.close(context);
            }
        }

        @Override
        public void write(ImmutableBytesWritable key, KeyValue value)
                throws IOException, InterruptedException {
            RecordWriter<ImmutableBytesWritable, KeyValue> writer = writers.get(key);
            if (writer == null) {
                final Path outputPath = new Path(
                        FileOutputFormat.getOutputPath(context).toString() + "/" + Bytes.toString(key.get()));
                writer = new RecordWriter<ImmutableBytesWritable, KeyValue>() {
                    final FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
                    final Path outputdir = committer.getWorkPath();
                    final Configuration conf = context.getConfiguration();
                    final FileSystem fs = outputdir.getFileSystem(conf);
                    final long maxsize = conf.getLong("hbase.hregion.max.filesize",
                            HConstants.DEFAULT_MAX_FILE_SIZE);
                    final int blocksize = conf.getInt("hfile.min.blocksize.size", HFile.DEFAULT_BLOCKSIZE);
                    // Invented config. Add to hbase-*.xml if other than default compression.
                    final String compression = conf.get("hfile.compression",
                            Compression.Algorithm.NONE.getName());
                    // Map of families to writers and how much has been output on the writer.
                    final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(
                            Bytes.BYTES_COMPARATOR);
                    byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
                    final byte[] now = Bytes.toBytes(System.currentTimeMillis());
                    boolean rollRequested = false;

                    public void write(ImmutableBytesWritable row, KeyValue kv) throws IOException {
                        // null input == user explicitly wants to flush
                        if (row == null && kv == null) {
                            rollWriters();
                            return;
                        }
                        byte[] rowKey = kv.getRow();
                        long length = kv.getLength();
                        byte[] family = kv.getFamily();
                        WriterLength wl = this.writers.get(family);
                        // If this is a new column family, verify that the directory exists
                        if (wl == null) {
                            fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
                        }
                        // If any of the HFiles for the column families has reached
                        // maxsize, we need to roll all the writers
                        if (wl != null && wl.written + length >= maxsize) {
                            this.rollRequested = true;
                        }
                        // This can only happen once a row is finished though
                        if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
                            rollWriters();
                        }
                        // create a new HFile writer, if necessary
                        if (wl == null || wl.writer == null) {
                            wl = getNewWriter(family);
                        }
                        // we now have the proper HFile writer. full steam ahead
                        kv.updateLatestStamp(this.now);
                        wl.writer.append(kv);
                        wl.written += length;
                        // Copy the row so we know when a row transition happens.
                        this.previousRow = rowKey;
                    }

                    private void rollWriters() throws IOException {
                        for (WriterLength wl : this.writers.values()) {
                            if (wl.writer != null) {
                                close(wl.writer);
                            }
                            wl.writer = null;
                            wl.written = 0;
                        }
                        this.rollRequested = false;
                    }

                    private HFile.Writer getNewWriter(final HFile.Writer writer, final Path familydir,
                            Configuration conf) throws IOException {
                        if (writer != null) {
                            close(writer);
                        }
                        return HFile.getWriterFactoryNoCache(conf).create();
                        // return HFile.getWriterFactory(conf).createWriter(fs, StoreFile.getUniqueFile(fs, familydir),
                        //         blocksize, compression, KeyValue.KEY_COMPARATOR);
                        // return new HFile.Writer(fs, StoreFile.getUniqueFile(fs, familydir),
                        //         blocksize, compression, KeyValue.KEY_COMPARATOR);
                    }

                    private WriterLength getNewWriter(byte[] family) throws IOException {
                        WriterLength wl = new WriterLength();
                        Path familydir = new Path(outputdir, Bytes.toString(family));
                        wl.writer = getNewWriter(wl.writer, familydir, conf);
                        this.writers.put(family, wl);
                        return wl;
                    }

                    private void close(final HFile.Writer w) throws IOException {
                        if (w != null) {
                            w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
                                    Bytes.toBytes(System.currentTimeMillis()));
                            w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
                                    Bytes.toBytes(context.getTaskAttemptID().toString()));
                            w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes.toBytes(true));
                            w.close();
                        }
                    }

                    public void close(TaskAttemptContext c) throws IOException, InterruptedException {
                        for (WriterLength wl : this.writers.values()) {
                            close(wl.writer);
                        }
                        committer.commitTask(c);
                    }
                };
                writers.put(key, writer);
            }
            writer.write(new ImmutableBytesWritable(value.getRow()), value);
        }
    };
}
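Design note: each lazily created per-key writer owns its own FileOutputCommitter and commits its task inside close(TaskAttemptContext), so the outer close only has to fan out over writers.values().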
From source file: input_format.MultiHFileOutputFormat.java
License: Open Source License
public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context)
        throws IOException, InterruptedException {
    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            for (RecordWriter<ImmutableBytesWritable, KeyValue> writer : writers.values()) {
                writer.close(context);
            }
        }

        @Override
        public void write(ImmutableBytesWritable key, KeyValue value)
                throws IOException, InterruptedException {
            RecordWriter<ImmutableBytesWritable, KeyValue> writer = writers.get(key);
            if (writer == null) {
                final Path outputPath = new Path(
                        FileOutputFormat.getOutputPath(context).toString() + "/" + Bytes.toString(key.get()));
                writer = new RecordWriter<ImmutableBytesWritable, KeyValue>() {
                    final FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
                    final Path outputdir = committer.getWorkPath();
                    final Configuration conf = context.getConfiguration();
                    final FileSystem fs = outputdir.getFileSystem(conf);
                    final long maxsize = conf.getLong("hbase.hregion.max.filesize",
                            HConstants.DEFAULT_MAX_FILE_SIZE);
                    final int blocksize = conf.getInt("hfile.min.blocksize.size", HFile.DEFAULT_BLOCKSIZE);
                    // Invented config. Add to hbase-*.xml if other than default compression.
                    final String compression = conf.get("hfile.compression",
                            Compression.Algorithm.NONE.getName());
                    // Map of families to writers and how much has been output on the writer.
                    final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(
                            Bytes.BYTES_COMPARATOR);
                    byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
                    final byte[] now = Bytes.toBytes(System.currentTimeMillis());
                    boolean rollRequested = false;

                    public void write(ImmutableBytesWritable row, KeyValue kv) throws IOException {
                        // null input == user explicitly wants to flush
                        if (row == null && kv == null) {
                            rollWriters();
                            return;
                        }
                        byte[] rowKey = kv.getRow();
                        long length = kv.getLength();
                        byte[] family = kv.getFamily();
                        WriterLength wl = this.writers.get(family);
                        // If this is a new column family, verify that the directory exists
                        if (wl == null) {
                            fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
                        }
                        // If any of the HFiles for the column families has reached
                        // maxsize, we need to roll all the writers
                        if (wl != null && wl.written + length >= maxsize) {
                            this.rollRequested = true;
                        }
                        // This can only happen once a row is finished though
                        if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
                            rollWriters();
                        }
                        // create a new HFile writer, if necessary
                        if (wl == null || wl.writer == null) {
                            wl = getNewWriter(family);
                        }
                        // we now have the proper HFile writer. full steam ahead
                        kv.updateLatestStamp(this.now);
                        wl.writer.append(kv);
                        wl.written += length;
                        // Copy the row so we know when a row transition happens.
                        this.previousRow = rowKey;
                    }

                    private void rollWriters() throws IOException {
                        for (WriterLength wl : this.writers.values()) {
                            if (wl.writer != null) {
                                close(wl.writer);
                            }
                            wl.writer = null;
                            wl.written = 0;
                        }
                        this.rollRequested = false;
                    }

                    private HFile.Writer getNewWriter(final HFile.Writer writer, final Path familydir,
                            Configuration conf) throws IOException {
                        if (writer != null) {
                            close(writer);
                        }
                        return HFile.getWriterFactory(conf).createWriter(fs,
                                StoreFile.getUniqueFile(fs, familydir), blocksize, compression,
                                KeyValue.KEY_COMPARATOR);
                        // return new HFile.Writer(fs, StoreFile.getUniqueFile(fs, familydir),
                        //         blocksize, compression, KeyValue.KEY_COMPARATOR);
                    }

                    private WriterLength getNewWriter(byte[] family) throws IOException {
                        WriterLength wl = new WriterLength();
                        Path familydir = new Path(outputdir, Bytes.toString(family));
                        wl.writer = getNewWriter(wl.writer, familydir, conf);
                        this.writers.put(family, wl);
                        return wl;
                    }

                    private void close(final HFile.Writer w) throws IOException {
                        if (w != null) {
                            w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
                                    Bytes.toBytes(System.currentTimeMillis()));
                            w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
                                    Bytes.toBytes(context.getTaskAttemptID().toString()));
                            w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes.toBytes(true));
                            w.close();
                        }
                    }

                    public void close(TaskAttemptContext c) throws IOException, InterruptedException {
                        for (WriterLength wl : this.writers.values()) {
                            close(wl.writer);
                        }
                        committer.commitTask(c);
                    }
                };
                writers.put(key, writer);
            }
            writer.write(new ImmutableBytesWritable(value.getRow()), value);
        }
    };
}
From source file: io.amient.kafka.hadoop.io.MultiOutputFormat.java
License: Apache License
public RecordWriter<MsgMetadataWritable, BytesWritable> getRecordWriter(TaskAttemptContext context)
        throws IOException {
    final TaskAttemptContext taskContext = context;
    final Configuration conf = context.getConfiguration();
    final boolean isCompressed = getCompressOutput(context);
    String ext = "";
    CompressionCodec gzipCodec = null;
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(context, GzipCodec.class);
        gzipCodec = ReflectionUtils.newInstance(codecClass, conf);
        ext = ".gz";
    }
    final CompressionCodec codec = gzipCodec;
    final String extension = ext;
    final String pathFormat = conf.get(CONFIG_PATH_FORMAT, "'{T}/{P}'");
    log.info("Using path format: " + pathFormat);
    final SimpleDateFormat timeFormat = new SimpleDateFormat(pathFormat);
    timeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
    final DecimalFormat offsetFormat = new DecimalFormat("0000000000000000000");
    final boolean hasTS = HadoopJobMapper.isTimestampExtractorConfigured(conf);

    return new RecordWriter<MsgMetadataWritable, BytesWritable>() {
        TreeMap<String, RecordWriter<Void, BytesWritable>> recordWriters = new TreeMap<>();
        Path prefixPath = ((FileOutputCommitter) getOutputCommitter(taskContext)).getWorkPath();

        public void write(MsgMetadataWritable key, BytesWritable value) throws IOException {
            if (hasTS && key.getTimestamp() == null) {
                // extractor didn't wish to throw exception so skipping this record
                return;
            }
            String P = String.valueOf(key.getSplit().getPartition());
            String T = key.getSplit().getTopic();
            String suffixPath = hasTS ? timeFormat.format(key.getTimestamp())
                    : pathFormat.replaceAll("'", "");
            suffixPath = suffixPath.replace("{T}", T);
            suffixPath = suffixPath.replace("{P}", P);
            suffixPath += "/" + T + "-" + P + "-" + offsetFormat.format(key.getSplit().getStartOffset());
            suffixPath += extension;
            RecordWriter<Void, BytesWritable> rw = this.recordWriters.get(suffixPath);
            try {
                if (rw == null) {
                    Path file = new Path(prefixPath, suffixPath);
                    FileSystem fs = file.getFileSystem(conf);
                    FSDataOutputStream fileOut = fs.create(file, false);
                    if (isCompressed) {
                        rw = new LineRecordWriter(new DataOutputStream(codec.createOutputStream(fileOut)));
                    } else {
                        rw = new LineRecordWriter(fileOut);
                    }
                    this.recordWriters.put(suffixPath, rw);
                }
                rw.write(null, value);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            Iterator<String> keys = this.recordWriters.keySet().iterator();
            while (keys.hasNext()) {
                RecordWriter<Void, BytesWritable> rw = this.recordWriters.get(keys.next());
                rw.close(context);
            }
            this.recordWriters.clear();
        }
    };
}
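One caveat with this close loop: if one delegate's close throws, the remaining writers are never closed. A fail-safe variant could attempt every close and rethrow the first failure afterwards; a sketch under the same types as the example (the CloseAll class and closeAll method are hypothetical):

    import java.io.IOException;
    import java.util.Map;

    import org.apache.hadoop.io.BytesWritable;
    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;

    final class CloseAll {

        // Close every delegate even if an earlier one fails, then rethrow the
        // first IOException so the task attempt is still marked as failed.
        static void closeAll(Map<String, RecordWriter<Void, BytesWritable>> writers,
                TaskAttemptContext context) throws IOException, InterruptedException {
            IOException first = null;
            for (RecordWriter<Void, BytesWritable> rw : writers.values()) {
                try {
                    rw.close(context);
                } catch (IOException e) {
                    if (first == null) {
                        first = e;
                    }
                }
            }
            writers.clear();
            if (first != null) {
                throw first;
            }
        }
    }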
From source file: io.fluo.mapreduce.FluoFileOutputFormat.java
License: Apache License
@Override
public RecordWriter<RowColumn, Bytes> getRecordWriter(TaskAttemptContext job)
        throws IOException, InterruptedException {
    final RecordWriter<Key, Value> accumuloRecordWriter =
            new AccumuloFileOutputFormat().getRecordWriter(job);

    return new RecordWriter<RowColumn, Bytes>() {

        @Override
        public void write(RowColumn key, Bytes value) throws IOException, InterruptedException {
            Text row = ByteUtil.toText(key.getRow());
            Text fam = ByteUtil.toText(key.getColumn().getFamily());
            Text qual = ByteUtil.toText(key.getColumn().getQualifier());
            Text vis = ByteUtil.toText(key.getColumn().getVisibility());

            Key dataKey = new Key(row, fam, qual, vis, ColumnConstants.DATA_PREFIX | 0);
            Key writeKey = new Key(row, fam, qual, vis, ColumnConstants.WRITE_PREFIX | 1);

            accumuloRecordWriter.write(writeKey, new Value(WriteValue.encode(0, false, false)));
            accumuloRecordWriter.write(dataKey, new Value(value.toArray()));
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            accumuloRecordWriter.close(context);
        }
    };
}
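This is the plain delegation pattern: write translates Fluo's RowColumn/Bytes pairs into Accumulo Key/Value entries, while close simply forwards the context to the wrapped AccumuloFileOutputFormat writer, which owns the actual file handles.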
From source file: io.ssc.trackthetrackers.extraction.hadoop.util.Compaction.java
License: Open Source License
public static void main(String[] args) throws IOException, InterruptedException {
    if (args.length != 2) {
        System.out.println("Usage: <input folder> <output file>");
        System.exit(-1);
    }

    String inputPath = args[0];
    String outputFile = args[1];

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] input = fs.listStatus(new Path(inputPath), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.toString().endsWith(".parquet");
        }
    });

    Path output = new Path(outputFile);
    fs.delete(output, true);

    ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder> inputFormat =
            new ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder>();
    inputFormat.setReadSupportClass(new JobConf(conf), ProtoReadSupport.class);

    Job job = new Job(conf);
    ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage> outputFormat =
            new ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage>(ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setProtobufClass(job, ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ProtoParquetOutputFormat.setEnableDictionary(job, true);

    RecordWriter<Void, ParsedPageProtos.ParsedPage> recordWriter =
            outputFormat.getRecordWriter(conf, output, CompressionCodecName.SNAPPY);

    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
    for (FileStatus fileStatus : input) {
        System.out.println(fileStatus.getPath().toString());
        splits.addAll(inputFormat.getSplits(conf, ParquetFileReader.readFooters(conf, fileStatus)));
    }

    int splitIndex = 0;
    for (ParquetInputSplit split : splits) {
        System.out.println("Processing split: " + split.getPath().toString()
                + " (" + splitIndex + " of " + splits.size() + ")");
        TaskAttemptID taskAttemptID = new TaskAttemptID(
                new TaskID("identifier", splitIndex, true, splitIndex), splitIndex);
        TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

        RecordReader<Void, ParsedPageProtos.ParsedPageOrBuilder> reader =
                inputFormat.createRecordReader(split, ctx);
        reader.initialize(split, ctx);

        while (reader.nextKeyValue()) {
            ParsedPageProtos.ParsedPageOrBuilder record = reader.getCurrentValue();
            ParsedPageProtos.ParsedPage.Builder builder = ParsedPageProtos.ParsedPage.newBuilder();
            builder.setUrl(record.getUrl());
            builder.setArchiveTime(record.getArchiveTime());
            builder.addAllScripts(record.getScriptsList());
            builder.addAllIframes(record.getIframesList());
            builder.addAllLinks(record.getLinksList());
            builder.addAllImages(record.getImagesList());
            recordWriter.write(null, builder.build());
        }

        if (reader != null) {
            reader.close();
        }
        splitIndex++;
    }

    TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", 1, true, 1), 1);
    TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);
    if (recordWriter != null) {
        recordWriter.close(ctx);
    }
}
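Because this runs as a standalone main rather than inside a MapReduce task, the code fabricates a TaskAttemptID and TaskAttemptContext purely so recordWriter.close(ctx) has a context to hand to the output format; any stable identifier serves.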
From source file: org.apache.accumulo.examples.simple.helloworld.InsertWithOutputFormat.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        System.out.println("Usage: bin/tool.sh " + this.getClass().getName()
                + " <instance name> <zoo keepers> <username> <password> <tablename>");
        return 1;
    }

    Text tableName = new Text(args[4]);
    Job job = new Job(getConf());
    Configuration conf = job.getConfiguration();
    AccumuloOutputFormat.setZooKeeperInstance(conf, args[0], args[1]);
    AccumuloOutputFormat.setOutputInfo(conf, args[2], args[3].getBytes(), true, null);
    job.setOutputFormatClass(AccumuloOutputFormat.class);

    // when running a mapreduce, you won't need to instantiate the output
    // format and record writer; mapreduce will do that for you, and you will
    // just use output.collect(tableName, mutation)
    TaskAttemptContext context = new TaskAttemptContext(conf, new TaskAttemptID());
    RecordWriter<Text, Mutation> rw = new AccumuloOutputFormat().getRecordWriter(context);

    Text colf = new Text("colfam");
    System.out.println("writing ...");
    for (int i = 0; i < 10000; i++) {
        Mutation m = new Mutation(new Text(String.format("row_%d", i)));
        for (int j = 0; j < 5; j++) {
            m.put(colf, new Text(String.format("colqual_%d", j)),
                    new Value((String.format("value_%d_%d", i, j)).getBytes()));
        }
        rw.write(tableName, m); // repeat until done
        if (i % 100 == 0) {
            System.out.println(i);
        }
    }

    rw.close(context); // close when done
    return 0;
}
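One thing the loop above does not guard against is a failed write leaving the writer open. When driving a RecordWriter by hand like this, a try/finally wrapper ensures close always runs; a sketch (the GuardedWrite class and writeAll method are hypothetical):

    import java.io.IOException;

    import org.apache.accumulo.core.data.Mutation;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;

    final class GuardedWrite {

        // Write a batch of mutations and guarantee close() runs even if a write fails.
        static void writeAll(RecordWriter<Text, Mutation> rw, TaskAttemptContext context,
                Text table, Iterable<Mutation> mutations) throws IOException, InterruptedException {
            try {
                for (Mutation m : mutations) {
                    rw.write(table, m);
                }
            } finally {
                rw.close(context); // always release server-side resources
            }
        }
    }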
From source file: org.apache.crunch.io.CrunchOutputs.java
License: Apache License
public void close() throws IOException, InterruptedException {
    for (RecordWriter<?, ?> writer : recordWriters.values()) {
        writer.close(baseContext);
    }
}
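Same fan-out idiom as MultiOutputFormat above: every registered writer is closed against a single base task context.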
From source file: org.apache.hcatalog.data.transfer.impl.HCatOutputFormatWriter.java
License: Apache License
@Override
public void write(Iterator<HCatRecord> recordItr) throws HCatException {
    int id = sp.getId();
    setVarsInConf(id);
    HCatOutputFormat outFormat = new HCatOutputFormat();
    TaskAttemptContext cntxt = HCatHadoopShims.Instance.get().createTaskAttemptContext(conf,
            new TaskAttemptID(HCatHadoopShims.Instance.get().createTaskID(), id));
    OutputCommitter committer = null;
    RecordWriter<WritableComparable<?>, HCatRecord> writer;
    try {
        committer = outFormat.getOutputCommitter(cntxt);
        committer.setupTask(cntxt);
        writer = outFormat.getRecordWriter(cntxt);
        while (recordItr.hasNext()) {
            HCatRecord rec = recordItr.next();
            writer.write(null, rec);
        }
        writer.close(cntxt);
        if (committer.needsTaskCommit(cntxt)) {
            committer.commitTask(cntxt);
        }
    } catch (IOException e) {
        if (null != committer) {
            try {
                committer.abortTask(cntxt);
            } catch (IOException e1) {
                throw new HCatException(ErrorType.ERROR_INTERNAL_EXCEPTION, e1);
            }
        }
        throw new HCatException("Failed while writing", e);
    } catch (InterruptedException e) {
        if (null != committer) {
            try {
                committer.abortTask(cntxt);
            } catch (IOException e1) {
                throw new HCatException(ErrorType.ERROR_INTERNAL_EXCEPTION, e1);
            }
        }
        throw new HCatException("Failed while writing", e);
    }
}
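The two abortTask branches are the important part of this example: if writer.close (or any write) throws, the task's partial output is aborted instead of committed, mirroring what the MapReduce framework itself does for a failed task attempt.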