Java tutorial
/* * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cascading.tuple.hadoop.collect; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import cascading.flow.FlowProcess; import cascading.flow.FlowProcessWrapper; import cascading.tuple.TupleException; import cascading.tuple.collect.SpillableTupleList; import cascading.tuple.hadoop.TupleSerialization; import cascading.tuple.hadoop.io.HadoopTupleInputStream; import cascading.tuple.hadoop.io.HadoopTupleOutputStream; import cascading.tuple.io.TupleInputStream; import cascading.tuple.io.TupleOutputStream; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.compress.CodecPool; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.Compressor; import org.apache.hadoop.io.compress.Decompressor; import org.apache.hadoop.util.ReflectionUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * SpillableTupleList is a simple {@link Iterable} object that can store an unlimited number of {@link cascading.tuple.Tuple} instances by spilling * excess to a temporary disk file. * <p/> * Spills will automatically be compressed using the {@link #defaultCodecs} values. To disable compression or * change the codecs, see {@link cascading.tuple.collect.SpillableProps#SPILL_COMPRESS} and {@link cascading.tuple.collect.SpillableProps#SPILL_CODECS}. * <p/> * It is recommended to add Lzo if available. * {@code "org.apache.hadoop.io.compress.LzoCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec" } */ public class HadoopSpillableTupleList extends SpillableTupleList { private static final Logger LOG = LoggerFactory.getLogger(HadoopSpillableTupleList.class); public static final String defaultCodecs = "org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec"; /** Field codec */ private final CompressionCodec codec; /** Field serializationElementWriter */ private final TupleSerialization tupleSerialization; public static synchronized CompressionCodec getCodec(FlowProcess<? extends Configuration> flowProcess, String defaultCodecs) { Class<? extends CompressionCodec> codecClass = getCodecClass(flowProcess, defaultCodecs, CompressionCodec.class); if (codecClass == null) return null; if (flowProcess instanceof FlowProcessWrapper) flowProcess = ((FlowProcessWrapper) flowProcess).getDelegate(); return ReflectionUtils.newInstance(codecClass, flowProcess.getConfig()); } /** * Constructor SpillableTupleList creates a new SpillableTupleList instance using the given threshold value, and * the first available compression codec, if any. * * @param threshold of type long * @param codec of type CompressionCodec */ public HadoopSpillableTupleList(int threshold, CompressionCodec codec, Configuration configuration) { super(threshold); this.codec = codec; if (configuration == null) this.tupleSerialization = new TupleSerialization(); else this.tupleSerialization = new TupleSerialization(configuration); } public HadoopSpillableTupleList(int threshold, TupleSerialization tupleSerialization, CompressionCodec codec) { super(threshold); this.tupleSerialization = tupleSerialization; this.codec = codec; } @Override protected TupleOutputStream createTupleOutputStream(File file) { OutputStream outputStream; try { outputStream = new FileOutputStream(file); Compressor compressor = null; if (codec != null) { compressor = getCompressor(); outputStream = codec.createOutputStream(outputStream, compressor); } final Compressor finalCompressor = compressor; return new HadoopTupleOutputStream(outputStream, tupleSerialization.getElementWriter()) { @Override public void close() throws IOException { try { super.close(); } finally { if (finalCompressor != null) CodecPool.returnCompressor(finalCompressor); } } }; } catch (IOException exception) { throw new TupleException("unable to create temporary file input stream", exception); } } private Compressor getCompressor() { // some codecs are using direct memory, and the gc for direct memory cannot sometimes keep up // so we attempt to force a gc if we see a OOME once. try { return CodecPool.getCompressor(codec); } catch (OutOfMemoryError error) { System.gc(); LOG.info("received OOME when allocating compressor for codec: {}, retrying once", codec.getClass().getCanonicalName(), error); return CodecPool.getCompressor(codec); } } @Override protected TupleInputStream createTupleInputStream(File file) { try { InputStream inputStream; inputStream = new FileInputStream(file); Decompressor decompressor = null; if (codec != null) { decompressor = getDecompressor(); inputStream = codec.createInputStream(inputStream, decompressor); } final Decompressor finalDecompressor = decompressor; return new HadoopTupleInputStream(inputStream, tupleSerialization.getElementReader()) { @Override public void close() throws IOException { try { super.close(); } finally { if (finalDecompressor != null) CodecPool.returnDecompressor(finalDecompressor); } } }; } catch (IOException exception) { throw new TupleException("unable to create temporary file output stream", exception); } } private Decompressor getDecompressor() { // some codecs are using direct memory, and the gc for direct memory cannot sometimes keep up // so we attempt to force a gc if we see a OOME once. try { return CodecPool.getDecompressor(codec); } catch (OutOfMemoryError error) { System.gc(); LOG.info("received OOME when allocating decompressor for codec: {}, retrying once", codec.getClass().getCanonicalName(), error); return CodecPool.getDecompressor(codec); } } }