// SequenceFileSource: an Apache Beam FileBasedSource that reads Hadoop SequenceFiles as KV<K, V> records.
/*
 * Copyright 2017 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.cloud.bigtable.beam.sequencefiles;

import static com.google.common.base.Preconditions.checkState;

import com.google.common.base.Preconditions;
// NOTE: use Guava's Sets directly — never the copy shaded inside Avro
// (avro.shaded.com.google.common.collect.Sets), which is a private
// repackaged artifact and not a stable dependency.
import com.google.common.collect.Sets;
import com.google.common.primitives.UnsignedBytes;
import java.io.EOFException;
import java.io.IOException;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.SeekableByteChannel;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.io.FileBasedSource;
import org.apache.beam.sdk.io.FileBasedSource.FileBasedReader;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.values.KV;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * A {@link FileBasedSource} that can read hadoop's {@link SequenceFile}s.
 *
 * @param <K> The type of the {@link SequenceFile} key.
 * @param <V> The type of the {@link SequenceFile} value.
 */
class SequenceFileSource<K, V> extends FileBasedSource<KV<K, V>> {
  private static final Log LOG = LogFactory.getLog(SequenceFileSource.class);

  private final Class<K> keyClass;
  private final Class<V> valueClass;
  private final Class<? extends Serialization<? super K>> keySerializationClass;
  private final Class<? extends Serialization<? super V>> valueSerializationClass;
  private final KvCoder<K, V> coder;

  /**
   * Constructs a new top level source.
   *
   * @param fileOrPatternSpec The path or pattern of the file(s) to read.
   * @param keyClass The {@link Class} of the key.
   * @param keySerialization The {@link Class} of the hadoop
   *     {@link org.apache.hadoop.io.serializer.Serialization} to use for the key.
   * @param valueClass The {@link Class} of the value.
   * @param valueSerialization The {@link Class} of the hadoop
   *     {@link org.apache.hadoop.io.serializer.Serialization} to use for the value.
   * @param minBundleSize Minimum split size in bytes; must be at least
   *     {@link SequenceFile#SYNC_INTERVAL} so that every bundle can contain a sync marker.
   */
  SequenceFileSource(
      ValueProvider<String> fileOrPatternSpec,
      Class<K> keyClass,
      Class<? extends Serialization<? super K>> keySerialization,
      Class<V> valueClass,
      Class<? extends Serialization<? super V>> valueSerialization,
      long minBundleSize) {
    super(fileOrPatternSpec, minBundleSize);
    Preconditions.checkArgument(
        minBundleSize >= SequenceFile.SYNC_INTERVAL,
        "minBundleSize must be at least " + SequenceFile.SYNC_INTERVAL);
    this.keyClass = keyClass;
    this.valueClass = valueClass;
    this.keySerializationClass = keySerialization;
    this.valueSerializationClass = valueSerialization;
    this.coder =
        KvCoder.of(
            new HadoopSerializationCoder<>(keyClass, keySerialization),
            new HadoopSerializationCoder<>(valueClass, valueSerialization));
  }

  /**
   * Constructs a subsource for a given range.
   *
   * @param fileMetadata specification of the file represented by the {@link SequenceFileSource},
   *     in suitable form for use with {@link FileSystems#match(List)}.
   * @param startOffset starting byte offset.
   * @param endOffset ending byte offset. If the specified value {@code >= #getMaxEndOffset()} it
   *     implies {@code #getMaxEndOffSet()}.
   * @param keyClass The {@link Class} of the key.
   * @param keySerialization The {@link Class} of the hadoop
   *     {@link org.apache.hadoop.io.serializer.Serialization} to use for the key.
   * @param valueClass The {@link Class} of the value.
   * @param valueSerialization The {@link Class} of the hadoop
   *     {@link org.apache.hadoop.io.serializer.Serialization} to use for the value.
   * @param coder the already-constructed coder from the parent source, reused to avoid rebuilding
   *     it for every subrange.
   */
  private SequenceFileSource(
      Metadata fileMetadata,
      long startOffset,
      long endOffset,
      Class<K> keyClass,
      Class<? extends Serialization<? super K>> keySerialization,
      Class<V> valueClass,
      Class<? extends Serialization<? super V>> valueSerialization,
      long minBundleSize,
      KvCoder<K, V> coder) {
    super(fileMetadata, minBundleSize, startOffset, endOffset);
    this.keyClass = keyClass;
    this.valueClass = valueClass;
    this.keySerializationClass = keySerialization;
    this.valueSerializationClass = valueSerialization;
    this.coder = coder;
  }

  /** {@inheritDoc} */
  @Override
  protected FileBasedSource<KV<K, V>> createForSubrangeOfFile(
      Metadata fileMetadata, long start, long end) {
    LOG.debug("Creating source for subrange: " + start + "-" + end);
    return new SequenceFileSource<>(
        fileMetadata,
        start,
        end,
        keyClass,
        keySerializationClass,
        valueClass,
        valueSerializationClass,
        getMinBundleSize(),
        coder);
  }

  /** {@inheritDoc} */
  @Override
  protected FileBasedReader<KV<K, V>> createSingleFileReader(PipelineOptions options) {
    // A Set deduplicates the class names when the key and value share a Serialization.
    Set<String> serializationNames =
        Sets.newHashSet(keySerializationClass.getName(), valueSerializationClass.getName());
    return new SeqFileReader<>(
        this, keyClass, valueClass, serializationNames.toArray(new String[0]));
  }

  /** {@inheritDoc} */
  @Override
  public Coder<KV<K, V>> getDefaultOutputCoder() {
    return coder;
  }

  /**
   * A {@link FileBasedReader} for reading records from a {@link SequenceFile}.
   *
   * @param <K> The type of the record keys.
   * @param <V> The type of the record values.
   */
  static class SeqFileReader<K, V> extends FileBasedReader<KV<K, V>> {
    private final Class<K> keyClass;
    private final Class<V> valueClass;
    private final String[] serializationNames;
    private SequenceFile.Reader reader;
    // Sync is consumed during startReading(), so we need to track that for the first call of
    // readNextRecord
    private boolean isFirstRecord;
    private boolean isAtSplitPoint;
    private boolean eof;
    private long startOfNextRecord;
    private long startOfRecord;
    private KV<K, V> record;

    SeqFileReader(
        FileBasedSource<KV<K, V>> source,
        Class<K> keyClass,
        Class<V> valueClass,
        String[] serializationNames) {
      super(source);
      this.keyClass = keyClass;
      this.valueClass = valueClass;
      this.serializationNames = serializationNames;
    }

    /** {@inheritDoc} */
    @Override
    protected void startReading(ReadableByteChannel channel) throws IOException {
      checkState(
          channel instanceof SeekableByteChannel,
          "%s only supports reading from a SeekableByteChannel",
          SequenceFileSource.class.getSimpleName());
      SeekableByteChannel seekableByteChannel = (SeekableByteChannel) channel;

      // Adapt the Beam channel into the hadoop stream type SequenceFile.Reader expects.
      FileStream fileStream = new FileStream(seekableByteChannel);
      FSDataInputStream fsDataInputStream = new FSDataInputStream(fileStream);

      // Construct the underlying SequenceFile.Reader with a minimal Configuration
      // (no default resources loaded), registering only the requested serializations.
      Configuration configuration = new Configuration(false);
      if (serializationNames.length > 0) {
        configuration.setStrings("io.serializations", serializationNames);
      }
      reader =
          new SequenceFile.Reader(configuration, SequenceFile.Reader.stream(fsDataInputStream));

      // Seek to the start of the next closest sync point
      try {
        reader.sync(getCurrentSource().getStartOffset());
      } catch (EOFException e) {
        // The range starts past the last sync marker; there is nothing for this reader to emit.
        LOG.debug("Found EOF when starting to read: " + getCurrentSource().getStartOffset());
        eof = true;
      }

      // Prep for the next readNextRecord() call
      startOfNextRecord = reader.getPosition();
      isFirstRecord = true;
      LOG.debug(
          "startReading, offset: "
              + getCurrentSource().getStartOffset()
              + ", position: "
              + startOfNextRecord);
    }

    /** {@inheritDoc} */
    @Override
    public void close() throws IOException {
      if (reader != null) {
        reader.close();
      }
      super.close();
    }

    /** {@inheritDoc} */
    @Override
    protected boolean readNextRecord() throws IOException {
      if (eof) {
        return false;
      }
      K key = ReflectionUtils.newInstance(keyClass, null);
      V value = ReflectionUtils.newInstance(valueClass, null);

      startOfRecord = startOfNextRecord;

      // reader.next(key) returns null when the end of the file (or range) is reached; some
      // streams signal the end with an EOFException instead, so treat both as end-of-input.
      try {
        eof = reader.next(key) == null;
      } catch (EOFException e) {
        eof = true;
      }
      if (eof) {
        record = null;
      } else {
        value = readCurrentValueUnchecked(value);
        record = KV.of(key, value);
      }

      // The first record after startReading() sits directly after the sync consumed there, so it
      // is always a split point; afterwards, only records preceded by a sync marker are.
      isAtSplitPoint = isFirstRecord || reader.syncSeen();
      isFirstRecord = false;
      startOfNextRecord = reader.getPosition();

      return record != null;
    }

    // getCurrentValue's raw return type forces an unchecked cast; the Serialization registered
    // for valueClass guarantees the runtime type.
    @SuppressWarnings("unchecked")
    private V readCurrentValueUnchecked(V value) throws IOException {
      return (V) reader.getCurrentValue(value);
    }

    /** {@inheritDoc} */
    @Override
    protected boolean isAtSplitPoint() throws NoSuchElementException {
      return isAtSplitPoint;
    }

    /** {@inheritDoc} */
    @Override
    protected long getCurrentOffset() throws NoSuchElementException {
      if (record == null) {
        throw new NoSuchElementException();
      }
      return startOfRecord;
    }

    /** {@inheritDoc} */
    @Override
    public KV<K, V> getCurrent() throws NoSuchElementException {
      if (record == null) {
        throw new NoSuchElementException();
      }
      return record;
    }
  }

  /**
   * Adapter to convert a Beam {@link SeekableByteChannel} to hadoop's {@link FSDataInputStream}.
   */
  static class FileStream extends FSInputStream {
    private final SeekableByteChannel inner;
    // Scratch buffer reused by the single-byte read() to avoid a per-call allocation.
    private final ByteBuffer singleByteBuffer = ByteBuffer.allocate(1);

    FileStream(SeekableByteChannel inner) {
      this.inner = inner;
    }

    /** {@inheritDoc} */
    @Override
    public void seek(long l) throws IOException {
      inner.position(l);
    }

    /** {@inheritDoc} */
    @Override
    public long getPos() throws IOException {
      return inner.position();
    }

    /** {@inheritDoc} */
    @Override
    public boolean seekToNewSource(long l) throws IOException {
      // There are no alternate replicas of a Beam channel to fail over to.
      return false;
    }

    /** {@inheritDoc} */
    @Override
    public int read(byte[] buffer, int offset, int length) throws IOException {
      ByteBuffer byteBuffer = ByteBuffer.wrap(buffer, offset, length);
      return inner.read(byteBuffer);
    }

    /** {@inheritDoc} */
    @Override
    public int read() throws IOException {
      int numRead = 0;
      // Workaround Java 9 overridden methods with covariant return types
      ((Buffer) singleByteBuffer).clear();
      // A SeekableByteChannel may legally return 0 bytes; keep reading until we get a byte or EOF.
      while (numRead == 0) {
        numRead = inner.read(singleByteBuffer);
      }
      if (numRead == -1) {
        return -1;
      }
      // Widen without sign extension: InputStream.read() must return 0-255.
      return UnsignedBytes.toInt(singleByteBuffer.get(0));
    }
  }
}