Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.common.table.log;

import com.google.common.base.Preconditions;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
import com.uber.hoodie.common.table.log.block.HoodieCommandBlock;
import com.uber.hoodie.common.table.log.block.HoodieCorruptBlock;
import com.uber.hoodie.common.table.log.block.HoodieDeleteBlock;
import com.uber.hoodie.common.table.log.block.HoodieLogBlock;
import com.uber.hoodie.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
import com.uber.hoodie.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;
import com.uber.hoodie.exception.CorruptedLogFileException;
import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.exception.HoodieNotSupportedException;
import java.io.EOFException;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.BufferedFSInputStream;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

/**
 * Scans a log file and provides a block level iterator over it. Unless lazy reading is requested,
 * the entire block content is loaded in memory. Can emit either a DataBlock, CommandBlock,
 * DeleteBlock or CorruptBlock (if one is found).
 *
 * <p>An instance iterates either forward ({@link #hasNext()} / {@link #next()}) or in reverse
 * ({@link #hasPrev()} / {@link #prev()}); mixing both on the same instance is not supported.
 */
class HoodieLogFileReader implements HoodieLogFormat.Reader {

  public static final int DEFAULT_BUFFER_SIZE = 16 * 1024 * 1024; // 16 MB
  private static final Logger log = LogManager.getLogger(HoodieLogFileReader.class);

  private final FSDataInputStream inputStream;
  private final HoodieLogFile logFile;
  // Scratch buffer for the magic marker. Kept per instance: a static buffer would be written to by
  // every reader in the JVM, letting concurrent readers clobber each other's magic bytes.
  private final byte[] magicBuffer = new byte[6];
  private final Schema readerSchema;
  private HoodieLogFormat.LogFormatVersion nextBlockVersion;
  private boolean readBlockLazily;
  private long reverseLogFilePosition;
  private long lastReverseLogFilePosition;
  private boolean reverseReader;
  private boolean closed = false;
  // Shutdown hook registered for this reader; kept so close() can deregister it again.
  private Thread shutdownThread;

  /**
   * Opens the log file and prepares it for forward or reverse iteration.
   *
   * @param fs file system used to open the log file
   * @param logFile log file to read
   * @param readerSchema schema handed to the avro data blocks that are read
   * @param bufferSize buffer size used when opening / wrapping the input stream
   * @param readBlockLazily passed through to the block readers to either read or skip the content
   * @param reverseReader when true the reader is positioned at the end of the file for reverse
   *     iteration
   * @throws IOException if the file cannot be opened or its status cannot be fetched
   */
  HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize,
      boolean readBlockLazily, boolean reverseReader) throws IOException {
    FSDataInputStream fsDataInputStream = fs.open(logFile.getPath(), bufferSize);
    if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) {
      this.inputStream = new FSDataInputStream(
          new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(),
              bufferSize));
    } else {
      // fsDataInputStream.getWrappedStream() may be a BufferedFSInputStream
      // need to wrap in another BufferedFSInputStream to make bufferSize work?
      this.inputStream = fsDataInputStream;
    }

    this.logFile = logFile;
    this.readerSchema = readerSchema;
    this.readBlockLazily = readBlockLazily;
    this.reverseReader = reverseReader;
    if (this.reverseReader) {
      try {
        this.reverseLogFilePosition = this.lastReverseLogFilePosition =
            fs.getFileStatus(logFile.getPath()).getLen();
      } catch (IOException | RuntimeException e) {
        // The stream is already open; do not leak it when construction fails.
        try {
          this.inputStream.close();
        } catch (IOException closeFailure) {
          e.addSuppressed(closeFailure);
        }
        throw e;
      }
    }
    addShutDownHook();
  }

  HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema,
      boolean readBlockLazily, boolean reverseReader) throws IOException {
    this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, readBlockLazily, reverseReader);
  }

  HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema)
      throws IOException {
    this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, false, false);
  }

  @Override
  public HoodieLogFile getLogFile() {
    return logFile;
  }

  /**
   * Close the inputstream if not closed when the JVM exits. The hook is remembered so that
   * {@link #close()} can deregister it; otherwise every reader ever created would stay reachable
   * from the runtime until the JVM exits.
   */
  private void addShutDownHook() {
    shutdownThread = new Thread(() -> {
      try {
        // Only close the stream here: hooks cannot be deregistered while shutdown is in progress.
        closeInputStream();
      } catch (Exception e) {
        log.warn("unable to close input stream for log file " + logFile, e);
        // fail silently for any sort of exception
      }
    });
    Runtime.getRuntime().addShutdownHook(shutdownThread);
  }

  /**
   * Deregisters the shutdown hook of this reader, if one is still registered.
   */
  private void removeShutDownHook() {
    if (shutdownThread == null) {
      return;
    }
    try {
      Runtime.getRuntime().removeShutdownHook(shutdownThread);
    } catch (IllegalStateException e) {
      // JVM shutdown already in progress, hooks can no longer be removed - nothing to do
    }
    shutdownThread = null;
  }

  /**
   * Closes the underlying stream once; subsequent calls are no-ops.
   */
  private void closeInputStream() throws IOException {
    if (!closed) {
      this.inputStream.close();
      closed = true;
    }
  }

  /**
   * Reads the block starting at the current stream position (the magic marker must already have
   * been consumed by {@link #hasNext()}).
   *
   * @return the block that was read, or a corrupt block if the block could not be validated
   * @throws IOException on read failures other than the ones mapped to a corrupt block
   */
  // TODO : convert content and block length to long by using ByteBuffer, raw byte [] allows
  // for max of Integer size
  private HoodieLogBlock readBlock() throws IOException {

    int blocksize = -1;
    int type = -1;
    HoodieLogBlockType blockType = null;
    Map<HeaderMetadataType, String> header = null;

    try {
      // 1 Read the total size of the block
      blocksize = (int) inputStream.readLong();
    } catch (EOFException | CorruptedLogFileException e) {
      // An exception reading the block size indicates a corrupt block
      // Create a corrupt block by finding the next MAGIC marker or EOF
      return createCorruptBlock();
    }

    // We may have had a crash which could have written this block partially
    // Skip blocksize in the stream and we should either find a sync marker (start of the next
    // block) or EOF. If we did not find either of it, then this block is a corrupted block.
    boolean isCorrupted = isBlockCorrupt(blocksize);
    if (isCorrupted) {
      return createCorruptBlock();
    }

    // 2. Read the version for this log format
    this.nextBlockVersion = readVersion();
    boolean isDefaultVersion =
        nextBlockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION;

    // 3. Read the block type for a log block
    if (!isDefaultVersion) {
      type = inputStream.readInt();
      // Guard both bounds: a negative ordinal would otherwise surface as an
      // ArrayIndexOutOfBoundsException instead of the intended argument failure.
      Preconditions.checkArgument(type >= 0 && type < HoodieLogBlockType.values().length,
          "Invalid block byte type found " + type);
      blockType = HoodieLogBlockType.values()[type];
    }

    // 4. Read the header for a log block, if present
    if (nextBlockVersion.hasHeader()) {
      header = HoodieLogBlock.getLogMetadata(inputStream);
    }

    int contentLength = blocksize;
    // 5. Read the content length for the content
    if (!isDefaultVersion) {
      contentLength = (int) inputStream.readLong();
    }

    // 6. Read the content or skip content based on IO vs Memory trade-off by client
    // TODO - have a max block size and reuse this buffer in the ByteBuffer
    // (hard to guess max block size for now)
    long contentPosition = inputStream.getPos();
    byte[] content = HoodieLogBlock.readOrSkipContent(inputStream, contentLength, readBlockLazily);

    // 7. Read footer if any
    Map<HeaderMetadataType, String> footer = null;
    if (nextBlockVersion.hasFooter()) {
      footer = HoodieLogBlock.getLogMetadata(inputStream);
    }

    // 8. Read log block length, if present. This acts as a reverse pointer when traversing a
    // log file in reverse
    long logBlockLength = 0;
    if (nextBlockVersion.hasLogBlockLength()) {
      logBlockLength = inputStream.readLong();
    }

    // 9. Read the log block end position in the log file
    long blockEndPos = inputStream.getPos();

    if (isDefaultVersion) {
      // Default-version blocks carry no type marker, so blockType is still null here and
      // switching on it would throw a NullPointerException. The only default-version handling
      // the reader has is the plain avro data block, so build that directly.
      // NOTE(review): content may not be materialized when readBlockLazily is set - confirm
      // whether default-version blocks need to be read eagerly.
      return HoodieAvroDataBlock.getBlock(content, readerSchema);
    }

    switch (blockType) {
      // based on type read the block
      case AVRO_DATA_BLOCK:
        return HoodieAvroDataBlock.getBlock(logFile, inputStream, Optional.ofNullable(content),
            readBlockLazily, contentPosition, contentLength, blockEndPos, readerSchema, header,
            footer);
      case DELETE_BLOCK:
        return HoodieDeleteBlock.getBlock(logFile, inputStream, Optional.ofNullable(content),
            readBlockLazily, contentPosition, contentLength, blockEndPos, header, footer);
      case COMMAND_BLOCK:
        return HoodieCommandBlock.getBlock(logFile, inputStream, Optional.ofNullable(content),
            readBlockLazily, contentPosition, contentLength, blockEndPos, header, footer);
      default:
        throw new HoodieNotSupportedException("Unsupported Block " + blockType);
    }
  }

  /**
   * Builds a corrupt block spanning from the current position up to the next magic marker (or
   * EOF when no further marker exists).
   */
  private HoodieLogBlock createCorruptBlock() throws IOException {
    log.info("Log " + logFile + " has a corrupted block at " + inputStream.getPos());
    long currentPos = inputStream.getPos();
    long nextBlockOffset = scanForNextAvailableBlockOffset();
    // Rewind to the initial start and read corrupted bytes till the nextBlockOffset
    inputStream.seek(currentPos);
    log.info("Next available block in " + logFile + " starts at " + nextBlockOffset);
    int corruptedBlockSize = (int) (nextBlockOffset - currentPos);
    long contentPosition = inputStream.getPos();
    byte[] corruptedBytes = HoodieLogBlock
        .readOrSkipContent(inputStream, corruptedBlockSize, readBlockLazily);
    return HoodieCorruptBlock.getBlock(logFile, inputStream, Optional.ofNullable(corruptedBytes),
        readBlockLazily, contentPosition, corruptedBlockSize, corruptedBlockSize,
        new HashMap<>(), new HashMap<>());
  }

  /**
   * Checks whether skipping {@code blocksize} bytes from the current position lands on a magic
   * marker or EOF. The stream position is restored before returning.
   *
   * @param blocksize size of the block as read from the stream
   * @return true if the block looks corrupt, false otherwise
   */
  private boolean isBlockCorrupt(int blocksize) throws IOException {
    long currentPos = inputStream.getPos();
    try {
      inputStream.seek(currentPos + blocksize);
    } catch (EOFException e) {
      // this is corrupt
      // This seek is required because contract of seek() is different for naked DFSInputStream
      // vs BufferedFSInputStream
      // release-3.1.0-RC1/DFSInputStream.java#L1455
      // release-3.1.0-RC1/BufferedFSInputStream.java#L73
      inputStream.seek(currentPos);
      return true;
    }

    try {
      readMagic();
      // all good - either we found the sync marker or EOF. Reset position and continue
      return false;
    } catch (CorruptedLogFileException e) {
      // This is a corrupted block
      return true;
    } finally {
      inputStream.seek(currentPos);
    }
  }

  /**
   * Scans forward one byte at a time until a magic marker is found.
   *
   * @return the offset at which the next magic marker starts, or the stream position reached
   *     when EOF is hit first
   */
  private long scanForNextAvailableBlockOffset() throws IOException {
    while (true) {
      long currentPos = inputStream.getPos();
      try {
        boolean hasNextMagic = hasNextMagic();
        if (hasNextMagic) {
          return currentPos;
        } else {
          // No luck - advance and try again
          inputStream.seek(currentPos + 1);
        }
      } catch (EOFException e) {
        return inputStream.getPos();
      }
    }
  }

  /**
   * Closes the underlying stream (once) and deregisters the shutdown hook of this reader so the
   * reader does not stay referenced by the runtime after it has been closed.
   */
  @Override
  public void close() throws IOException {
    closeInputStream();
    removeShutDownHook();
  }

  /**
   * hasNext is not idempotent. TODO - Fix this. It is okay for now - PR
   */
  @Override
  public boolean hasNext() {
    try {
      return readMagic();
    } catch (IOException e) {
      throw new HoodieIOException("IOException when reading logfile " + logFile, e);
    }
  }

  /**
   * Read log format version from log file.
   */
  private HoodieLogFormat.LogFormatVersion readVersion() throws IOException {
    return new HoodieLogFormatVersion(inputStream.readInt());
  }

  /**
   * Consumes the magic marker at the current position.
   *
   * @return true if the marker was found, false if EOF was reached while reading it
   * @throws CorruptedLogFileException if bytes were read but they are not the magic marker
   */
  private boolean readMagic() throws IOException {
    try {
      boolean hasMagic = hasNextMagic();
      if (!hasMagic) {
        throw new CorruptedLogFileException(
            logFile
                + " could not be read. Did not find the magic bytes at the start of the block");
      }
      return hasMagic;
    } catch (EOFException e) {
      // We have reached the EOF
      return false;
    }
  }

  /**
   * Reads the next {@code magicBuffer.length} bytes and compares them with the magic marker.
   *
   * @throws EOFException (from {@code readFully}) when the stream ends before enough bytes are
   *     read
   */
  private boolean hasNextMagic() throws IOException {
    // 1. Read magic header from the start of the block
    inputStream.readFully(magicBuffer, 0, magicBuffer.length);
    return Arrays.equals(magicBuffer, HoodieLogFormat.MAGIC);
  }

  @Override
  public HoodieLogBlock next() {
    try {
      // hasNext() must be called before next()
      return readBlock();
    } catch (IOException io) {
      throw new HoodieIOException("IOException when reading logblock from log file " + logFile, io);
    }
  }

  /**
   * hasPrev is not idempotent.
   *
   * <p>Note that every exception raised inside the try block - including the
   * {@link HoodieNotSupportedException} for a reader that was not opened in reverse mode - is
   * caught below, so this method reports {@code false} instead of throwing.
   */
  @Override
  public boolean hasPrev() {
    try {
      if (!this.reverseReader) {
        throw new HoodieNotSupportedException("Reverse log reader has not been enabled");
      }
      // Step back over the trailing long of the previous block
      reverseLogFilePosition = lastReverseLogFilePosition;
      reverseLogFilePosition -= Long.BYTES;
      lastReverseLogFilePosition = reverseLogFilePosition;
      inputStream.seek(reverseLogFilePosition);
    } catch (Exception e) {
      // Either reached EOF while reading backwards or an exception
      return false;
    }
    return true;
  }

  /**
   * This is a reverse iterator Note: At any point, an instance of HoodieLogFileReader should either
   * iterate reverse (prev) or forward (next). Doing both in the same instance is not supported
   * WARNING : Every call to prev() should be preceded with hasPrev()
   */
  @Override
  public HoodieLogBlock prev() throws IOException {

    if (!this.reverseReader) {
      throw new HoodieNotSupportedException("Reverse log reader has not been enabled");
    }
    long blockSize = inputStream.readLong();
    long blockEndPos = inputStream.getPos();
    // blocksize should read everything about a block including the length as well
    try {
      inputStream.seek(reverseLogFilePosition - blockSize);
    } catch (Exception e) {
      // this could be a corrupt block
      inputStream.seek(blockEndPos);
      throw new CorruptedLogFileException(
          "Found possible corrupted block, cannot read log file in reverse, "
              + "fallback to forward reading of logfile");
    }
    // Called for its side effect only: it consumes the magic marker so that next() starts
    // reading at the block size.
    hasNext();
    reverseLogFilePosition -= blockSize;
    lastReverseLogFilePosition = reverseLogFilePosition;
    return next();
  }

  /**
   * Reverse pointer, does not read the block. Return the current position of the log file (in
   * reverse) If the pointer (inputstream) is moved in any way, it is the job of the client of this
   * class to seek/reset it back to the file position returned from the method to expect correct
   * results
   */
  public long moveToPrev() throws IOException {

    if (!this.reverseReader) {
      throw new HoodieNotSupportedException("Reverse log reader has not been enabled");
    }

    inputStream.seek(lastReverseLogFilePosition);
    long blockSize = inputStream.readLong();
    // blocksize should be everything about a block including the length as well
    inputStream.seek(reverseLogFilePosition - blockSize);
    reverseLogFilePosition -= blockSize;
    lastReverseLogFilePosition = reverseLogFilePosition;
    return reverseLogFilePosition;
  }

  @Override
  public void remove() {
    throw new UnsupportedOperationException("Remove not supported for HoodieLogFileReader");
  }
}