Source code

Java tutorial


Here is the source code for org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.java


 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
package org.apache.hadoop.hbase.regionserver.wal;

import static org.apache.hadoop.hbase.HConstants.REGION_SERVER_HANDLER_COUNT;
import static;


import io.netty.util.concurrent.Future;
import io.netty.util.concurrent.Promise;
import io.netty.util.concurrent.ScheduledFuture;

import java.nio.channels.CompletionHandler;
import java.util.ArrayDeque;
import java.util.Comparator;
import java.util.Deque;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.client.ConnectionUtils;
import org.apache.hadoop.hbase.wal.AsyncFSWALProvider;
import org.apache.hadoop.hbase.wal.WALKey;
import org.apache.hadoop.hbase.wal.WALProvider.AsyncWriter;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.htrace.NullScope;
import org.apache.htrace.Trace;
import org.apache.htrace.TraceScope;

 * An asynchronous implementation of FSWAL.
 * <p>
 * Here 'waitingConsumePayloads' acts as the RingBuffer in FSHLog. We do not use RingBuffer here
 * because RingBuffer needs an exclusive thread to consume the entries in it, and here we want to run
 * the append and sync operation inside EventLoop. We can not use EventLoop as the RingBuffer's
 * executor otherwise the EventLoop can not process any other events such as socket read and write.
 * <p>
 * For append, we process it as follows:
 * <ol>
 * <li>In the caller thread(typically, in the rpc handler thread):
 * <ol>
 * <li>Lock 'waitingConsumePayloads', bump nextTxid, and insert the entry to
 * 'waitingConsumePayloads'.</li>
 * <li>Schedule the consumer task if needed. See {@link #shouldScheduleConsumer()} for more details.
 * </li>
 * </ol>
 * </li>
 * <li>In the consumer task(in the EventLoop thread)
 * <ol>
 * <li>Poll the entry from 'waitingConsumePayloads' and insert it into 'waitingAppendEntries'</li>
 * <li>Poll the entry from 'waitingAppendEntries', append it to the AsyncWriter, and insert it into
 * 'unackedEntries'</li>
 * <li>If the buffered size reaches {@link #batchSize}, or there is a sync request, then we call
 * sync on the AsyncWriter.</li>
 * <li>In the callback methods(CompletionHandler):
 * <ul>
 * <li>If succeeded, poll the entry from 'unackedEntries' and drop it.</li>
 * <li>If failed, add all the entries in 'unackedEntries' back to 'waitingAppendEntries' and wait
 * for writing them again.</li>
 * </ul>
 * </li>
 * </ol>
 * </li>
 * </ol>
 * For sync, the processing stages are almost the same except that if it is not assigned with a new
 * 'txid', we just assign the previous 'txid' to it without bumping the 'nextTxid'. And different
 * from FSHLog, we will open a new writer and rewrite unacked entries to the new writer and sync
 * again if we hit a sync error.
 * <p>
 * Here we only describe the logic of doReplaceWriter. The main logic of rollWriter is the same as in
 * FSHLog.<br>
 * For a normal roll request(for example, we have reached the log roll size):
 * <ol>
 * <li>In the log roller thread, we add a roll payload to 'waitingConsumePayloads', and then wait on
 * the rollPromise(see {@link #waitForSafePoint()}).</li>
 * <li>In the consumer thread, we will stop polling entries from 'waitingConsumePayloads' if we hit
 * a Payload which contains a roll request.</li>
 * <li>Append all entries to current writer, issue a sync request if possible.</li>
 * <li>If sync succeeded, check if we could finish a roll request. There are 3 conditions:
 * <ul>
 * <li>'rollPromise' is not null which means we have a pending roll request.</li>
 * <li>'waitingAppendEntries' is empty.</li>
 * <li>'unackedEntries' is empty.</li>
 * </ul>
 * </li>
 * <li>Back to the log roller thread, now we can confirm that there are no out-going entries, i.e.,
 * we reach a safe point. So it is safe to replace old writer with new writer now.</li>
 * <li>Acquire 'waitingConsumePayloads' lock, set 'writerBroken' and 'waitingRoll' to false, cancel
 * log roller exit checker if any(see the comments in the 'failed' method of the sync
 * CompletionHandler to see why we need a checker here).</li>
 * <li>Schedule the consumer task if needed.</li>
 * <li>Schedule a background task to close the old writer.</li>
 * </ol>
 * For a broken writer roll request, the only difference is that we can bypass the wait for safe
 * point stage. See the comments in the 'failed' method of the sync CompletionHandler for more
 * details.
public class AsyncFSWAL extends AbstractFSWAL<AsyncWriter> {

    private static final Log LOG = LogFactory.getLog(AsyncFSWAL.class);

    /** Configuration key for the number of buffered bytes that triggers a sync of the writer. */
    public static final String WAL_BATCH_SIZE = "hbase.wal.batch.size";
    /** Default value for {@link #WAL_BATCH_SIZE}: 64 KB. */
    public static final long DEFAULT_WAL_BATCH_SIZE = 64L * 1024;

    /** Configuration key for the maximum number of retries when creating a WAL writer. */
    public static final String ASYNC_WAL_CREATE_MAX_RETRIES = "hbase.wal.async.create.retries";
    /** Default value for {@link #ASYNC_WAL_CREATE_MAX_RETRIES}. */
    public static final int DEFAULT_ASYNC_WAL_CREATE_MAX_RETRIES = 10;

    /**
     * Configuration key for the interval, in milliseconds, at which we check whether the log roller
     * has exited.
     * <p>
     * The key must not be empty: an empty property name can never be set by an operator, so every
     * lookup would silently fall back to the default below.
     * NOTE(review): key restored to follow the naming of the sibling "hbase.wal.async.*" keys;
     * confirm against the upstream HBase source.
     */
    public static final String ASYNC_WAL_LOG_ROLLER_EXITED_CHECK_INTERVAL_MS =
            "hbase.wal.async.logroller.exited.check.interval.ms";
    /** Default value for {@link #ASYNC_WAL_LOG_ROLLER_EXITED_CHECK_INTERVAL_MS}: one second. */
    public static final long DEFAULT_ASYNC_WAL_LOG_ROLLER_EXITED_CHECK_INTERVAL_MS = 1000;

     * Carry things that we want to pass to the consume task in event loop. Only one field can be
     * non-null.
     * <p>
     * TODO: need to unify this and {@link RingBufferTruck}. They are mostly the same thing.
    private static final class Payload {

        // a wal entry which need to be appended
        public final FSWALEntry entry;

        // indicate that we need to sync our wal writer.
        public final SyncFuture sync;

        // incidate that we want to roll the writer.
        public final Promise<Void> roll;

        public Payload(FSWALEntry entry) {
            this.entry = entry;
            this.sync = null;
            this.roll = null;

        public Payload(SyncFuture sync) {
            this.entry = null;
            this.sync = sync;
            this.roll = null;

        public Payload(Promise<Void> roll) {
            this.entry = null;
            this.sync = null;
            this.roll = roll;

        public String toString() {
            return "Payload [entry=" + entry + ", sync=" + sync + ", roll=" + roll + "]";

    // event loop supplied to the constructor; used to create the roll promise/future and passed to
    // AsyncFSWALProvider.createAsyncWriter.
    private final EventLoop eventLoop;

    // payloads queued by caller threads for the consumer task. This deque is also the monitor that
    // guards consumerScheduled, waitingRoll, writerBroken and nextTxid (see the synchronized blocks).
    private final Deque<Payload> waitingConsumePayloads;

    // like the ringbuffer sequence. Every FSWALEntry and SyncFuture will be assigned a txid and
    // then added to waitingConsumePayloads.
    private long nextTxid = 1L;

    // true while a consumer run has been claimed (set in shouldScheduleConsumer, waitForSafePoint
    // and doReplaceWriter); cleared by the consumer itself.
    private boolean consumerScheduled;

    // new writer is created and we are waiting for old writer to be closed.
    private boolean waitingRoll;

    // writer is broken and rollWriter is needed.
    private boolean writerBroken;

    // read from WAL_BATCH_SIZE; consume() syncs once the unsynced length reaches this value.
    private final long batchSize;

    // retry bound checked in createWriterInstance.
    private final int createMaxRetries;

    // read from ASYNC_WAL_LOG_ROLLER_EXITED_CHECK_INTERVAL_MS; used as the timing arguments when the
    // sync failure handler sets up the LogRollerExitedChecker.
    private final long logRollerExitedCheckIntervalMs;

    // daemon thread pool (threads named "Close-WAL-Writer-N") that doReplaceWriter submits the
    // old-writer task to.
    private final ExecutorService closeExecutor = Executors.newCachedThreadPool(
            new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Close-WAL-Writer-%d").build());

    // output of the current writer when it is an AsyncProtobufLogWriter (set in doReplaceWriter);
    // read by getPipeline().
    private volatile AsyncFSOutput fsOut;

    // entries that consume() iterates over and appends to the writer.
    private final Deque<FSWALEntry> waitingAppendEntries = new ArrayDeque<FSWALEntry>();

    // entries walked by the sync completion handlers (forward on success, backward on failure).
    private final Deque<FSWALEntry> unackedEntries = new ArrayDeque<FSWALEntry>();

    // pending sync futures ordered by SEQ_COMPARATOR; finishSync() inspects them from the head.
    private final PriorityQueue<SyncFuture> syncFutures = new PriorityQueue<SyncFuture>(11, SEQ_COMPARATOR);

    // promise of the pending roll request taken from a roll Payload; null when there is none.
    private Promise<Void> rollPromise;

    // the highest txid of WAL entries being processed
    private long highestProcessedTxid;

    // file length when we issue last sync request on the writer
    private long fileLengthAtLastSync;

    // set to true by logRollerExited(); read by LogRollerExitedChecker.run().
    private volatile boolean logRollerExited;

    /**
     * Task created by the sync failure handler. Once {@code logRollerExited} is set and the task has
     * not been cancelled, it completes the pending sync futures and the queued sync payloads with an
     * IOException("sync failed but log roller exited").
     * <p>
     * NOTE(review): this extract is truncated -- closing braces, the early returns after the
     * {@code !logRollerExited} / {@code cancelled} checks, the body of the inner {@code try} and the
     * body of {@code cancel()} beyond setting the flag appear to be missing. Restore them from the
     * upstream source before compiling.
     */
    private final class LogRollerExitedChecker implements Runnable {

        // set by cancel(); checked in run() under this object's monitor.
        private boolean cancelled;

        // handle of the scheduled task, stored by setFuture().
        private ScheduledFuture<?> future;

        /** Stores the handle returned when this checker was scheduled. */
        public synchronized void setFuture(ScheduledFuture<?> future) {
            this.future = future;

        public void run() {
            if (!logRollerExited) {
            // rollWriter is called in the log roller thread, and logRollerExited will be set just before
            // the log roller exits. So here we can confirm that no one could cancel us if the 'cancelled'
            // check passed. So it is safe to release the lock after checking the 'cancelled' flag.
            synchronized (this) {
                if (cancelled) {
            IOException error = new IOException("sync failed but log roller exited");
            // fail the sync futures that are already queued for the writer.
            for (SyncFuture future; (future = syncFutures.peek()) != null;) {
                future.done(highestProcessedTxid, error);
            // then walk the payloads that the consumer has not picked up yet.
            synchronized (waitingConsumePayloads) {
                for (Payload p : waitingConsumePayloads) {
                    if (p.entry != null) {
                        try {
                        } catch (IOException e) {
                            throw new AssertionError("should not happen", e);
                    } else if (p.sync != null) {
                        p.sync.done(nextTxid, error);

        /** Marks this checker as cancelled so that a later run() does not fail the pending syncs. */
        public synchronized void cancel() {
            cancelled = true;

    // the checker created by the most recent sync failure; reset to null in doReplaceWriter.
    private LogRollerExitedChecker logRollerExitedChecker;

    /**
     * Creates the WAL. All arguments except {@code eventLoop} are forwarded unchanged to the
     * superclass constructor.
     * <p>
     * The payload queue is pre-sized to three times the configured region server handler count
     * (default 200), and the batch size and the log-roller-exited check interval are read from
     * {@code conf}.
     * <p>
     * NOTE(review): this extract is truncated -- the last statement stops in the middle of its
     * argument list and no assignment to {@code createMaxRetries} is visible.
     *
     * @param eventLoop the event loop stored in {@link #eventLoop}
     */
    public AsyncFSWAL(FileSystem fs, Path rootDir, String logDir, String archiveDir, Configuration conf,
            List<WALActionsListener> listeners, boolean failIfWALExists, String prefix, String suffix,
            EventLoop eventLoop) throws FailedLogCloseException, IOException {
        super(fs, rootDir, logDir, archiveDir, conf, listeners, failIfWALExists, prefix, suffix);
        this.eventLoop = eventLoop;
        int maxHandlersCount = conf.getInt(REGION_SERVER_HANDLER_COUNT, 200);
        waitingConsumePayloads = new ArrayDeque<Payload>(maxHandlersCount * 3);
        batchSize = conf.getLong(WAL_BATCH_SIZE, DEFAULT_WAL_BATCH_SIZE);
        logRollerExitedCheckIntervalMs = conf.getLong(ASYNC_WAL_LOG_ROLLER_EXITED_CHECK_INTERVAL_MS,

    /**
     * Clears the pending roll request once the three conditions listed below hold.
     * <p>
     * NOTE(review): this extract is truncated -- only the reset of {@code rollPromise} is visible;
     * whatever completes the promise before it is cleared, and the closing braces, are missing.
     */
    private void tryFinishRoll() {
        // 1. a roll is requested
        // 2. we have written out all entries before the roll point.
        // 3. all entries have been acked.
        if (rollPromise != null && waitingAppendEntries.isEmpty() && unackedEntries.isEmpty()) {
            rollPromise = null;

    /**
     * Issues an asynchronous sync on {@code writer} and installs the completion handler.
     * <p>
     * Before the request is issued, the current writer length is recorded in
     * {@code fileLengthAtLastSync}.
     * <p>
     * NOTE(review): this extract is truncated -- several loop and branch bodies, the left operand of
     * the txid comparison, the scheduling call for the LogRollerExitedChecker and all closing braces
     * are missing. The comments below describe only what is still visible.
     *
     * @param writer the writer to sync
     * @param processedTxid txid bound compared against the unacked entries on success
     */
    private void sync(final AsyncWriter writer, final long processedTxid) {
        fileLengthAtLastSync = writer.getLength();
        final long startTimeNs = System.nanoTime();
        writer.sync(new CompletionHandler<Long, Void>() {

            public void completed(Long result, Void attachment) {
                // complete the sync futures that are now covered, then walk the unacked entries.
                int syncCount = finishSync(true);
                for (Iterator<FSWALEntry> iter = unackedEntries.iterator(); iter.hasNext();) {
                    if ( <= processedTxid) {
                    } else {
                // report the elapsed time and the number of finished syncs.
                postSync(System.nanoTime() - startTimeNs, syncCount);
                // only look at the roll size if the roll lock can be taken without blocking.
                if (!rollWriterLock.tryLock()) {
                try {
                    if (writer.getLength() >= logrollsize) {
                } finally {

            public void failed(Throwable exc, Void attachment) {
                LOG.warn("sync failed", exc);
                // Here we depend on the implementation of FanOutOneBlockAsyncDFSOutput and netty.
                // When an error occurs, FanOutOneBlockAsyncDFSOutput will fail all pending flush requests.
                // This is executed inside the EventLoop. And DefaultPromise in netty will notify the
                // listener directly if it is already in the EventLoop thread. And the listener method will
                // call us. So here we know that all failed flush requests will call us continuously, and
                // before the last one finishes, no other task can be executed in the EventLoop. So here we
                // are safe to use writerBroken as a guard.
                // Do not forget to revisit this if we change the implementation of
                // FanOutOneBlockAsyncDFSOutput!
                synchronized (waitingConsumePayloads) {
                    if (writerBroken) {
                    // schedule a periodic task to check whether the log roller has exited. Otherwise the
                    // sync request may be blocked forever since we are still waiting for a new writer to
                    // write the pending data and sync it...
                    logRollerExitedChecker = new LogRollerExitedChecker();
                    // we are currently in the EventLoop thread, so it is safe to set the future after
                    // scheduling it since the task can not be executed before we release the thread.
                            logRollerExitedCheckIntervalMs, logRollerExitedCheckIntervalMs, TimeUnit.MILLISECONDS));
                    writerBroken = true;
                // walk the unacked entries from newest to oldest.
                for (Iterator<FSWALEntry> iter = unackedEntries.descendingIterator(); iter.hasNext();) {
                highestUnsyncedTxid = highestSyncedTxid.get();
                if (rollPromise != null) {
                    rollPromise = null;
                // request a roll.
                if (!rollWriterLock.tryLock()) {
                try {
                } finally {
        }, null);

    /**
     * Continues the trace span attached to {@code future}.
     * <p>
     * NOTE(review): this extract is truncated -- the statements that would use {@code annotation}
     * and hand the span back to the future, and the closing brace, are not visible.
     *
     * @param future the sync future whose span is continued
     * @param annotation the annotation text; unused in the visible part of the body
     */
    private void addTimeAnnotation(SyncFuture future, String annotation) {
        TraceScope scope = Trace.continueSpan(future.getSpan());

    /**
     * Completes the queued sync futures whose txid is not greater than the highest synced txid.
     * <p>
     * NOTE(review): this extract is truncated -- the removal of the finished future from the queue,
     * the increment of {@code finished}, the {@code else} body and the closing braces are missing,
     * and {@code addSyncTrace} is not used in the visible part.
     *
     * @param addSyncTrace unused in the visible part of the body
     * @return the {@code finished} counter
     */
    private int finishSync(boolean addSyncTrace) {
        long doneTxid = highestSyncedTxid.get();
        int finished = 0;
        // syncFutures is ordered by txid, so inspect it from the head.
        for (SyncFuture future; (future = syncFutures.peek()) != null;) {
            if (future.getTxid() <= doneTxid) {
                future.done(doneTxid, null);
                addTimeAnnotation(future, "writer synced");
            } else {
        return finished;

    /**
     * Appends the entries in {@code waitingAppendEntries} to the current writer and then decides
     * whether to sync.
     * <p>
     * A sync is issued when the unsynced length has reached {@code batchSize}, or when there is
     * unsynced data and a sync future or roll request is pending.
     * <p>
     * NOTE(review): this extract is truncated -- the right-hand side of {@code FSWALEntry entry =},
     * the bodies after {@code if (appended)} and after the last {@code else if}, the call that the
     * first comment refers to, and the closing braces are missing.
     */
    private void consume() {
        final AsyncWriter writer = this.writer;
        // maybe a sync request is not queued when we issue a sync, so check here to see if we could
        // finish some.
        long newHighestProcessedTxid = -1L;
        for (Iterator<FSWALEntry> iter = waitingAppendEntries.iterator(); iter.hasNext();) {
            FSWALEntry entry =;
            boolean appended;
            try {
                appended = append(writer, entry);
            } catch (IOException e) {
                throw new AssertionError("should not happen", e);
            newHighestProcessedTxid = entry.getTxid();
            if (appended) {
                if (writer.getLength() - fileLengthAtLastSync >= batchSize) {
        // if we have a newer transaction id, update it.
        // otherwise, use the previous transaction id.
        if (newHighestProcessedTxid > 0) {
            highestProcessedTxid = newHighestProcessedTxid;
        } else {
            newHighestProcessedTxid = highestProcessedTxid;
        if (writer.getLength() - fileLengthAtLastSync >= batchSize) {
            // sync because of the buffer size limit.
            sync(writer, newHighestProcessedTxid);
        } else if ((!syncFutures.isEmpty() || rollPromise != null) && writer.getLength() > fileLengthAtLastSync) {
            // first we should have at least one sync request or a roll request
            // second we should have some unsynced data.
            sync(writer, newHighestProcessedTxid);
        } else if (writer.getLength() == fileLengthAtLastSync) {
            // we haven't written anything out, just advance the highestSyncedSequence since we may
            // only have stamped some region sequence id.

    private static final Comparator<SyncFuture> SEQ_COMPARATOR = new Comparator<SyncFuture>() {

        public int compare(SyncFuture o1, SyncFuture o2) {
            int c =, o2.getTxid());
            return c != 0 ? c :, System.identityHashCode(o2));

    /**
     * The consumer task. According to the class comment it runs in the EventLoop thread: it drains
     * {@code waitingConsumePayloads} and afterwards decides whether it has to be scheduled again.
     * <p>
     * NOTE(review): this extract is truncated -- the early returns, the handling of entry and sync
     * payloads, the call between the two synchronized blocks, the final reschedule call and all
     * closing braces are missing.
     */
    private final Runnable consumer = new Runnable() {

        public void run() {
            synchronized (waitingConsumePayloads) {
                assert consumerScheduled;
                if (writerBroken) {
                    // waiting for reschedule after rollWriter.
                    consumerScheduled = false;
                if (waitingRoll) {
                    // we may have toWriteEntries if the consume method does not write all pending
                    // entries out; this usually happens if we have too many toWriteEntries that
                    // exceeded the batchSize limit.
                    if (waitingAppendEntries.isEmpty()) {
                        consumerScheduled = false;
                } else {
                    for (Payload p; (p = waitingConsumePayloads.pollFirst()) != null;) {
                        if (p.entry != null) {
                        } else if (p.sync != null) {
                        } else {
                            // a roll request: remember its promise and mark the roll as pending.
                            rollPromise = p.roll;
                            waitingRoll = true;
            synchronized (waitingConsumePayloads) {
                if (waitingRoll) {
                    if (waitingAppendEntries.isEmpty()) {
                        consumerScheduled = false;
                } else {
                    if (waitingConsumePayloads.isEmpty() && waitingAppendEntries.isEmpty()) {
                        consumerScheduled = false;
            // reschedule if we still have something to write.

    private boolean shouldScheduleConsumer() {
        if (writerBroken || waitingRoll) {
            return false;
        if (consumerScheduled) {
            return false;
        consumerScheduled = true;
        return true;

    /**
     * Queues a WAL entry for the consumer task and returns the txid assigned to it.
     * <p>
     * The txid is taken from {@code nextTxid} and the payload is queued while holding the
     * {@code waitingConsumePayloads} monitor.
     * <p>
     * NOTE(review): this extract is truncated -- the body of {@code if (scheduleTask)} and the
     * closing braces are missing.
     *
     * @return the txid assigned to the new entry
     * @throws IOException if the log is closed
     */
    public long append(HRegionInfo hri, WALKey key, WALEdit edits, boolean inMemstore) throws IOException {
        boolean scheduleTask;
        long txid;
        synchronized (waitingConsumePayloads) {
            if (this.closed) {
                throw new IOException("Cannot append; log is closed");
            txid = nextTxid++;
            FSWALEntry entry = new FSWALEntry(txid, key, edits, hri, inMemstore);
            scheduleTask = shouldScheduleConsumer();
            waitingConsumePayloads.add(new Payload(entry));
        if (scheduleTask) {
        return txid;

    /**
     * Queues a sync request for everything appended so far and blocks until it completes.
     * <p>
     * The request reuses the last assigned txid ({@code nextTxid - 1}) instead of bumping
     * {@code nextTxid}.
     * <p>
     * NOTE(review): this extract is truncated -- the body of {@code if (scheduleTask)}, the
     * remainder of the {@code finally} block and the closing braces are missing.
     *
     * @throws IOException declared by the signature; raised by the blocking wait, presumably
     */
    public void sync() throws IOException {
        TraceScope scope = Trace.startSpan("AsyncFSWAL.sync");
        try {
            SyncFuture future;
            boolean scheduleTask;
            synchronized (waitingConsumePayloads) {
                scheduleTask = shouldScheduleConsumer();
                future = getSyncFuture(nextTxid - 1, scope.detach());
                waitingConsumePayloads.addLast(new Payload(future));
            if (scheduleTask) {
            // wait for the sync to complete and pick the trace span up again.
            scope = Trace.continueSpan(blockOnSync(future));
        } finally {
            assert scope == NullScope.INSTANCE || !scope.isDetached();

    /**
     * Queues a sync request for the given txid and blocks until it completes.
     * <p>
     * NOTE(review): this extract is truncated -- the body of the first {@code if} (presumably an
     * early return when the txid is already synced), the body of {@code if (scheduleTask)}, the
     * remainder of the {@code finally} block and the closing braces are missing.
     *
     * @param txid the transaction id to sync up to
     * @throws IOException declared by the signature; raised by the blocking wait, presumably
     */
    public void sync(long txid) throws IOException {
        if (highestSyncedTxid.get() >= txid) {
        TraceScope scope = Trace.startSpan("AsyncFSWAL.sync");
        try {
            SyncFuture future = getSyncFuture(txid, scope.detach());
            boolean scheduleTask;
            synchronized (waitingConsumePayloads) {
                scheduleTask = shouldScheduleConsumer();
                waitingConsumePayloads.addLast(new Payload(future));
            if (scheduleTask) {
            // wait for the sync to complete and pick the trace span up again.
            scope = Trace.continueSpan(blockOnSync(future));
        } finally {
            assert scope == NullScope.INSTANCE || !scope.isDetached();

    public void logRollerExited() {
        logRollerExited = true;

    /**
     * Creates an async writer for {@code path}, retrying on failure.
     * <p>
     * The first attempt does not overwrite an existing file. After a plain IOException the next
     * attempt overwrites the file, and the thread sleeps for a pause derived from the retry count.
     * A NameNodeException is rethrown immediately.
     * <p>
     * NOTE(review): this extract is truncated -- the bodies of the {@code retry >= createMaxRetries}
     * checks (presumably a {@code break}) and the closing braces are missing.
     *
     * @param path the file to create the writer for
     * @return the new writer
     * @throws IOException if creation fails; an InterruptedIOException if the sleep is interrupted
     */
    protected AsyncWriter createWriterInstance(Path path) throws IOException {
        boolean overwrite = false;
        for (int retry = 0;; retry++) {
            try {
                return AsyncFSWALProvider.createAsyncWriter(conf, fs, path, overwrite, eventLoop);
            } catch (RemoteException e) {
                LOG.warn("create wal log writer " + path + " failed, retry = " + retry, e);
                if (shouldRetryCreate(e)) {
                    if (retry >= createMaxRetries) {
                } else {
                    // not retryable: surface the exception wrapped by the RemoteException.
                    throw e.unwrapRemoteException();
            } catch (NameNodeException e) {
                throw e;
            } catch (IOException e) {
                LOG.warn("create wal log writer " + path + " failed, retry = " + retry, e);
                if (retry >= createMaxRetries) {
                // overwrite the old broken file.
                overwrite = true;
                try {
                    Thread.sleep(ConnectionUtils.getPauseTime(100, retry));
                } catch (InterruptedException ie) {
                    throw new InterruptedIOException();
        throw new IOException(
                "Failed to create wal log writer " + path + " after retrying " + createMaxRetries + " time(s)");

    /**
     * Queues a roll request for the consumer when the writer is healthy and non-null.
     * <p>
     * If the writer is broken or null, an already succeeded future is used instead and nothing is
     * queued. This matches the class comment: a broken-writer roll bypasses the safe point wait.
     * <p>
     * NOTE(review): this extract is truncated -- the body of {@code if (scheduleTask)}, the wait on
     * {@code roll} and the closing braces are missing.
     */
    private void waitForSafePoint() {
        Future<Void> roll;
        boolean scheduleTask;
        synchronized (waitingConsumePayloads) {
            if (!writerBroken && this.writer != null) {
                Promise<Void> promise = eventLoop.newPromise();
                // claim the consumer schedule directly: shouldScheduleConsumer() is not used here.
                if (consumerScheduled) {
                    scheduleTask = false;
                } else {
                    scheduleTask = consumerScheduled = true;
                waitingConsumePayloads.addLast(new Payload(promise));
                roll = promise;
            } else {
                roll = eventLoop.newSucceededFuture(null);
                scheduleTask = false;
        if (scheduleTask) {

    /**
     * Replaces the current writer with {@code nextWriter} and returns the length of the old file.
     * <p>
     * It resets {@code fileLengthAtLastSync}, clears {@code writerBroken} and {@code waitingRoll},
     * drops the log roller exit checker, claims the consumer schedule if work is pending, and
     * submits a task for the old writer to {@code closeExecutor}.
     * <p>
     * NOTE(review): this extract is truncated -- the call that precedes the replacement (presumably
     * the safe point wait), the cancellation of the checker, the body of {@code if (scheduleTask)},
     * the body of the inner {@code try} and the closing braces are missing.
     *
     * @param oldPath not used in the visible part of the body
     * @param newPath not used in the visible part of the body
     * @param nextWriter the writer to install; may be null
     * @return the length of the old writer, or 0 if there was none
     * @throws IOException declared by the signature
     */
    protected long doReplaceWriter(Path oldPath, Path newPath, AsyncWriter nextWriter) throws IOException {
        final AsyncWriter oldWriter = this.writer;
        this.writer = nextWriter;
        if (nextWriter != null && nextWriter instanceof AsyncProtobufLogWriter) {
            this.fsOut = ((AsyncProtobufLogWriter) nextWriter).getOutput();
        this.fileLengthAtLastSync = 0L;
        boolean scheduleTask;
        synchronized (waitingConsumePayloads) {
            writerBroken = waitingRoll = false;
            if (logRollerExitedChecker != null) {
                logRollerExitedChecker = null;
            if (consumerScheduled) {
                scheduleTask = false;
            } else {
                // only schedule the consumer if there is something for it to do.
                if (waitingConsumePayloads.isEmpty() && waitingAppendEntries.isEmpty()) {
                    scheduleTask = false;
                } else {
                    scheduleTask = consumerScheduled = true;
        if (scheduleTask) {
        long oldFileLen;
        if (oldWriter != null) {
            oldFileLen = oldWriter.getLength();
            // hand the old writer to the background pool.
            closeExecutor.execute(new Runnable() {

                public void run() {
                    try {
                    } catch (IOException e) {
                        LOG.warn("close old writer failed", e);
        } else {
            oldFileLen = 0L;
        return oldFileLen;

    /**
     * Shuts this WAL down; the visible part only drops the reference to the current writer.
     * <p>
     * NOTE(review): the closing brace is missing in this extract, and the declared IOException has
     * no visible source, so further statements have probably been dropped as well -- confirm
     * against the upstream source.
     */
    protected void doShutdown() throws IOException {
        this.writer = null;

    /**
     * Appends {@code entry} to {@code writer}.
     * <p>
     * NOTE(review): the body and the closing brace are missing in this extract; the description
     * above is taken from the method name and parameters only.
     */
    protected void doAppend(AsyncWriter writer, FSWALEntry entry) {

    DatanodeInfo[] getPipeline() {
        AsyncFSOutput output = this.fsOut;
        return output != null ? output.getPipeline() : new DatanodeInfo[0];

    int getLogReplication() {
        return getPipeline().length;