org.apache.accumulo.master.TabletGroupWatcher.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.accumulo.master.TabletGroupWatcher.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.master;

import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly;
import static java.lang.Math.min;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.TimeUnit;

import org.apache.accumulo.core.Constants;
import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.BatchWriter;
import org.apache.accumulo.core.client.BatchWriterConfig;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.MutationsRejectedException;
import org.apache.accumulo.core.client.RowIterator;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.PartialKey;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.data.impl.KeyExtent;
import org.apache.accumulo.core.master.state.tables.TableState;
import org.apache.accumulo.core.master.thrift.MasterState;
import org.apache.accumulo.core.master.thrift.TabletServerStatus;
import org.apache.accumulo.core.metadata.MetadataTable;
import org.apache.accumulo.core.metadata.RootTable;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.ChoppedColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.CurrentLocationColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.DataFileColumnFamily;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.FutureLocationColumnFamily;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.tabletserver.thrift.NotServingTabletException;
import org.apache.accumulo.core.util.Daemon;
import org.apache.accumulo.master.Master.TabletGoalState;
import org.apache.accumulo.master.state.MergeStats;
import org.apache.accumulo.master.state.TableCounts;
import org.apache.accumulo.master.state.TableStats;
import org.apache.accumulo.server.ServerConstants;
import org.apache.accumulo.server.conf.TableConfiguration;
import org.apache.accumulo.server.fs.FileRef;
import org.apache.accumulo.server.fs.VolumeManager.FileType;
import org.apache.accumulo.server.log.WalStateManager;
import org.apache.accumulo.server.log.WalStateManager.WalMarkerException;
import org.apache.accumulo.server.master.LiveTServerSet.TServerConnection;
import org.apache.accumulo.server.master.state.Assignment;
import org.apache.accumulo.server.master.state.ClosableIterator;
import org.apache.accumulo.server.master.state.DistributedStoreException;
import org.apache.accumulo.server.master.state.MergeInfo;
import org.apache.accumulo.server.master.state.MergeState;
import org.apache.accumulo.server.master.state.TServerInstance;
import org.apache.accumulo.server.master.state.TabletLocationState;
import org.apache.accumulo.server.master.state.TabletLocationState.BadLocationStateException;
import org.apache.accumulo.server.master.state.TabletState;
import org.apache.accumulo.server.master.state.TabletStateStore;
import org.apache.accumulo.server.tables.TableManager;
import org.apache.accumulo.server.tablets.TabletTime;
import org.apache.accumulo.server.util.MetadataTableUtil;
import org.apache.accumulo.server.zookeeper.ZooReaderWriter;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.thrift.TException;

import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Iterators;

abstract class TabletGroupWatcher extends Daemon {
    // Constants used to make sure assignment logging isn't excessive in quantity or size
    // NOTE(review): neither constant is referenced in the portion of the class reviewed here;
    // presumably they are used when building the assignment log message further down - confirm.
    private static final String ASSIGNMENT_BUFFER_SEPARATOR = ", ";
    // NOTE(review): "ASSINGMENT" is a misspelling of "ASSIGNMENT"; renaming it requires updating every usage in the file.
    private static final int ASSINGMENT_BUFFER_MAX_LENGTH = 4096;

    // Master that owns this watcher; supplies the live tserver set, migrations, merges and the master lock.
    private final Master master;
    // Source of the TabletLocationState entries walked by run().
    final TabletStateStore store;
    // Optional watcher consulted in run() before unloading hosted tablets while every tserver is shutting down; may be null.
    final TabletGroupWatcher dependentWatcher;

    // Master state sampled at the top of each pass of run() and returned by statsState().
    // Written by the watcher thread; it is neither volatile nor guarded by a lock.
    private MasterState masterState;

    // Per-table tablet state counts, bracketed by stats.begin()/stats.end() in run().
    final TableStats stats = new TableStats();
    // Tservers that were live when the most recent scan ran; accessed while synchronized on this watcher.
    private SortedSet<TServerInstance> lastScanServers = ImmutableSortedSet.of();

    /**
     * Creates a watcher over the tablets held in {@code store}.
     *
     * @param master
     *            the master this watcher acts on behalf of
     * @param store
     *            the store whose tablets this watcher walks
     * @param dependentWatcher
     *            another watcher that {@code run()} consults (and null-checks) before unloading hosted tablets during a full shutdown
     */
    TabletGroupWatcher(Master master, TabletStateStore store, TabletGroupWatcher dependentWatcher) {
        this.dependentWatcher = dependentWatcher;
        this.store = store;
        this.master = master;
    }

    /**
     * Should this {@code TabletGroupWatcher} suspend tablets?
     *
     * @return {@code true} if this watcher should suspend tablets, as decided by the concrete subclass
     */
    abstract boolean canSuspendTablets();

    /**
     * Returns the most recently recorded tablet counts from {@code stats}.
     *
     * @return whatever {@code stats.getLast()} currently reports
     */
    Map<String, TableCounts> getStats() {
        final Map<String, TableCounts> latest = this.stats.getLast();
        return latest;
    }

    /**
     * Returns the master state under which stats were collected.
     *
     * @return the value most recently stored in the {@code masterState} field ({@code null} until it is first assigned)
     */
    MasterState statsState() {
        return this.masterState;
    }

    /**
     * Returns the most recently recorded tablet counts for one table.
     *
     * @param tableId
     *            id of the table to look up
     * @return whatever {@code stats.getLast(tableId)} currently reports
     */
    TableCounts getStats(String tableId) {
        final TableCounts latest = this.stats.getLast(tableId);
        return latest;
    }

    /**
     * Tells whether the live tservers in {@code candidates} are exactly the ones recorded for the last assignment scan.
     *
     * @param candidates
     *            the set of tservers to compare against the recorded set
     * @return {@code true} when {@code candidates} equals the recorded set of tservers
     */
    public synchronized boolean isSameTserversAsLastScan(Set<TServerInstance> candidates) {
        final boolean unchanged = candidates.equals(this.lastScanServers);
        return unchanged;
    }

    /**
     * Main loop of the watcher: for as long as this process is still the master, repeatedly walks every tablet in
     * {@code store}, compares its current state with its goal state, and queues or sends the work needed to move the
     * tablet towards that goal (assignments, unloads, chop/split requests, log recovery).
     *
     * <p>
     * Queued changes are handed to {@code flushChanges} in batches so tablet servers are not overwhelmed, and once more
     * at the end of each pass. After a pass, per-state counts are reported as events, merge state is advanced, and the
     * thread waits for the next event unless the set of live tservers changed during the pass.
     *
     * <p>
     * Any exception aborts the current pass: it is logged, and either a metadata repair is attempted (when the cause is
     * a {@code BadLocationStateException}) or the thread sleeps before retrying. The store iterator is always closed.
     */
    @Override
    public void run() {
        Thread.currentThread().setName("Watching " + store.name());
        // counts from the previous pass, used to report only the states whose counts changed
        int[] oldCounts = new int[TabletState.values().length];
        EventCoordinator.Listener eventListener = this.master.nextEvent.getListener();

        WalStateManager wals = new WalStateManager(master.getInstance(), ZooReaderWriter.getInstance());

        while (this.master.stillMaster()) {
            // slow things down a little, otherwise we spam the logs when there are many wake-up events
            sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
            masterState = master.getMasterState();

            // totalUnloaded counts unload requests for the whole pass; unloaded is reset after every intermediate flush
            int totalUnloaded = 0;
            int unloaded = 0;
            ClosableIterator<TabletLocationState> iter = null;
            try {
                // mergeStatsCache: stats for every table seen in this pass; currentMerges: only tables with an active merge
                Map<String, MergeStats> mergeStatsCache = new HashMap<>();
                Map<String, MergeStats> currentMerges = new HashMap<>();
                for (MergeInfo merge : master.merges()) {
                    if (merge.getExtent() != null) {
                        currentMerges.put(merge.getExtent().getTableId(), new MergeStats(merge));
                    }
                }

                // Get the current status for the current list of tservers
                SortedMap<TServerInstance, TabletServerStatus> currentTServers = new TreeMap<>();
                for (TServerInstance entry : this.master.tserverSet.getCurrentServers()) {
                    currentTServers.put(entry, this.master.tserverStatus.get(entry));
                }

                // Nothing can be assigned without tservers: wait, record an empty server set, and start over
                if (currentTServers.size() == 0) {
                    eventListener.waitForEvents(Master.TIME_TO_WAIT_BETWEEN_SCANS);
                    synchronized (this) {
                        lastScanServers = ImmutableSortedSet.of();
                    }
                    continue;
                }

                // Don't move tablets to servers that are shutting down
                SortedMap<TServerInstance, TabletServerStatus> destinations = new TreeMap<>(currentTServers);
                destinations.keySet().removeAll(this.master.serversToShutdown);

                // Work accumulated during the walk and handed to flushChanges
                List<Assignment> assignments = new ArrayList<>();
                List<Assignment> assigned = new ArrayList<>();
                List<TabletLocationState> assignedToDeadServers = new ArrayList<>();
                List<TabletLocationState> suspendedToGoneServers = new ArrayList<>();
                Map<KeyExtent, TServerInstance> unassigned = new HashMap<>();
                Map<TServerInstance, List<Path>> logsForDeadServers = new TreeMap<>();

                // NOTE(review): this local shadows the masterState field assigned at the top of the loop;
                // stats.end(...) below receives this local value, which is re-read from the master here.
                MasterState masterState = master.getMasterState();
                int[] counts = new int[TabletState.values().length];
                stats.begin();
                // Walk through the tablets in our store, and work tablets
                // towards their goal
                iter = store.iterator();
                while (iter.hasNext()) {
                    TabletLocationState tls = iter.next();
                    if (tls == null) {
                        continue;
                    }
                    Master.log.debug(store.name() + " location State: " + tls);
                    // ignore entries for tables that do not exist in zookeeper
                    if (TableManager.getInstance().getTableState(tls.extent.getTableId()) == null)
                        continue;

                    if (Master.log.isTraceEnabled())
                        Master.log.trace(tls + " walogs " + tls.walogs.size());

                    // Don't overwhelm the tablet servers with work
                    if (unassigned.size() + unloaded > Master.MAX_TSERVER_WORK_CHUNK * currentTServers.size()) {
                        flushChanges(destinations, assignments, assigned, assignedToDeadServers, logsForDeadServers,
                                suspendedToGoneServers, unassigned);
                        // NOTE(review): logsForDeadServers is not cleared here, unlike the other accumulators
                        assignments.clear();
                        assigned.clear();
                        assignedToDeadServers.clear();
                        suspendedToGoneServers.clear();
                        unassigned.clear();
                        unloaded = 0;
                        eventListener.waitForEvents(Master.TIME_TO_WAIT_BETWEEN_SCANS);
                    }
                    String tableId = tls.extent.getTableId();
                    TableConfiguration tableConf = this.master.getConfigurationFactory()
                            .getTableConfiguration(tableId);

                    // Look up (or lazily create) the merge stats for this table; tables without a merge get an empty MergeInfo
                    MergeStats mergeStats = mergeStatsCache.get(tableId);
                    if (mergeStats == null) {
                        mergeStats = currentMerges.get(tableId);
                        if (mergeStats == null) {
                            mergeStats = new MergeStats(new MergeInfo());
                        }
                        mergeStatsCache.put(tableId, mergeStats);
                    }
                    TabletGoalState goal = this.master.getGoalState(tls, mergeStats.getMergeInfo());
                    TServerInstance server = tls.getServer();
                    TabletState state = tls.getState(currentTServers.keySet());
                    if (Master.log.isTraceEnabled()) {
                        Master.log.trace("Goal state " + goal + " current " + state + " for " + tls.extent);
                    }
                    stats.update(tableId, state);
                    mergeStats.update(tls.extent, state, tls.chopped, !tls.walogs.isEmpty());
                    sendChopRequest(mergeStats.getMergeInfo(), state, tls);
                    sendSplitRequest(mergeStats.getMergeInfo(), state, tls);

                    // Always follow through with assignments
                    if (state == TabletState.ASSIGNED) {
                        goal = TabletGoalState.HOSTED;
                    }

                    // if we are shutting down all the tabletservers, we have to do it in order
                    if (goal == TabletGoalState.SUSPENDED && state == TabletState.HOSTED) {
                        if (this.master.serversToShutdown.equals(currentTServers.keySet())) {
                            // keep this tablet hosted while the dependent watcher still reports assigned or hosted tablets
                            if (dependentWatcher != null && dependentWatcher.assignedOrHosted() > 0) {
                                goal = TabletGoalState.HOSTED;
                            }
                        }
                    }

                    if (goal == TabletGoalState.HOSTED) {
                        // A tablet that is not hosted but has write-ahead logs is skipped for this pass
                        // when recoverLogs returns true
                        if (state != TabletState.HOSTED && !tls.walogs.isEmpty()) {
                            if (this.master.recoveryManager.recoverLogs(tls.extent, tls.walogs))
                                continue;
                        }
                        switch (state) {
                        case HOSTED:
                            // the tablet is hosted on its migration target, so the migration is done
                            if (server.equals(this.master.migrations.get(tls.extent)))
                                this.master.migrations.remove(tls.extent);
                            break;
                        case ASSIGNED_TO_DEAD_SERVER:
                            assignedToDeadServers.add(tls);
                            if (server.equals(this.master.migrations.get(tls.extent)))
                                this.master.migrations.remove(tls.extent);
                            TServerInstance tserver = tls.futureOrCurrent();
                            // look up the dead server's in-use WALs only once per server
                            if (!logsForDeadServers.containsKey(tserver)) {
                                logsForDeadServers.put(tserver, wals.getWalsInUse(tserver));
                            }
                            break;
                        case SUSPENDED:
                            if (master.getSteadyTime() - tls.suspend.suspensionTime < tableConf
                                    .getTimeInMillis(Property.TABLE_SUSPEND_DURATION)) {
                                // Tablet is suspended. See if its tablet server is back.
                                TServerInstance returnInstance = null;
                                // NOTE(review): presumably the " " session makes this probe key sort before any real
                                // instance at the same address, so tailMap starts at that address - confirm against
                                // TServerInstance's ordering.
                                Iterator<TServerInstance> find = destinations
                                        .tailMap(new TServerInstance(tls.suspend.server, " ")).keySet().iterator();
                                if (find.hasNext()) {
                                    TServerInstance found = find.next();
                                    if (found.getLocation().equals(tls.suspend.server)) {
                                        returnInstance = found;
                                    }
                                }

                                // Old tablet server is back. Return this tablet to its previous owner.
                                if (returnInstance != null) {
                                    assignments.add(new Assignment(tls.extent, returnInstance));
                                } else {
                                    // leave suspended, don't ask for a new assignment.
                                }
                            } else {
                                // Treat as unassigned, ask for a new assignment.
                                unassigned.put(tls.extent, server);
                            }
                            break;
                        case UNASSIGNED:
                            // maybe it's a finishing migration
                            TServerInstance dest = this.master.migrations.get(tls.extent);
                            if (dest != null) {
                                // if destination is still good, assign it
                                if (destinations.keySet().contains(dest)) {
                                    assignments.add(new Assignment(tls.extent, dest));
                                } else {
                                    // get rid of this migration
                                    this.master.migrations.remove(tls.extent);
                                    unassigned.put(tls.extent, server);
                                }
                            } else {
                                unassigned.put(tls.extent, server);
                            }
                            break;
                        case ASSIGNED:
                            // Send another reminder
                            assigned.add(new Assignment(tls.extent, tls.future));
                            break;
                        }
                    } else {
                        // Goal is something other than HOSTED: work the tablet towards being offline
                        switch (state) {
                        case SUSPENDED:
                            // Request a move to UNASSIGNED, so as to allow balancing to continue.
                            suspendedToGoneServers.add(tls);
                            cancelOfflineTableMigrations(tls);
                            break;
                        case UNASSIGNED:
                            cancelOfflineTableMigrations(tls);
                            break;
                        case ASSIGNED_TO_DEAD_SERVER:
                            assignedToDeadServers.add(tls);
                            if (!logsForDeadServers.containsKey(tls.futureOrCurrent())) {
                                logsForDeadServers.put(tls.futureOrCurrent(),
                                        wals.getWalsInUse(tls.futureOrCurrent()));
                            }
                            break;
                        case HOSTED:
                            TServerConnection conn = this.master.tserverSet.getConnection(server);
                            if (conn != null) {
                                conn.unloadTablet(this.master.masterLock, tls.extent, goal.howUnload(),
                                        master.getSteadyTime());
                                unloaded++;
                                totalUnloaded++;
                            } else {
                                Master.log.warn("Could not connect to server " + server);
                            }
                            break;
                        case ASSIGNED:
                            break;
                        }
                    }
                    counts[state.ordinal()]++;
                }

                // flush whatever is still queued after the last tablet
                flushChanges(destinations, assignments, assigned, assignedToDeadServers, logsForDeadServers,
                        suspendedToGoneServers, unassigned);

                // provide stats after flushing changes to avoid race conditions w/ delete table
                stats.end(masterState);

                // Report changes
                for (TabletState state : TabletState.values()) {
                    int i = state.ordinal();
                    if (counts[i] > 0 && counts[i] != oldCounts[i]) {
                        this.master.nextEvent.event("[%s]: %d tablets are %s", store.name(), counts[i],
                                state.name());
                    }
                }
                Master.log.debug(
                        String.format("[%s]: scan time %.2f seconds", store.name(), stats.getScanTime() / 1000.));
                oldCounts = counts;
                if (totalUnloaded > 0) {
                    this.master.nextEvent.event("[%s]: %d tablets unloaded", store.name(), totalUnloaded);
                }

                updateMergeState(mergeStatsCache);

                synchronized (this) {
                    lastScanServers = ImmutableSortedSet.copyOf(currentTServers.keySet());
                }
                // Only wait if the live tserver set is still the one this pass was computed against
                if (this.master.tserverSet.getCurrentServers().equals(currentTServers.keySet())) {
                    Master.log.debug(String.format("[%s] sleeping for %.2f seconds", store.name(),
                            Master.TIME_TO_WAIT_BETWEEN_SCANS / 1000.));
                    eventListener.waitForEvents(Master.TIME_TO_WAIT_BETWEEN_SCANS);
                } else {
                    Master.log.info("Detected change in current tserver set, re-running state machine.");
                }
            } catch (Exception ex) {
                Master.log.error("Error processing table state for store " + store.name(), ex);
                // A bad location state points at a specific metadata row that can be repaired; anything else just backs off
                if (ex.getCause() != null && ex.getCause() instanceof BadLocationStateException) {
                    repairMetadata(((BadLocationStateException) ex.getCause()).getEncodedEndRow());
                } else {
                    sleepUninterruptibly(Master.WAIT_BETWEEN_ERRORS, TimeUnit.MILLISECONDS);
                }
            } finally {
                if (iter != null) {
                    try {
                        iter.close();
                    } catch (IOException ex) {
                        Master.log.warn("Error closing TabletLocationState iterator: " + ex, ex);
                    }
                }
            }
        }
    }

    /**
     * Drops the pending migration for a tablet when the tablet's table is offline.
     *
     * @param tls
     *            the tablet whose migration entry, if any, is examined
     */
    private void cancelOfflineTableMigrations(TabletLocationState tls) {
        final TServerInstance migrationTarget = this.master.migrations.get(tls.extent);
        final TableState currentTableState = TableManager.getInstance().getTableState(tls.extent.getTableId());
        // Nothing to cancel unless a migration exists and the table is offline.
        if (migrationTarget == null || currentTableState != TableState.OFFLINE) {
            return;
        }
        this.master.migrations.remove(tls.extent);
    }

    /**
     * Attempts to repair a metadata row that carries conflicting location entries by deleting one entry that refers to
     * a tablet server which is no longer live.
     *
     * <p>
     * The row's current-location and future-location entries are read from the metadata table (or the root table when
     * the row belongs to a metadata tablet). If the row looks inconsistent - both kinds of entry present, or more than
     * one entry of a single kind - the first entry whose server cannot be found in the live tserver set is removed. At
     * most one entry is removed per call. Every failure is logged and swallowed; this method never throws.
     *
     * @param row
     *            the encoded metadata row of the tablet to repair
     */
    private void repairMetadata(Text row) {
        Master.log.debug("Attempting repair on " + row);
        // ACCUMULO-2261 if a dying tserver writes a location before its lock information propagates, it may cause duplicate assignment.
        // Attempt to find the dead server entry and remove it.
        try {
            Map<Key, Value> future = new HashMap<>();
            Map<Key, Value> assigned = new HashMap<>();
            KeyExtent extent = new KeyExtent(row, new Value(new byte[] { 0 }));
            String table = MetadataTable.NAME;
            if (extent.isMeta())
                table = RootTable.NAME;
            Scanner scanner = this.master.getConnector().createScanner(table, Authorizations.EMPTY);
            scanner.fetchColumnFamily(CurrentLocationColumnFamily.NAME);
            scanner.fetchColumnFamily(FutureLocationColumnFamily.NAME);
            scanner.setRange(new Range(row));
            for (Entry<Key, Value> entry : scanner) {
                if (entry.getKey().getColumnFamily().equals(CurrentLocationColumnFamily.NAME)) {
                    assigned.put(entry.getKey(), entry.getValue());
                } else if (entry.getKey().getColumnFamily().equals(FutureLocationColumnFamily.NAME)) {
                    future.put(entry.getKey(), entry.getValue());
                }
            }
            if (future.size() > 0 && assigned.size() > 0) {
                Master.log.warn("Found a tablet assigned and hosted, attempting to repair");
            } else if (future.size() > 1 && assigned.size() == 0) {
                Master.log.warn("Found a tablet assigned to multiple servers, attempting to repair");
            } else if (future.size() == 0 && assigned.size() > 1) {
                Master.log.warn("Found a tablet hosted on multiple servers, attempting to repair");
            } else {
                Master.log.info(
                        "Attempted a repair, but nothing seems to be obviously wrong. " + assigned + " " + future);
                return;
            }
            // Future entries are examined before current ones; the first entry pointing at a non-live server is deleted.
            Iterator<Entry<Key, Value>> iter = Iterators.concat(future.entrySet().iterator(),
                    assigned.entrySet().iterator());
            while (iter.hasNext()) {
                Entry<Key, Value> entry = iter.next();
                TServerInstance alive = master.tserverSet.find(entry.getValue().toString());
                if (alive == null) {
                    Master.log.info("Removing entry " + entry);
                    BatchWriter bw = this.master.getConnector().createBatchWriter(table, new BatchWriterConfig());
                    try {
                        Mutation m = new Mutation(entry.getKey().getRow());
                        m.putDelete(entry.getKey().getColumnFamily(), entry.getKey().getColumnQualifier());
                        bw.addMutation(m);
                    } finally {
                        // always release the writer, even when queuing the mutation fails
                        bw.close();
                    }
                    return;
                }
            }
            Master.log.error("Metadata table is inconsistent at " + row
                    + " and all assigned/future tservers are still online.");
        } catch (Throwable e) {
            // Repair is best effort: log and carry on so the watcher loop keeps running.
            Master.log.error("Error attempting repair of metadata " + row + ": " + e, e);
        }
    }

    /**
     * Sums, over every table in the most recent stats, the number of tablets counted as assigned or hosted.
     *
     * @return total of {@code assigned()} plus {@code hosted()} across all entries of {@code stats.getLast()}
     */
    private int assignedOrHosted() {
        int total = 0;
        for (TableCounts perTable : this.stats.getLast().values()) {
            final int busy = perTable.assigned() + perTable.hosted();
            total += busy;
        }
        return total;
    }

    /**
     * Asks the hosting tablet server to split a tablet at the boundaries of a pending delete range.
     *
     * <p>
     * A request is only sent when the merge operation is a delete in the {@code SPLITTING} state, the tablet is hosted,
     * and the tablet overlaps the delete range. Each boundary of the range that falls strictly inside the tablet (not
     * on either of the tablet's own end points) produces one split request. Failures are logged and not propagated.
     *
     * @param info
     *            the merge/delete operation for the tablet's table
     * @param state
     *            the tablet's current state
     * @param tls
     *            the tablet's location information
     */
    private void sendSplitRequest(MergeInfo info, TabletState state, TabletLocationState tls) {
        // Only a delete that is still splitting, applied to a hosted tablet, needs split requests.
        final boolean splitting = info.getState().equals(MergeState.SPLITTING);
        if (!splitting || !info.isDelete() || !state.equals(TabletState.HOSTED)) {
            return;
        }
        // Does this extent cover the end points of the delete?
        final KeyExtent deleteRange = info.getExtent();
        if (!tls.extent.overlaps(deleteRange)) {
            return;
        }
        final Text[] boundaries = { deleteRange.getPrevEndRow(), deleteRange.getEndRow() };
        for (Text boundary : boundaries) {
            // A usable split point lies inside the tablet and is not already one of the tablet's own end points.
            final boolean usable = boundary != null && tls.extent.contains(boundary)
                    && !boundary.equals(tls.extent.getEndRow()) && !boundary.equals(tls.extent.getPrevEndRow());
            if (!usable) {
                continue;
            }
            try {
                final TServerConnection connection = this.master.tserverSet.getConnection(tls.current);
                if (connection == null) {
                    Master.log.warn("Not connected to server " + tls.current);
                    continue;
                }
                Master.log.info("Asking " + tls.current + " to split " + tls.extent + " at " + boundary);
                connection.splitTablet(this.master.masterLock, tls.extent, boundary);
            } catch (NotServingTabletException e) {
                Master.log.debug("Error asking tablet server to split a tablet: " + e);
            } catch (Exception e) {
                Master.log.warn("Error asking tablet server to split a tablet: " + e);
            }
        }
    }

    /**
     * Asks the hosting tablet server to chop a tablet that takes part in a merge.
     *
     * <p>
     * A request is only sent when the merge is in the {@code WAITING_FOR_CHOPPED} state, the tablet is hosted, it has
     * not been chopped yet, and the merge reports that the tablet needs to be chopped. Communication failures are
     * logged together with their cause and are not propagated.
     *
     * @param info
     *            the merge operation for the tablet's table
     * @param state
     *            the tablet's current state
     * @param tls
     *            the tablet's location information
     */
    private void sendChopRequest(MergeInfo info, TabletState state, TabletLocationState tls) {
        // Don't bother if we're in the wrong state
        if (!info.getState().equals(MergeState.WAITING_FOR_CHOPPED))
            return;
        // Tablet must be online
        if (!state.equals(TabletState.HOSTED))
            return;
        // Tablet isn't already chopped
        if (tls.chopped)
            return;
        // Tablet ranges intersect
        if (info.needsToBeChopped(tls.extent)) {
            try {
                TServerConnection conn = this.master.tserverSet.getConnection(tls.current);
                if (conn != null) {
                    Master.log.info("Asking " + tls.current + " to chop " + tls.extent);
                    conn.chop(this.master.masterLock, tls.extent);
                } else {
                    Master.log.warn("Could not connect to server " + tls.current);
                }
            } catch (TException e) {
                // Pass the exception to the logger so the cause of the failure is not lost.
                Master.log.warn("Communications error asking tablet server to chop a tablet", e);
            }
        }
    }

    /**
     * Advances the merge state machine for every table seen during the last scan.
     *
     * <p>
     * For each entry the next merge state is computed and, when it differs from the recorded one, persisted through the
     * master. When the next state is {@code MERGING} the metadata work is performed immediately (tablet deletion for a
     * delete, record merging otherwise) and the state is then set to {@code COMPLETE}. Failures are logged per merge
     * and do not stop the remaining merges from being processed.
     *
     * @param mergeStatsCache
     *            merge statistics keyed by table id, as gathered by the scan
     */
    private void updateMergeState(Map<String, MergeStats> mergeStatsCache) {
        for (MergeStats mergeStats : mergeStatsCache.values()) {
            try {
                MergeState next = mergeStats.nextMergeState(this.master.getConnector(), this.master);
                // A finished merge is recorded as having no merge in progress.
                if (next == MergeState.COMPLETE) {
                    next = MergeState.NONE;
                }
                // when next state is MERGING, its important to persist this before
                // starting the merge... the verification check that is done before
                // moving into the merging state could fail if merge starts but does
                // not finish
                if (next != mergeStats.getMergeInfo().getState()) {
                    this.master.setMergeState(mergeStats.getMergeInfo(), next);
                }

                if (next != MergeState.MERGING) {
                    continue;
                }
                try {
                    if (mergeStats.getMergeInfo().isDelete()) {
                        deleteTablets(mergeStats.getMergeInfo());
                    } else {
                        mergeMetadataRecords(mergeStats.getMergeInfo());
                    }
                    this.master.setMergeState(mergeStats.getMergeInfo(), MergeState.COMPLETE);
                } catch (Exception ex) {
                    Master.log.error("Unable merge metadata table records", ex);
                }
            } catch (Exception ex) {
                Master.log.error("Unable to update merge state for merge " + mergeStats.getMergeInfo().getExtent(),
                        ex);
            }
        }
    }

    private void deleteTablets(MergeInfo info) throws AccumuloException {
        KeyExtent extent = info.getExtent();
        String targetSystemTable = extent.isMeta() ? RootTable.NAME : MetadataTable.NAME;
        Master.log.debug("Deleting tablets for " + extent);
        char timeType = '\0';
        KeyExtent followingTablet = null;
        if (extent.getEndRow() != null) {
            Key nextExtent = new Key(extent.getEndRow()).followingKey(PartialKey.ROW);
            followingTablet = getHighTablet(
                    new KeyExtent(extent.getTableId(), nextExtent.getRow(), extent.getEndRow()));
            Master.log.debug("Found following tablet " + followingTablet);
        }
        try {
            Connector conn = this.master.getConnector();
            Text start = extent.getPrevEndRow();
            if (start == null) {
                start = new Text();
            }
            Master.log.debug("Making file deletion entries for " + extent);
            Range deleteRange = new Range(KeyExtent.getMetadataEntry(extent.getTableId(), start), false,
                    KeyExtent.getMetadataEntry(extent.getTableId(), extent.getEndRow()), true);
            Scanner scanner = conn.createScanner(targetSystemTable, Authorizations.EMPTY);
            scanner.setRange(deleteRange);
            TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN.fetch(scanner);
            TabletsSection.ServerColumnFamily.TIME_COLUMN.fetch(scanner);
            scanner.fetchColumnFamily(DataFileColumnFamily.NAME);
            scanner.fetchColumnFamily(TabletsSection.CurrentLocationColumnFamily.NAME);
            Set<FileRef> datafiles = new TreeSet<>();
            for (Entry<Key, Value> entry : scanner) {
                Key key = entry.getKey();
                if (key.compareColumnFamily(DataFileColumnFamily.NAME) == 0) {
                    datafiles.add(new FileRef(this.master.fs, key));
                    if (datafiles.size() > 1000) {
                        MetadataTableUtil.addDeleteEntries(extent, datafiles, master);
                        datafiles.clear();
                    }
                } else if (TabletsSection.ServerColumnFamily.TIME_COLUMN.hasColumns(key)) {
                    timeType = entry.getValue().toString().charAt(0);
                } else if (key.compareColumnFamily(TabletsSection.CurrentLocationColumnFamily.NAME) == 0) {
                    throw new IllegalStateException("Tablet " + key.getRow() + " is assigned during a merge!");
                } else if (TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN.hasColumns(key)) {
                    // ACCUMULO-2974 Need to include the TableID when converting a relative path to an absolute path.
                    // The value has the leading path separator already included so it doesn't need it included.
                    String path = entry.getValue().toString();
                    if (path.contains(":")) {
                        datafiles.add(new FileRef(path));
                    } else {
                        datafiles.add(new FileRef(path, this.master.fs.getFullPath(FileType.TABLE,
                                Path.SEPARATOR + extent.getTableId() + path)));
                    }
                    if (datafiles.size() > 1000) {
                        MetadataTableUtil.addDeleteEntries(extent, datafiles, master);
                        datafiles.clear();
                    }
                }
            }
            MetadataTableUtil.addDeleteEntries(extent, datafiles, master);
            BatchWriter bw = conn.createBatchWriter(targetSystemTable, new BatchWriterConfig());
            try {
                deleteTablets(info, deleteRange, bw, conn);
            } finally {
                bw.close();
            }

            if (followingTablet != null) {
                Master.log.debug("Updating prevRow of " + followingTablet + " to " + extent.getPrevEndRow());
                bw = conn.createBatchWriter(targetSystemTable, new BatchWriterConfig());
                try {
                    Mutation m = new Mutation(followingTablet.getMetadataEntry());
                    TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN.put(m,
                            KeyExtent.encodePrevEndRow(extent.getPrevEndRow()));
                    ChoppedColumnFamily.CHOPPED_COLUMN.putDelete(m);
                    bw.addMutation(m);
                    bw.flush();
                } finally {
                    bw.close();
                }
            } else {
                // Recreate the default tablet to hold the end of the table
                Master.log.debug("Recreating the last tablet to point to " + extent.getPrevEndRow());
                String tdir = master.getFileSystem().choose(Optional.of(extent.getTableId()),
                        ServerConstants.getBaseUris()) + Constants.HDFS_TABLES_DIR + Path.SEPARATOR
                        + extent.getTableId() + Constants.DEFAULT_TABLET_LOCATION;
                MetadataTableUtil.addTablet(new KeyExtent(extent.getTableId(), null, extent.getPrevEndRow()), tdir,
                        master, timeType, this.master.masterLock);
            }
        } catch (RuntimeException | IOException | TableNotFoundException | AccumuloSecurityException ex) {
            throw new AccumuloException(ex);
        }
    }

    /**
     * Folds the metadata records of every tablet in the merge range into the highest tablet of that
     * range.
     *
     * <p>
     * The steps, in order, are:
     * <ol>
     * <li>scan the tablets below the highest one, copying their data file entries onto the highest
     * tablet's row, remembering the first prevRow value seen, tracking the maximum logical time, and
     * queueing a delete mutation for each tablet directory;</li>
     * <li>read the time entry of the highest tablet itself (it is outside the first scan range);</li>
     * <li>write and flush the combined mutation;</li>
     * <li>if no prevRow entry was seen, log that the tablet is already merged and return;</li>
     * <li>otherwise update the prevRow of the highest tablet, delete the scanned rows, and remove the
     * chopped marker from the highest tablet.</li>
     * </ol>
     *
     * @param info
     *            merge request; only its extent is read here (it is also handed to
     *            {@code deleteTablets})
     * @throws AccumuloException
     *             wrapping any failure raised while reading or writing the metadata; if closing the
     *             writer also fails after an earlier failure, the close failure is attached as a
     *             suppressed exception instead of replacing the original one
     */
    private void mergeMetadataRecords(MergeInfo info) throws AccumuloException {
        KeyExtent range = info.getExtent();
        Master.log.debug("Merging metadata for " + range);
        KeyExtent stop = getHighTablet(range);
        Master.log.debug("Highest tablet is " + stop);
        Value firstPrevRowValue = null;
        Text stopRow = stop.getMetadataEntry();
        Text start = range.getPrevEndRow();
        if (start == null) {
            start = new Text();
        }
        // both ends exclusive: the row for "start" and the row of the highest tablet are not scanned
        Range scanRange = new Range(KeyExtent.getMetadataEntry(range.getTableId(), start), false, stopRow, false);
        String targetSystemTable = MetadataTable.NAME;
        if (range.isMeta()) {
            targetSystemTable = RootTable.NAME;
        }

        BatchWriter bw = null;
        // remembers the failure raised by the body so a later close() failure cannot mask it
        AccumuloException failure = null;
        try {
            long fileCount = 0;
            Connector conn = this.master.getConnector();
            // Make file entries in highest tablet
            bw = conn.createBatchWriter(targetSystemTable, new BatchWriterConfig());
            Scanner scanner = conn.createScanner(targetSystemTable, Authorizations.EMPTY);
            scanner.setRange(scanRange);
            TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN.fetch(scanner);
            TabletsSection.ServerColumnFamily.TIME_COLUMN.fetch(scanner);
            TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN.fetch(scanner);
            scanner.fetchColumnFamily(DataFileColumnFamily.NAME);
            Mutation m = new Mutation(stopRow);
            String maxLogicalTime = null;
            for (Entry<Key, Value> entry : scanner) {
                Key key = entry.getKey();
                Value value = entry.getValue();
                if (key.getColumnFamily().equals(DataFileColumnFamily.NAME)) {
                    // re-home the file reference onto the highest tablet's row
                    m.put(key.getColumnFamily(), key.getColumnQualifier(), value);
                    fileCount++;
                } else if (TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN.hasColumns(key)
                        && firstPrevRowValue == null) {
                    Master.log.debug("prevRow entry for lowest tablet is " + value);
                    // copy the value; only the first prevRow seen in the scan is kept
                    firstPrevRowValue = new Value(value);
                } else if (TabletsSection.ServerColumnFamily.TIME_COLUMN.hasColumns(key)) {
                    maxLogicalTime = TabletTime.maxMetadataTime(maxLogicalTime, value.toString());
                } else if (TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN.hasColumns(key)) {
                    bw.addMutation(MetadataTableUtil.createDeleteMutation(range.getTableId(),
                            entry.getValue().toString()));
                }
            }

            // read the logical time from the last tablet in the merge range, it is not included in
            // the loop above
            scanner = conn.createScanner(targetSystemTable, Authorizations.EMPTY);
            scanner.setRange(new Range(stopRow));
            TabletsSection.ServerColumnFamily.TIME_COLUMN.fetch(scanner);
            for (Entry<Key, Value> entry : scanner) {
                if (TabletsSection.ServerColumnFamily.TIME_COLUMN.hasColumns(entry.getKey())) {
                    maxLogicalTime = TabletTime.maxMetadataTime(maxLogicalTime, entry.getValue().toString());
                }
            }

            if (maxLogicalTime != null) {
                // encode explicitly as UTF-8 rather than with the platform default charset
                TabletsSection.ServerColumnFamily.TIME_COLUMN.put(m,
                        new Value(maxLogicalTime.getBytes(java.nio.charset.StandardCharsets.UTF_8)));
            }

            if (!m.getUpdates().isEmpty()) {
                bw.addMutation(m);
            }

            bw.flush();

            Master.log.debug("Moved " + fileCount + " files to " + stop);

            if (firstPrevRowValue == null) {
                Master.log.debug("tablet already merged");
                return;
            }

            stop.setPrevEndRow(KeyExtent.decodePrevEndRow(firstPrevRowValue));
            Mutation updatePrevRow = stop.getPrevRowUpdateMutation();
            Master.log.debug("Setting the prevRow for last tablet: " + stop);
            bw.addMutation(updatePrevRow);
            bw.flush();

            deleteTablets(info, scanRange, bw, conn);

            // Clean-up the last chopped marker
            m = new Mutation(stopRow);
            ChoppedColumnFamily.CHOPPED_COLUMN.putDelete(m);
            bw.addMutation(m);
            bw.flush();

        } catch (Exception ex) {
            failure = new AccumuloException(ex);
            throw failure;
        } finally {
            if (bw != null) {
                try {
                    bw.close();
                } catch (Exception ex) {
                    if (failure == null) {
                        throw new AccumuloException(ex);
                    }
                    // the body already failed: keep that as the primary exception
                    failure.addSuppressed(ex);
                }
            }
        }
    }

    /**
     * Deletes every entry found in {@code scanRange} of the root table (when the merge extent is a
     * metadata extent) or of the metadata table (otherwise), then flushes the writer.
     *
     * <p>
     * All deletes belonging to one row are collected into a single mutation so that a tablet either
     * disappears entirely or not at all; this matters if the process terminates part-way through the
     * loop.
     *
     * @param info
     *            merge request; its extent selects which table is scanned
     * @param scanRange
     *            range of rows whose entries are deleted
     * @param bw
     *            writer the delete mutations are added to; flushed before returning, not closed
     * @param conn
     *            connector used to create the scanner
     */
    private void deleteTablets(MergeInfo info, Range scanRange, BatchWriter bw, Connector conn)
            throws TableNotFoundException, MutationsRejectedException {
        Scanner tabletScanner = conn.createScanner(
                info.getExtent().isMeta() ? RootTable.NAME : MetadataTable.NAME, Authorizations.EMPTY);
        Master.log.debug("Deleting range " + scanRange);
        tabletScanner.setRange(scanRange);

        for (RowIterator rows = new RowIterator(tabletScanner); rows.hasNext();) {
            Iterator<Entry<Key, Value>> columns = rows.next();
            // one mutation per row, created lazily from the first key of that row
            Mutation rowDelete = null;
            while (columns.hasNext()) {
                Key current = columns.next().getKey();
                if (rowDelete == null) {
                    rowDelete = new Mutation(current.getRow());
                }
                rowDelete.putDelete(current.getColumnFamily(), current.getColumnQualifier());
                Master.log.debug("deleting entry " + current);
            }
            bw.addMutation(rowDelete);
        }

        bw.flush();
    }

    /**
     * Looks up the tablet that holds the end of {@code range}: the first prevRow entry at or after
     * the metadata row built from the range's table id and end row.
     *
     * @param range
     *            extent being merged; the root table is scanned when it is a metadata extent, the
     *            metadata table otherwise
     * @return the extent decoded from the first entry found
     * @throws AccumuloException
     *             if the scan returns nothing, the entry found belongs to a different table, or any
     *             other failure occurs; every failure raised inside the lookup is wrapped with an
     *             "Unexpected failure" message
     */
    private KeyExtent getHighTablet(KeyExtent range) throws AccumuloException {
        try {
            Connector connector = this.master.getConnector();
            String tabletTable = range.isMeta() ? RootTable.NAME : MetadataTable.NAME;
            Scanner prevRowScanner = connector.createScanner(tabletTable, Authorizations.EMPTY);
            TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN.fetch(prevRowScanner);

            // scan from the metadata row of the range's end row to the end of the table
            KeyExtent lowerBound = new KeyExtent(range.getTableId(), range.getEndRow(), null);
            prevRowScanner.setRange(new Range(lowerBound.getMetadataEntry(), null));

            Iterator<Entry<Key, Value>> results = prevRowScanner.iterator();
            if (results.hasNext()) {
                Entry<Key, Value> first = results.next();
                KeyExtent candidate = new KeyExtent(first.getKey().getRow(),
                        KeyExtent.decodePrevEndRow(first.getValue()));
                if (candidate.getTableId().equals(range.getTableId())) {
                    return candidate;
                }
                throw new AccumuloException("No last tablet for merge " + range + " " + candidate);
            }
            throw new AccumuloException("No last tablet for a merge " + range);
        } catch (Exception ex) {
            throw new AccumuloException("Unexpected failure finding the last tablet for a merge " + range, ex);
        }
    }

    /**
     * Applies the tablet state changes collected by the caller: handles tablets whose server is
     * dead, un-suspends tablets, asks the balancer to place unassigned tablets, records the chosen
     * future locations and finally sends assignment requests to the tablet servers.
     *
     * @param currentTServers
     *            live tablet servers; the balancer is only consulted when this is non-empty
     * @param assignments
     *            output list; balancer-chosen assignments and then all of {@code assigned} are
     *            appended to it
     * @param assigned
     *            assignments appended to {@code assignments} after future locations are stored, so
     *            they receive an assignment request without a new future location being written
     * @param assignedToDeadServers
     *            tablets that are suspended (when suspension is allowed) or unassigned
     * @param logsForDeadServers
     *            logs passed to the store along with {@code assignedToDeadServers} and then marked
     *            as closed
     * @param suspendedToGoneServers
     *            tablets passed to {@code store.unsuspend}
     * @param unassigned
     *            tablets offered to the balancer for assignment
     */
    private void flushChanges(SortedMap<TServerInstance, TabletServerStatus> currentTServers,
            List<Assignment> assignments, List<Assignment> assigned,
            List<TabletLocationState> assignedToDeadServers, Map<TServerInstance, List<Path>> logsForDeadServers,
            List<TabletLocationState> suspendedToGoneServers, Map<KeyExtent, TServerInstance> unassigned)
            throws DistributedStoreException, TException, WalMarkerException {
        boolean tabletsSuspendable = canSuspendTablets();
        if (!assignedToDeadServers.isEmpty()) {
            // cap the number of entries printed so the log line stays bounded
            int maxServersToShow = min(assignedToDeadServers.size(), 100);
            Master.log.debug(assignedToDeadServers.size() + " assigned to dead servers: "
                    + assignedToDeadServers.subList(0, maxServersToShow) + "...");
            Master.log.debug("logs for dead servers: " + logsForDeadServers);
            if (tabletsSuspendable) {
                store.suspend(assignedToDeadServers, logsForDeadServers, master.getSteadyTime());
            } else {
                store.unassign(assignedToDeadServers, logsForDeadServers);
            }
            this.master.markDeadServerLogsAsClosed(logsForDeadServers);
            this.master.nextEvent.event("Marked %d tablets as suspended because they don't have current servers",
                    assignedToDeadServers.size());
        }
        if (!suspendedToGoneServers.isEmpty()) {
            // report the list this branch actually acts on (previously logged assignedToDeadServers)
            int maxServersToShow = min(suspendedToGoneServers.size(), 100);
            Master.log.debug(suspendedToGoneServers.size() + " suspended to gone servers: "
                    + suspendedToGoneServers.subList(0, maxServersToShow) + "...");
            store.unsuspend(suspendedToGoneServers);
        }

        if (!currentTServers.isEmpty()) {
            Map<KeyExtent, TServerInstance> assignedOut = new HashMap<>();
            final StringBuilder builder = new StringBuilder(64);
            // the balancer gets read-only views and reports its choices through assignedOut
            this.master.tabletBalancer.getAssignments(Collections.unmodifiableSortedMap(currentTServers),
                    Collections.unmodifiableMap(unassigned), assignedOut);
            for (Entry<KeyExtent, TServerInstance> assignment : assignedOut.entrySet()) {
                if (unassigned.containsKey(assignment.getKey())) {
                    if (assignment.getValue() != null) {
                        if (!currentTServers.containsKey(assignment.getValue())) {
                            Master.log.warn("balancer assigned " + assignment.getKey()
                                    + " to a tablet server that is not current " + assignment.getValue()
                                    + " ignoring");
                            continue;
                        }

                        if (builder.length() > 0) {
                            builder.append(ASSIGNMENT_BUFFER_SEPARATOR);
                        }

                        builder.append(assignment);

                        // Don't let the log message get too gigantic
                        if (builder.length() > ASSINGMENT_BUFFER_MAX_LENGTH) {
                            builder.append("]");
                            Master.log.debug(store.name() + " assigning tablets: [" + builder.toString());
                            builder.setLength(0);
                        }

                        assignments.add(new Assignment(assignment.getKey(), assignment.getValue()));
                    }
                } else {
                    Master.log.warn(
                            store.name() + " load balancer assigning tablet that was not nominated for assignment "
                                    + assignment.getKey());
                }
            }

            if (builder.length() > 0) {
                // Make sure to log any leftover assignments
                builder.append("]");
                Master.log.debug(store.name() + " assigning tablets: [" + builder.toString());
            }

            if (!unassigned.isEmpty() && assignedOut.isEmpty()) {
                Master.log.warn("Load balancer failed to assign any tablets");
            }
        }

        if (assignments.size() > 0) {
            Master.log.info(String.format("Assigning %d tablets", assignments.size()));
            store.setFutureLocations(assignments);
        }
        // entries from "assigned" are added only after setFutureLocations, so no future location is written for them
        assignments.addAll(assigned);
        for (Assignment a : assignments) {
            TServerConnection conn = this.master.tserverSet.getConnection(a.server);
            if (conn != null) {
                conn.assignTablet(this.master.masterLock, a.tablet);
            } else {
                Master.log.warn("Could not connect to server " + a.server);
            }
            master.assignedTablet(a.tablet);
        }
    }

}