org.apache.cassandra.db.ColumnFamilyStore.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.cassandra.db.ColumnFamilyStore.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.cassandra.db;

import java.io.*;
import java.lang.management.ManagementFactory;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;
import javax.management.MBeanServer;
import javax.management.ObjectName;

import com.google.common.collect.Iterables;
import org.apache.commons.collections.IteratorUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.cache.AutoSavingCache;
import org.apache.cassandra.cache.AutoSavingKeyCache;
import org.apache.cassandra.cache.AutoSavingRowCache;
import org.apache.cassandra.cache.ConcurrentLinkedHashCache;
import org.apache.cassandra.cache.ICache;
import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
import org.apache.cassandra.concurrent.NamedThreadFactory;
import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.ColumnDefinition;
import org.apache.cassandra.config.ConfigurationException;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.columniterator.IColumnIterator;
import org.apache.cassandra.db.commitlog.CommitLog;
import org.apache.cassandra.db.commitlog.ReplayPosition;
import org.apache.cassandra.db.filter.*;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.db.marshal.BytesType;
import org.apache.cassandra.db.marshal.LocalByPartionerType;
import org.apache.cassandra.dht.*;
import org.apache.cassandra.io.sstable.*;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.thrift.IndexClause;
import org.apache.cassandra.thrift.IndexExpression;
import org.apache.cassandra.thrift.IndexOperator;
import org.apache.cassandra.utils.*;
import org.cliffc.high_scale_lib.NonBlockingHashMap;

public class ColumnFamilyStore implements ColumnFamilyStoreMBean {
    private static Logger logger = LoggerFactory.getLogger(ColumnFamilyStore.class);

    /*
     * submitFlush first puts [Binary]Memtable.getSortedContents on the flushSorter executor,
     * which then puts the sorted results on the writer executor.  This is because sorting is CPU-bound,
     * and writing is disk-bound; we want to be able to do both at once.  When the write is complete,
     * we turn the writer into an SSTableReader and add it to ssTables_ where it is available for reads.
     *
     * For BinaryMemtable that's about all that happens.  For live Memtables there are two other things
     * that switchMemtable does (which should be the only caller of submitFlush in this case).
     * First, it puts the Memtable into memtablesPendingFlush, where it stays until the flush is complete
     * and it's been added as an SSTableReader to ssTables_.  Second, it adds an entry to commitLogUpdater
     * that waits for the flush to complete, then calls onMemtableFlush.  This allows multiple flushes
     * to happen simultaneously on multicore systems, while still calling onMF in the correct order,
     * which is necessary for replay in case of a restart since CommitLog assumes that when onMF is
     * called, all data up to the given context has been persisted to SSTables.
     */
    private static final ExecutorService flushSorter = new JMXEnabledThreadPoolExecutor(
            Runtime.getRuntime().availableProcessors(), StageManager.KEEPALIVE, TimeUnit.SECONDS,
            new LinkedBlockingQueue<Runnable>(Runtime.getRuntime().availableProcessors()),
            new NamedThreadFactory("FlushSorter"), "internal");
    private static final ExecutorService flushWriter = new JMXEnabledThreadPoolExecutor(
            DatabaseDescriptor.getFlushWriters(), StageManager.KEEPALIVE, TimeUnit.SECONDS,
            new LinkedBlockingQueue<Runnable>(DatabaseDescriptor.getFlushQueueSize()),
            new NamedThreadFactory("FlushWriter"), "internal");

    public static final ExecutorService postFlushExecutor = new JMXEnabledThreadPoolExecutor("MemtablePostFlusher");

    static {
        if (DatabaseDescriptor.estimatesRealMemtableSize()) {
            logger.info("Global memtable threshold is enabled at {}MB",
                    DatabaseDescriptor.getTotalMemtableSpaceInMB());
            // (can block if flush queue fills up, so don't put on scheduledTasks)
            StorageService.tasks.scheduleWithFixedDelay(new MeteredFlusher(), 1000, 1000, TimeUnit.MILLISECONDS);
        } else {
            logger.info("Global memtable threshold is disabled");
        }
    }

    public final Table table;
    public final String columnFamily;
    public final CFMetaData metadata;
    public final IPartitioner partitioner;
    private final String mbeanName;
    private boolean invalid = false;

    /* Memtables and SSTables on disk for this column family */
    private final DataTracker data;

    private volatile int memtableSwitchCount = 0;

    /* This is used to generate the next index for a SSTable */
    private AtomicInteger fileIndexGenerator = new AtomicInteger(0);

    private final ConcurrentSkipListMap<ByteBuffer, ColumnFamilyStore> indexedColumns;

    // TODO binarymemtable ops are not threadsafe (do they need to be?)
    private AtomicReference<BinaryMemtable> binaryMemtable;

    private LatencyTracker readStats = new LatencyTracker();
    private LatencyTracker writeStats = new LatencyTracker();

    // counts of sstables accessed by reads
    private final EstimatedHistogram recentSSTablesPerRead = new EstimatedHistogram(35);
    private final EstimatedHistogram sstablesPerRead = new EstimatedHistogram(35);

    private static final int INTERN_CUTOFF = 256;
    public final ConcurrentMap<ByteBuffer, ByteBuffer> internedNames = new NonBlockingHashMap<ByteBuffer, ByteBuffer>();

    /* These are locally held copies to be changed from the config during runtime */
    private volatile DefaultInteger minCompactionThreshold;
    private volatile DefaultInteger maxCompactionThreshold;
    private volatile DefaultInteger memtime;
    private volatile DefaultInteger memsize;
    private volatile DefaultDouble memops;
    private volatile DefaultInteger rowCacheSaveInSeconds;
    private volatile DefaultInteger keyCacheSaveInSeconds;

    /** Lock to allow migrations to block all flushing, so we can be sure not to write orphaned data files */
    public final Lock flushLock = new ReentrantLock();

    public static enum CacheType {
        KEY_CACHE_TYPE("KeyCache"), ROW_CACHE_TYPE("RowCache");

        public final String name;

        private CacheType(String name) {
            this.name = name;
        }

        @Override
        public String toString() {
            return name;
        }
    }

    public final AutoSavingCache<Pair<Descriptor, DecoratedKey>, Long> keyCache;
    public final AutoSavingCache<DecoratedKey, ColumnFamily> rowCache;

    /** ratio of in-memory memtable size, to serialized size */
    volatile double liveRatio = 1.0;
    /** ops count last time we computed liveRatio */
    private final AtomicLong liveRatioComputedAt = new AtomicLong(32);

    public void reload() {
        // metadata object has been mutated directly. make all the members jibe with new settings.

        // only update these runtime-modifiable settings if they have not been modified.
        if (!minCompactionThreshold.isModified())
            for (ColumnFamilyStore cfs : concatWithIndexes())
                cfs.minCompactionThreshold = new DefaultInteger(metadata.getMinCompactionThreshold());
        if (!maxCompactionThreshold.isModified())
            for (ColumnFamilyStore cfs : concatWithIndexes())
                cfs.maxCompactionThreshold = new DefaultInteger(metadata.getMaxCompactionThreshold());
        if (!memtime.isModified())
            for (ColumnFamilyStore cfs : concatWithIndexes())
                cfs.memtime = new DefaultInteger(metadata.getMemtableFlushAfterMins());
        if (!memsize.isModified())
            for (ColumnFamilyStore cfs : concatWithIndexes())
                cfs.memsize = new DefaultInteger(metadata.getMemtableThroughputInMb());
        if (!memops.isModified())
            for (ColumnFamilyStore cfs : concatWithIndexes())
                cfs.memops = new DefaultDouble(metadata.getMemtableOperationsInMillions());
        if (!rowCacheSaveInSeconds.isModified())
            rowCacheSaveInSeconds = new DefaultInteger(metadata.getRowCacheSavePeriodInSeconds());
        if (!keyCacheSaveInSeconds.isModified())
            keyCacheSaveInSeconds = new DefaultInteger(metadata.getKeyCacheSavePeriodInSeconds());

        updateCacheSizes();
        scheduleCacheSaving(rowCacheSaveInSeconds.value(), keyCacheSaveInSeconds.value());

        // figure out what needs to be added and dropped.
        // future: if/when we have modifiable settings for secondary indexes, they'll need to be handled here.
        for (ByteBuffer indexedColumn : indexedColumns.keySet()) {
            ColumnDefinition def = metadata.getColumn_metadata().get(indexedColumn);
            if (def == null || def.getIndexType() == null)
                removeIndex(indexedColumn);
        }

        for (ColumnDefinition cdef : metadata.getColumn_metadata().values())
            if (cdef.getIndexType() != null && !indexedColumns.containsKey(cdef.name))
                addIndex(cdef);
    }

    void removeIndex(ByteBuffer indexedColumn) {
        ColumnFamilyStore indexCfs = indexedColumns.remove(indexedColumn);
        if (indexCfs == null) {
            logger.debug("index {} already removed; ignoring", ByteBufferUtil.bytesToHex(indexedColumn));
            return;
        }
        indexCfs.unregisterMBean();
        SystemTable.setIndexRemoved(metadata.ksName, indexCfs.columnFamily);
        indexCfs.removeAllSSTables();
    }

    private ColumnFamilyStore(Table table, String columnFamilyName, IPartitioner partitioner, int generation,
            CFMetaData metadata) {
        assert metadata != null : "null metadata for " + table + ":" + columnFamilyName;
        this.table = table;
        columnFamily = columnFamilyName;
        this.metadata = metadata;
        this.minCompactionThreshold = new DefaultInteger(metadata.getMinCompactionThreshold());
        this.maxCompactionThreshold = new DefaultInteger(metadata.getMaxCompactionThreshold());
        this.memtime = new DefaultInteger(metadata.getMemtableFlushAfterMins());
        this.memsize = new DefaultInteger(metadata.getMemtableThroughputInMb());
        this.memops = new DefaultDouble(metadata.getMemtableOperationsInMillions());
        this.rowCacheSaveInSeconds = new DefaultInteger(metadata.getRowCacheSavePeriodInSeconds());
        this.keyCacheSaveInSeconds = new DefaultInteger(metadata.getKeyCacheSavePeriodInSeconds());
        this.partitioner = partitioner;
        fileIndexGenerator.set(generation);
        binaryMemtable = new AtomicReference<BinaryMemtable>(new BinaryMemtable(this));

        if (logger.isDebugEnabled())
            logger.debug("Starting CFS {}", columnFamily);

        ICache<Pair<Descriptor, DecoratedKey>, Long> kc = ConcurrentLinkedHashCache.create(0);
        keyCache = new AutoSavingKeyCache<Pair<Descriptor, DecoratedKey>, Long>(kc, table.name, columnFamilyName);
        ICache<DecoratedKey, ColumnFamily> rc = metadata.getRowCacheProvider().create(0);
        rowCache = new AutoSavingRowCache<DecoratedKey, ColumnFamily>(rc, table.name, columnFamilyName);

        // scan for sstables corresponding to this cf and load them
        data = new DataTracker(this);
        Set<DecoratedKey> savedKeys = keyCache.readSaved();
        List<SSTableReader> sstables = new ArrayList<SSTableReader>();
        for (Map.Entry<Descriptor, Set<Component>> sstableFiles : files(table.name, columnFamilyName, false)
                .entrySet()) {
            SSTableReader sstable;
            try {
                sstable = SSTableReader.open(sstableFiles.getKey(), sstableFiles.getValue(), savedKeys, data,
                        metadata, this.partitioner);
            } catch (FileNotFoundException ex) {
                logger.error(
                        "Missing sstable component in " + sstableFiles + "; skipped because of " + ex.getMessage());
                continue;
            } catch (IOException ex) {
                logger.error("Corrupt sstable " + sstableFiles + "; skipped", ex);
                continue;
            }
            sstables.add(sstable);
        }
        data.addSSTables(sstables);

        // create the private ColumnFamilyStores for the secondary column indexes
        indexedColumns = new ConcurrentSkipListMap<ByteBuffer, ColumnFamilyStore>(getComparator());
        for (ColumnDefinition info : metadata.getColumn_metadata().values()) {
            if (info.getIndexType() != null)
                addIndex(info);
        }

        // register the mbean
        String type = this.partitioner instanceof LocalPartitioner ? "IndexColumnFamilies" : "ColumnFamilies";
        mbeanName = "org.apache.cassandra.db:type=" + type + ",keyspace=" + this.table.name + ",columnfamily="
                + columnFamily;
        try {
            MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
            ObjectName nameObj = new ObjectName(mbeanName);
            mbs.registerMBean(this, nameObj);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    public Future<?> addIndex(final ColumnDefinition info) {
        assert info.getIndexType() != null;

        // create the index CFS
        IPartitioner rowPartitioner = StorageService.getPartitioner();
        AbstractType columnComparator = (rowPartitioner instanceof OrderPreservingPartitioner
                || rowPartitioner instanceof ByteOrderedPartitioner) ? BytesType.instance
                        : new LocalByPartionerType(StorageService.getPartitioner());
        final CFMetaData indexedCfMetadata = CFMetaData.newIndexMetadata(metadata, info, columnComparator);
        ColumnFamilyStore indexedCfs = ColumnFamilyStore.createColumnFamilyStore(table, indexedCfMetadata.cfName,
                new LocalPartitioner(metadata.getColumn_metadata().get(info.name).getValidator()),
                indexedCfMetadata);

        // link in indexedColumns.  this means that writes will add new data to the index immediately,
        // so we don't have to lock everything while we do the build.  it's up to the operator to wait
        // until the index is actually built before using in queries.
        if (indexedColumns.putIfAbsent(info.name, indexedCfs) != null)
            return null;

        // if we're just linking in the index to indexedColumns on an already-built index post-restart, we're done
        if (indexedCfs.isIndexBuilt())
            return null;

        // build it asynchronously; addIndex gets called by CFS open and schema update, neither of which
        // we want to block for a long period.  (actual build is serialized on CompactionManager.)
        Runnable runnable = new Runnable() {
            public void run() {
                try {
                    forceBlockingFlush();
                } catch (ExecutionException e) {
                    throw new RuntimeException(e);
                } catch (InterruptedException e) {
                    throw new AssertionError(e);
                }
                buildSecondaryIndexes(getSSTables(), FBUtilities.singleton(info.name));
                SystemTable.setIndexBuilt(table.name, indexedCfMetadata.cfName);
            }
        };
        FutureTask<?> f = new FutureTask<Object>(runnable, null);
        new Thread(f, "Create index " + indexedCfMetadata.cfName).start();
        return f;
    }

    public void buildSecondaryIndexes(Collection<SSTableReader> sstables, SortedSet<ByteBuffer> columns) {
        logger.info(String.format("Submitting index build of %s for data in %s",
                metadata.comparator.getString(columns), StringUtils.join(sstables, ", ")));
        Table.IndexBuilder builder = table.createIndexBuilder(this, columns, new ReducingKeyIterator(sstables));
        Future future = CompactionManager.instance.submitIndexBuild(this, builder);
        try {
            future.get();
            for (ByteBuffer column : columns)
                getIndexedColumnFamilyStore(column).forceBlockingFlush();
        } catch (InterruptedException e) {
            throw new AssertionError(e);
        } catch (ExecutionException e) {
            throw new RuntimeException(e);
        }
        logger.info("Index build of " + metadata.comparator.getString(columns) + " complete");
    }

    // called when dropping or renaming a CF. Performs mbean housekeeping and invalidates CFS to other operations.
    void unregisterMBean() {
        try {
            invalid = true;
            MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
            ObjectName nameObj = new ObjectName(mbeanName);
            if (mbs.isRegistered(nameObj))
                mbs.unregisterMBean(nameObj);
            for (ColumnFamilyStore index : indexedColumns.values())
                index.unregisterMBean();
        } catch (Exception e) {
            // this shouldn't block anything.
            logger.warn(e.getMessage(), e);
        }
    }

    public long getMinRowSize() {
        return data.getMinRowSize();
    }

    public long getMaxRowSize() {
        return data.getMaxRowSize();
    }

    public long getMeanRowSize() {
        return data.getMeanRowSize();
    }

    public int getMeanColumns() {
        return data.getMeanColumns();
    }

    public static ColumnFamilyStore createColumnFamilyStore(Table table, String columnFamily) {
        return createColumnFamilyStore(table, columnFamily, StorageService.getPartitioner(),
                DatabaseDescriptor.getCFMetaData(table.name, columnFamily));
    }

    public static synchronized ColumnFamilyStore createColumnFamilyStore(Table table, String columnFamily,
            IPartitioner partitioner, CFMetaData metadata) {
        // get the max generation number, to prevent generation conflicts
        List<Integer> generations = new ArrayList<Integer>();
        for (Descriptor desc : files(table.name, columnFamily, true).keySet()) {
            generations.add(desc.generation);
            if (desc.isFromTheFuture()) {
                throw new RuntimeException(
                        String.format("Can't open sstables from the future! Current version %s, found file: %s",
                                Descriptor.CURRENT_VERSION, desc));
            }
        }
        Collections.sort(generations);
        int value = (generations.size() > 0) ? (generations.get(generations.size() - 1)) : 0;

        return new ColumnFamilyStore(table, columnFamily, partitioner, value, metadata);
    }

    /**
     * Removes unnecessary files from the cf directory at startup: these include temp files, orphans, zero-length files
     * and compacted sstables. Files that cannot be recognized will be ignored.
     * @return A list of Descriptors that were removed.
     */
    public static void scrubDataDirectories(String table, String columnFamily) {
        for (Map.Entry<Descriptor, Set<Component>> sstableFiles : files(table, columnFamily, true).entrySet()) {
            Descriptor desc = sstableFiles.getKey();
            Set<Component> components = sstableFiles.getValue();

            if (components.contains(Component.COMPACTED_MARKER) || desc.temporary) {
                SSTable.delete(desc, components);
                continue;
            }

            File dataFile = new File(desc.filenameFor(Component.DATA));
            if (components.contains(Component.DATA) && dataFile.length() > 0)
                // everything appears to be in order... moving on.
                continue;

            // missing the DATA file! all components are orphaned
            logger.warn("Removing orphans for {}: {}", desc, components);
            for (Component component : components) {
                try {
                    FileUtils.deleteWithConfirm(desc.filenameFor(component));
                } catch (IOException e) {
                    throw new IOError(e);
                }
            }
        }

        // cleanup incomplete saved caches
        Pattern tmpCacheFilePattern = Pattern.compile(table + "-" + columnFamily + "-(Key|Row)Cache.*\\.tmp$");
        File dir = new File(DatabaseDescriptor.getSavedCachesLocation());

        if (dir.exists()) {
            assert dir.isDirectory();
            for (File file : dir.listFiles())
                if (tmpCacheFilePattern.matcher(file.getName()).matches())
                    if (!file.delete())
                        logger.warn("could not delete " + file.getAbsolutePath());
        }

        // also clean out any index leftovers.
        CFMetaData cfm = DatabaseDescriptor.getCFMetaData(table, columnFamily);
        if (cfm != null) // secondary indexes aren't stored in DD.
        {
            for (ColumnDefinition def : cfm.getColumn_metadata().values())
                scrubDataDirectories(table, CFMetaData.indexName(cfm.cfName, def));
        }
    }

    // must be called after all sstables are loaded since row cache merges all row versions
    public void initCaches() {
        long start = System.currentTimeMillis();
        // results are sorted on read (via treeset) because there are few reads and many writes and reads only happen at startup
        for (DecoratedKey key : rowCache.readSaved())
            cacheRow(key);
        if (rowCache.size() > 0)
            logger.info(String.format("completed loading (%d ms; %d keys) row cache for %s.%s",
                    System.currentTimeMillis() - start, rowCache.size(), table.name, columnFamily));

        scheduleCacheSaving(metadata.getRowCacheSavePeriodInSeconds(), metadata.getKeyCacheSavePeriodInSeconds());
    }

    public void scheduleCacheSaving(int rowCacheSavePeriodInSeconds, int keyCacheSavePeriodInSeconds) {
        keyCache.scheduleSaving(keyCacheSavePeriodInSeconds);
        rowCache.scheduleSaving(rowCacheSavePeriodInSeconds);
    }

    public AutoSavingCache<Pair<Descriptor, DecoratedKey>, Long> getKeyCache() {
        return keyCache;
    }

    /**
     * Collects a map of sstable components.
     */
    private static Map<Descriptor, Set<Component>> files(String keyspace, final String columnFamily,
            final boolean includeCompacted) {
        final Map<Descriptor, Set<Component>> sstables = new HashMap<Descriptor, Set<Component>>();
        for (String directory : DatabaseDescriptor.getAllDataFileLocationsForTable(keyspace)) {
            // NB: we never "accept" a file in the FilenameFilter sense: they are added to the sstable map
            new File(directory).list(new FilenameFilter() {
                public boolean accept(File dir, String name) {
                    Pair<Descriptor, Component> component = SSTable.tryComponentFromFilename(dir, name);
                    if (component != null && component.left.cfname.equals(columnFamily)) {
                        if (includeCompacted
                                || !new File(component.left.filenameFor(Component.COMPACTED_MARKER)).exists()) {
                            Set<Component> components = sstables.get(component.left);
                            if (components == null) {
                                components = new HashSet<Component>();
                                sstables.put(component.left, components);
                            }
                            components.add(component.right);
                        } else
                            logger.debug("not including compacted sstable " + component.left.cfname + "-"
                                    + component.left.generation);
                    }
                    return false;
                }
            });
        }
        return sstables;
    }

    /**
     * @return the name of the column family
     */
    public String getColumnFamilyName() {
        return columnFamily;
    }

    /*
     * @return a temporary file name for an sstable.
     * When the sstable object is closed, it will be renamed to a non-temporary
     * format, so incomplete sstables can be recognized and removed on startup.
     */
    public String getFlushPath(long estimatedSize, String version) {
        String location = table.getDataFileLocation(estimatedSize);
        if (location == null)
            throw new RuntimeException("Insufficient disk space to flush " + estimatedSize + " bytes");
        return getTempSSTablePath(location, version);
    }

    public String getTempSSTablePath(String directory, String version) {
        Descriptor desc = new Descriptor(version, new File(directory), table.name, columnFamily,
                fileIndexGenerator.incrementAndGet(), true);
        return desc.filenameFor(Component.DATA);
    }

    public String getTempSSTablePath(String directory) {
        return getTempSSTablePath(directory, Descriptor.CURRENT_VERSION);
    }

    /** flush the given memtable and swap in a new one for its CFS, if it hasn't been frozen already.  threadsafe. */
    Future<?> maybeSwitchMemtable(Memtable oldMemtable, final boolean writeCommitLog) {
        if (oldMemtable.isFrozen()) {
            logger.debug("memtable is already frozen; another thread must be flushing it");
            return null;
        }

        /*
         * If we can get the writelock, that means no new updates can come in and
         * all ongoing updates to memtables have completed. We can get the tail
         * of the log and use it as the starting position for log replay on recovery.
         *
         * This is why we Table.flusherLock needs to be global instead of per-Table:
         * we need to schedule discardCompletedSegments calls in the same order as their
         * contexts (commitlog position) were read, even though the flush executor
         * is multithreaded.
         */
        Table.switchLock.writeLock().lock();
        try {
            if (oldMemtable.isFrozen()) {
                logger.debug("memtable is already frozen; another thread must be flushing it");
                return null;
            }

            assert getMemtableThreadSafe() == oldMemtable;
            oldMemtable.freeze();
            final ReplayPosition ctx = writeCommitLog ? CommitLog.instance.getContext() : null;

            // submit the memtable for any indexed sub-cfses, and our own.
            List<ColumnFamilyStore> icc = new ArrayList<ColumnFamilyStore>(indexedColumns.size());
            // don't assume that this.memtable is dirty; forceFlush can bring us here during index build even if it is not
            for (ColumnFamilyStore cfs : concatWithIndexes()) {
                if (!cfs.getMemtableThreadSafe().isClean())
                    icc.add(cfs);
            }
            final CountDownLatch latch = new CountDownLatch(icc.size());
            for (ColumnFamilyStore cfs : icc)
                submitFlush(cfs.data.switchMemtable(), latch, ctx);

            // we marked our memtable as frozen as part of the concurrency control,
            // so even if there was nothing to flush we need to switch it out
            if (!icc.contains(this))
                data.renewMemtable();

            if (memtableSwitchCount == Integer.MAX_VALUE)
                memtableSwitchCount = 0;
            memtableSwitchCount++;

            // when all the memtables have been written, including for indexes, mark the flush in the commitlog header.
            // a second executor makes sure the onMemtableFlushes get called in the right order,
            // while keeping the wait-for-flush (future.get) out of anything latency-sensitive.
            return postFlushExecutor.submit(new WrappedRunnable() {
                public void runMayThrow() throws InterruptedException, IOException {
                    latch.await();
                    if (writeCommitLog) {
                        // if we're not writing to the commit log, we are replaying the log, so marking
                        // the log header with "you can discard anything written before the context" is not valid
                        CommitLog.instance.discardCompletedSegments(metadata.cfId, ctx);
                    }
                }
            });
        } finally {
            Table.switchLock.writeLock().unlock();
        }
    }

    public boolean isDropped() {
        return isIndex() ? DatabaseDescriptor.getCFMetaData(table.name, getParentColumnfamily()) == null
                : DatabaseDescriptor.getCFMetaData(metadata.cfId) == null;
    }

    void switchBinaryMemtable(DecoratedKey key, ByteBuffer buffer) {
        binaryMemtable.set(new BinaryMemtable(this));
        binaryMemtable.get().put(key, buffer);
    }

    public void forceFlushIfExpired() {
        if (getMemtableThreadSafe().isExpired())
            forceFlush();
    }

    public Future<?> forceFlush() {
        // during index build, 2ary index memtables can be dirty even if parent is not.  if so,
        // we want flushLargestMemtables to flush the 2ary index ones too.
        boolean clean = true;
        for (ColumnFamilyStore cfs : concatWithIndexes())
            clean &= cfs.getMemtableThreadSafe().isClean();

        if (clean) {
            logger.debug("forceFlush requested but everything is clean");
            return null;
        }

        return maybeSwitchMemtable(getMemtableThreadSafe(), true);
    }

    public void forceBlockingFlush() throws ExecutionException, InterruptedException {
        Future<?> future = forceFlush();
        if (future != null)
            future.get();
    }

    public void forceFlushBinary() {
        if (binaryMemtable.get().isClean())
            return;

        submitFlush(binaryMemtable.get(), new CountDownLatch(1), null);
    }

    public void updateRowCache(DecoratedKey key, ColumnFamily columnFamily) {
        if (rowCache.isPutCopying()) {
            invalidateCachedRow(key);
        } else {
            ColumnFamily cachedRow = getRawCachedRow(key);
            if (cachedRow != null)
                cachedRow.addAll(columnFamily);
        }
    }

    /**
     * Insert/Update the column family for this key.
     * Caller is responsible for acquiring Table.flusherLock!
     * param @ lock - lock that needs to be used.
     * param @ key - key for update/insert
     * param @ columnFamily - columnFamily changes
     */
    Memtable apply(DecoratedKey key, ColumnFamily columnFamily) {
        long start = System.nanoTime();

        Memtable mt = getMemtableThreadSafe();
        boolean flushRequested = mt.isThresholdViolated();
        mt.put(key, columnFamily);
        updateRowCache(key, columnFamily);
        writeStats.addNano(System.nanoTime() - start);

        if (DatabaseDescriptor.estimatesRealMemtableSize()) {
            while (true) {
                long last = liveRatioComputedAt.get();
                long operations = writeStats.getOpCount();
                if (operations < 2 * last)
                    break;
                if (liveRatioComputedAt.compareAndSet(last, operations)) {
                    logger.debug("computing liveRatio of {} at {} ops", this, operations);
                    mt.updateLiveRatio();
                }
            }
        }

        return flushRequested ? mt : null;
    }

    /*
     * Insert/Update the column family for this key. param @ lock - lock that
     * Caller is responsible for acquiring Table.flusherLock!
     * param @ lock - lock that needs to be used.
     * needs to be used. param @ key - key for update/insert param @
     * columnFamily - columnFamily changes
     */
    void applyBinary(DecoratedKey key, ByteBuffer buffer) {
        long start = System.nanoTime();
        binaryMemtable.get().put(key, buffer);
        writeStats.addNano(System.nanoTime() - start);
    }

    public static ColumnFamily removeDeletedCF(ColumnFamily cf, int gcBefore) {
        // in case of a timestamp tie, tombstones get priority over non-tombstones.
        // (we want this to be deterministic to avoid confusion.)
        if (cf.getColumnCount() == 0 && cf.getLocalDeletionTime() <= gcBefore)
            return null;
        return cf;
    }

    /*
     This is complicated because we need to preserve deleted columns, supercolumns, and columnfamilies
     until they have been deleted for at least GC_GRACE_IN_SECONDS.  But, we do not need to preserve
     their contents; just the object itself as a "tombstone" that can be used to repair other
     replicas that do not know about the deletion.
     */
    public static ColumnFamily removeDeleted(ColumnFamily cf, int gcBefore) {
        if (cf == null) {
            return null;
        }

        removeDeletedColumnsOnly(cf, gcBefore);
        return removeDeletedCF(cf, gcBefore);
    }

    private static void removeDeletedColumnsOnly(ColumnFamily cf, int gcBefore) {
        if (cf.isSuper())
            removeDeletedSuper(cf, gcBefore);
        else
            removeDeletedStandard(cf, gcBefore);
    }

    private static void removeDeletedStandard(ColumnFamily cf, int gcBefore) {
        for (Map.Entry<ByteBuffer, IColumn> entry : cf.getColumnsMap().entrySet()) {
            ByteBuffer cname = entry.getKey();
            IColumn c = entry.getValue();
            // remove columns if
            // (a) the column itself is tombstoned or
            // (b) the CF is tombstoned and the column is not newer than it
            if ((c.isMarkedForDelete() && c.getLocalDeletionTime() <= gcBefore)
                    || c.timestamp() <= cf.getMarkedForDeleteAt()) {
                cf.remove(cname);
            }
        }
    }

    private static void removeDeletedSuper(ColumnFamily cf, int gcBefore) {
        // TODO assume deletion means "most are deleted?" and add to clone, instead of remove from original?
        // this could be improved by having compaction, or possibly even removeDeleted, r/m the tombstone
        // once gcBefore has passed, so if new stuff is added in it doesn't used the wrong algorithm forever
        for (Map.Entry<ByteBuffer, IColumn> entry : cf.getColumnsMap().entrySet()) {
            SuperColumn c = (SuperColumn) entry.getValue();
            long minTimestamp = Math.max(c.getMarkedForDeleteAt(), cf.getMarkedForDeleteAt());
            for (IColumn subColumn : c.getSubColumns()) {
                // remove subcolumns if
                // (a) the subcolumn itself is tombstoned or
                // (b) the supercolumn is tombstoned and the subcolumn is not newer than it
                if (subColumn.timestamp() <= minTimestamp
                        || (subColumn.isMarkedForDelete() && subColumn.getLocalDeletionTime() <= gcBefore)) {
                    c.remove(subColumn.name());
                }
            }
            if (c.getSubColumns().isEmpty() && c.getLocalDeletionTime() <= gcBefore) {
                cf.remove(c.name());
            }
        }
    }

    /**
     * Uses bloom filters to check if key may be present in any sstable in this
     * ColumnFamilyStore, minus a set of provided ones.
     *
     * Because BFs are checked, negative returns ensure that the key is not
     * present in the checked SSTables, but positive ones doesn't ensure key
     * presence.
     */
    public boolean isKeyInRemainingSSTables(DecoratedKey key, Set<? extends SSTable> sstablesToIgnore) {
        for (SSTableReader sstable : data.getSSTables()) {
            if (!sstablesToIgnore.contains(sstable) && sstable.getBloomFilter().isPresent(key.key))
                return true;
        }
        return false;
    }

    /*
     * Called after a BinaryMemtable flushes its in-memory data, or we add a file
     * via bootstrap. This information is cached in the ColumnFamilyStore.
     * This is useful for reads because the ColumnFamilyStore first looks in
     * the in-memory store and the into the disk to find the key. If invoked
     * during recoveryMode the onMemtableFlush() need not be invoked.
     *
     * param @ filename - filename just flushed to disk
     */
    public void addSSTable(SSTableReader sstable) {
        assert sstable.getColumnFamilyName().equals(columnFamily);
        data.addStreamedSSTable(sstable);
        CompactionManager.instance.submitMinorIfNeeded(this);
    }

    /*
     * Add up all the files sizes this is the worst case file
     * size for compaction of all the list of files given.
     */
    long getExpectedCompactedFileSize(Iterable<SSTableReader> sstables) {
        long expectedFileSize = 0;
        for (SSTableReader sstable : sstables) {
            long size = sstable.length();
            expectedFileSize = expectedFileSize + size;
        }
        return expectedFileSize;
    }

    /*
     *  Find the maximum size file in the list .
     */
    SSTableReader getMaxSizeFile(Iterable<SSTableReader> sstables) {
        long maxSize = 0L;
        SSTableReader maxFile = null;
        for (SSTableReader sstable : sstables) {
            if (sstable.length() > maxSize) {
                maxSize = sstable.length();
                maxFile = sstable;
            }
        }
        return maxFile;
    }

    public void forceCleanup(NodeId.OneShotRenewer renewer) throws ExecutionException, InterruptedException {
        CompactionManager.instance.performCleanup(ColumnFamilyStore.this, renewer);
    }

    public void scrub() throws ExecutionException, InterruptedException {
        snapshotWithoutFlush("pre-scrub-" + System.currentTimeMillis());
        CompactionManager.instance.performScrub(ColumnFamilyStore.this);
    }

    void markCompacted(Collection<SSTableReader> sstables) {
        data.markCompacted(sstables);
    }

    boolean isCompleteSSTables(Collection<SSTableReader> sstables) {
        return data.getSSTables().equals(new HashSet<SSTableReader>(sstables));
    }

    void replaceCompactedSSTables(Collection<SSTableReader> sstables, Iterable<SSTableReader> replacements) {
        data.replaceCompactedSSTables(sstables, replacements);
    }

    void replaceFlushed(Memtable memtable, SSTableReader sstable) {
        data.replaceFlushed(memtable, sstable);
        CompactionManager.instance.submitMinorIfNeeded(this);
    }

    public boolean isInvalid() {
        return invalid;
    }

    public void removeAllSSTables() {
        data.removeAllSSTables();
        for (ColumnFamilyStore indexedCfs : indexedColumns.values()) {
            indexedCfs.removeAllSSTables();
        }
    }

    /**
     * submits flush sort on the flushSorter executor, which will in turn submit to flushWriter when sorted.
     * TODO because our executors use CallerRunsPolicy, when flushSorter fills up, no writes will proceed
     * because the next flush will start executing on the caller, mutation-stage thread that has the
     * flush write lock held.  (writes aquire this as a read lock before proceeding.)
     * This is good, because it backpressures flushes, but bad, because we can't write until that last
     * flushing thread finishes sorting, which will almost always be longer than any of the flushSorter threads proper
     * (since, by definition, it started last).
     */
    void submitFlush(IFlushable flushable, CountDownLatch latch, ReplayPosition context) {
        logger.info("Enqueuing flush of {}", flushable);
        flushable.flushAndSignal(latch, flushSorter, flushWriter, context);
    }

    public long getMemtableColumnsCount() {
        return getMemtableThreadSafe().getOperations();
    }

    public long getMemtableDataSize() {
        return getMemtableThreadSafe().getLiveSize();
    }

    public long getTotalMemtableLiveSize() {
        long total = 0;
        for (ColumnFamilyStore cfs : concatWithIndexes())
            total += cfs.getMemtableThreadSafe().getLiveSize();
        return total;
    }

    public int getMemtableSwitchCount() {
        return memtableSwitchCount;
    }

    /**
     * get the current memtable in a threadsafe fashion.  note that simply "return memtable_" is
     * incorrect; you need to lock to introduce a thread safe happens-before ordering.
     *
     * do NOT use this method to do either a put or get on the memtable object, since it could be
     * flushed in the meantime (and its executor terminated).
     *
     * also do NOT make this method public or it will really get impossible to reason about these things.
     * @return
     */
    private Memtable getMemtableThreadSafe() {
        return data.getMemtable();
    }

    /**
     * Package protected for access from the CompactionManager.
     */
    DataTracker getDataTracker() {
        return data;
    }

    public Collection<SSTableReader> getSSTables() {
        return data.getSSTables();
    }

    public long[] getRecentSSTablesPerReadHistogram() {
        return recentSSTablesPerRead.getBuckets(true);
    }

    public long[] getSSTablesPerReadHistogram() {
        return sstablesPerRead.getBuckets(false);
    }

    public long getReadCount() {
        return readStats.getOpCount();
    }

    public double getRecentReadLatencyMicros() {
        return readStats.getRecentLatencyMicros();
    }

    public long[] getLifetimeReadLatencyHistogramMicros() {
        return readStats.getTotalLatencyHistogramMicros();
    }

    public long[] getRecentReadLatencyHistogramMicros() {
        return readStats.getRecentLatencyHistogramMicros();
    }

    public long getTotalReadLatencyMicros() {
        return readStats.getTotalLatencyMicros();
    }

    // TODO this actually isn't a good meature of pending tasks
    public int getPendingTasks() {
        return Table.switchLock.getQueueLength();
    }

    public long getWriteCount() {
        return writeStats.getOpCount();
    }

    public long getTotalWriteLatencyMicros() {
        return writeStats.getTotalLatencyMicros();
    }

    public double getRecentWriteLatencyMicros() {
        return writeStats.getRecentLatencyMicros();
    }

    public long[] getLifetimeWriteLatencyHistogramMicros() {
        return writeStats.getTotalLatencyHistogramMicros();
    }

    public long[] getRecentWriteLatencyHistogramMicros() {
        return writeStats.getRecentLatencyHistogramMicros();
    }

    public ColumnFamily getColumnFamily(DecoratedKey key, QueryPath path, ByteBuffer start, ByteBuffer finish,
            boolean reversed, int limit) {
        return getColumnFamily(QueryFilter.getSliceFilter(key, path, start, finish, reversed, limit));
    }

    /**
     * get a list of columns starting from a given column, in a specified order.
     * only the latest version of a column is returned.
     * @return null if there is no data and no tombstones; otherwise a ColumnFamily
     */
    public ColumnFamily getColumnFamily(QueryFilter filter) {
        return getColumnFamily(filter, gcBefore());
    }

    public int gcBefore() {
        return (int) (System.currentTimeMillis() / 1000) - metadata.getGcGraceSeconds();
    }

    private ColumnFamily cacheRow(DecoratedKey key) {
        ColumnFamily cached;
        if ((cached = rowCache.get(key)) == null) {
            cached = getTopLevelColumns(QueryFilter.getIdentityFilter(key, new QueryPath(columnFamily)),
                    Integer.MIN_VALUE);
            if (cached == null)
                return null;

            if (!rowCache.isPutCopying()) {
                // make a deep copy of column data so we don't keep references to direct buffers, which
                // would prevent munmap post-compaction.
                for (IColumn column : cached.getSortedColumns()) {
                    cached.remove(column.name());
                    cached.addColumn(column.localCopy(this));
                }
            }

            // avoid keeping a permanent reference to the original key buffer
            rowCache.put(new DecoratedKey(key.token, ByteBufferUtil.clone(key.key)), cached);
        }
        return cached;
    }

    private ColumnFamily getColumnFamily(QueryFilter filter, int gcBefore) {
        assert columnFamily.equals(filter.getColumnFamilyName()) : filter.getColumnFamilyName();

        long start = System.nanoTime();
        try {
            if (rowCache.getCapacity() == 0) {
                ColumnFamily cf = getTopLevelColumns(filter, gcBefore);

                // TODO this is necessary because when we collate supercolumns together, we don't check
                // their subcolumns for relevance, so we need to do a second prune post facto here.
                return cf.isSuper() ? removeDeleted(cf, gcBefore) : removeDeletedCF(cf, gcBefore);
            }

            ColumnFamily cached = cacheRow(filter.key);
            if (cached == null)
                return null;

            return filterColumnFamily(cached, filter, gcBefore);
        } finally {
            readStats.addNano(System.nanoTime() - start);
        }
    }

    /** filter a cached row, which will not be modified by the filter, but may be modified by throwing out
     *  tombstones that are no longer relevant. */
    ColumnFamily filterColumnFamily(ColumnFamily cached, QueryFilter filter, int gcBefore) {
        // special case slicing the entire row:
        // we can skip the filter step entirely, and we can help out removeDeleted by re-caching the result
        // if any tombstones have aged out since last time.  (This means that the row cache will treat gcBefore as
        // max(gcBefore, all previous gcBefore), which is fine for correctness.)
        //
        // But, if the filter is asking for less columns than we have cached, we fall back to the slow path
        // since we have to copy out a subset.
        if (filter.filter instanceof SliceQueryFilter) {
            SliceQueryFilter sliceFilter = (SliceQueryFilter) filter.filter;
            if (sliceFilter.start.remaining() == 0 && sliceFilter.finish.remaining() == 0) {
                if (cached.isSuper() && filter.path.superColumnName != null) {
                    // subcolumns from named supercolumn
                    IColumn sc = cached.getColumn(filter.path.superColumnName);
                    if (sc == null || sliceFilter.count >= sc.getSubColumns().size()) {
                        ColumnFamily cf = cached.cloneMeShallow();
                        if (sc != null)
                            cf.addColumn(sc);
                        return removeDeleted(cf, gcBefore);
                    }
                } else {
                    // top-level columns
                    if (sliceFilter.count >= cached.getColumnCount()) {
                        removeDeletedColumnsOnly(cached, gcBefore);
                        return removeDeletedCF(cached, gcBefore);
                    }
                }
            }
        }

        IColumnIterator ci = filter.getMemtableColumnIterator(cached, null, getComparator());
        ColumnFamily cf = ci.getColumnFamily().cloneMeShallow();
        filter.collectCollatedColumns(cf, ci, gcBefore);
        // TODO this is necessary because when we collate supercolumns together, we don't check
        // their subcolumns for relevance, so we need to do a second prune post facto here.
        return cf.isSuper() ? removeDeleted(cf, gcBefore) : removeDeletedCF(cf, gcBefore);
    }

    private ColumnFamily getTopLevelColumns(QueryFilter filter, int gcBefore) {
        // we are querying top-level columns, do a merging fetch with indexes.
        List<IColumnIterator> iterators = new ArrayList<IColumnIterator>();
        final ColumnFamily returnCF = ColumnFamily.create(metadata);
        try {
            IColumnIterator iter;
            int sstablesToIterate = 0;
            DataTracker.View currentView = data.getView();

            /* add the current memtable */
            iter = filter.getMemtableColumnIterator(currentView.memtable, getComparator());
            if (iter != null) {
                returnCF.delete(iter.getColumnFamily());
                iterators.add(iter);
            }

            /* add the memtables being flushed */
            for (Memtable memtable : currentView.memtablesPendingFlush) {
                iter = filter.getMemtableColumnIterator(memtable, getComparator());
                if (iter != null) {
                    returnCF.delete(iter.getColumnFamily());
                    iterators.add(iter);
                }
            }

            /* add the SSTables on disk */
            for (SSTableReader sstable : currentView.sstables) {
                iter = filter.getSSTableColumnIterator(sstable);
                if (iter.getColumnFamily() != null) {
                    returnCF.delete(iter.getColumnFamily());
                    iterators.add(iter);
                    sstablesToIterate++;
                }
            }

            recentSSTablesPerRead.add(sstablesToIterate);
            sstablesPerRead.add(sstablesToIterate);

            Comparator<IColumn> comparator = filter.filter.getColumnComparator(getComparator());
            Iterator collated = IteratorUtils.collatedIterator(comparator, iterators);

            filter.collectCollatedColumns(returnCF, collated, gcBefore);

            // Caller is responsible for final removeDeletedCF.  This is important for cacheRow to work correctly:
            // we need to distinguish between "there is no data at all for this row" (BF will let us rebuild that efficiently)
            // and "there used to be data, but it's gone now" (we should cache the empty CF so we don't need to rebuild that slower)
            return returnCF;
        } finally {
            /* close all cursors */
            for (IColumnIterator ci : iterators) {
                try {
                    ci.close();
                } catch (Throwable th) {
                    logger.error("error closing " + ci, th);
                }
            }
        }
    }

    /**
      * Fetch a range of rows and columns from memtables/sstables.
      * 
      * @param superColumn optional SuperColumn to slice subcolumns of; null to slice top-level columns
      * @param range Either a Bounds, which includes start key, or a Range, which does not.
      * @param maxResults Maximum rows to return
      * @param columnFilter description of the columns we're interested in for each row
      * @return true if we found all keys we were looking for, otherwise false
     */
    public List<Row> getRangeSlice(ByteBuffer superColumn, final AbstractBounds range, int maxResults,
            IFilter columnFilter) throws ExecutionException, InterruptedException {
        assert range instanceof Bounds || (!((Range) range).isWrapAround()
                || range.right.equals(StorageService.getPartitioner().getMinimumToken())) : range;

        DecoratedKey startWith = new DecoratedKey(range.left, null);
        DecoratedKey stopAt = new DecoratedKey(range.right, null);

        QueryFilter filter = new QueryFilter(null, new QueryPath(columnFamily, superColumn, null), columnFilter);
        int gcBefore = (int) (System.currentTimeMillis() / 1000) - metadata.getGcGraceSeconds();

        DataTracker.View currentView = data.getView();
        Collection<Memtable> memtables = new ArrayList<Memtable>();
        memtables.add(currentView.memtable);
        memtables.addAll(currentView.memtablesPendingFlush);
        // It is fine to aliases the View.sstables since it's an unmodifiable collection
        Collection<SSTableReader> sstables = currentView.sstables;

        RowIterator iterator = RowIteratorFactory.getIterator(memtables, sstables, startWith, stopAt, filter,
                getComparator(), this);
        List<Row> rows = new ArrayList<Row>();

        try {
            // pull rows out of the iterator
            boolean first = true;
            while (iterator.hasNext()) {
                Row current = iterator.next();
                DecoratedKey key = current.key;

                if (!stopAt.isEmpty() && stopAt.compareTo(key) < 0)
                    return rows;

                // skip first one
                if (range instanceof Bounds || !first || !key.equals(startWith)) {
                    // TODO this is necessary because when we collate supercolumns together, we don't check
                    // their subcolumns for relevance, so we need to do a second prune post facto here.
                    rows.add(current.cf != null && current.cf.isSuper()
                            ? new Row(current.key, ColumnFamilyStore.removeDeleted(current.cf, gcBefore))
                            : current);
                    if (logger.isDebugEnabled())
                        logger.debug("scanned " + key);
                }
                first = false;

                if (rows.size() >= maxResults)
                    return rows;
            }
        } finally {
            try {
                iterator.close();
            } catch (IOException e) {
                throw new IOError(e);
            }
        }

        return rows;
    }

    public List<Row> scan(IndexClause clause, AbstractBounds range, IFilter dataFilter) {
        // Start with the most-restrictive indexed clause, then apply remaining clauses
        // to each row matching that clause.
        // TODO: allow merge join instead of just one index + loop
        IndexExpression primary = highestSelectivityPredicate(clause);
        ColumnFamilyStore indexCFS = getIndexedColumnFamilyStore(primary.column_name);
        if (logger.isDebugEnabled())
            logger.debug("Primary scan clause is " + getComparator().getString(primary.column_name));
        assert indexCFS != null;
        DecoratedKey indexKey = indexCFS.partitioner.decorateKey(primary.value);

        // if the slicepredicate doesn't contain all the columns for which we have expressions to evaluate,
        // it needs to be expanded to include those too
        IFilter firstFilter = dataFilter;
        NamesQueryFilter extraFilter = null;
        if (clause.expressions.size() > 1) {
            if (dataFilter instanceof SliceQueryFilter) {
                // if we have a high chance of getting all the columns in a single index slice, do that.
                // otherwise, create an extraFilter to fetch by name the columns referenced by the additional expressions.
                if (getMaxRowSize() < DatabaseDescriptor.getColumnIndexSize()) {
                    logger.debug("Expanding slice filter to entire row to cover additional expressions");
                    firstFilter = new SliceQueryFilter(ByteBufferUtil.EMPTY_BYTE_BUFFER,
                            ByteBufferUtil.EMPTY_BYTE_BUFFER, ((SliceQueryFilter) dataFilter).reversed,
                            Integer.MAX_VALUE);
                } else {
                    logger.debug("adding extraFilter to cover additional expressions");
                    SortedSet<ByteBuffer> columns = new TreeSet<ByteBuffer>(getComparator());
                    for (IndexExpression expr : clause.expressions) {
                        if (expr == primary)
                            continue;
                        columns.add(expr.column_name);
                    }
                    extraFilter = new NamesQueryFilter(columns);
                }
            } else {
                logger.debug("adding columns to firstFilter to cover additional expressions");
                // just add in columns that are not part of the resultset
                assert dataFilter instanceof NamesQueryFilter;
                SortedSet<ByteBuffer> columns = new TreeSet<ByteBuffer>(getComparator());
                for (IndexExpression expr : clause.expressions) {
                    if (expr == primary || ((NamesQueryFilter) dataFilter).columns.contains(expr.column_name))
                        continue;
                    columns.add(expr.column_name);
                }
                if (columns.size() > 0) {
                    columns.addAll(((NamesQueryFilter) dataFilter).columns);
                    firstFilter = new NamesQueryFilter(columns);
                }
            }
        }

        List<Row> rows = new ArrayList<Row>();
        ByteBuffer startKey = clause.start_key;
        QueryPath path = new QueryPath(columnFamily);

        // we need to store last data key accessed to avoid duplicate results
        // because in the while loop new iteration we can access the same column if start_key was not set
        ByteBuffer lastDataKey = null;

        // fetch row keys matching the primary expression, fetch the slice predicate for each
        // and filter by remaining expressions.  repeat until finished w/ assigned range or index row is exhausted.
        outer: while (true) {
            /* we don't have a way to get the key back from the DK -- we just have a token --
             * so, we need to loop after starting with start_key, until we get to keys in the given `range`.
             * But, if the calling StorageProxy is doing a good job estimating data from each range, the range
             * should be pretty close to `start_key`. */
            if (logger.isDebugEnabled())
                logger.debug(String.format("Scanning index %s starting with %s", expressionString(primary),
                        indexCFS.getComparator().getString(startKey)));

            // We shouldn't fetch only 1 row as this provides buggy paging in case the first row doesn't satisfy all clauses
            int count = Math.max(clause.count, 2);
            QueryFilter indexFilter = QueryFilter.getSliceFilter(indexKey,
                    new QueryPath(indexCFS.getColumnFamilyName()), startKey, ByteBufferUtil.EMPTY_BYTE_BUFFER,
                    false, count);
            ColumnFamily indexRow = indexCFS.getColumnFamily(indexFilter);
            logger.debug("fetched {}", indexRow);
            if (indexRow == null)
                break;

            ByteBuffer dataKey = null;
            int n = 0;
            for (IColumn column : indexRow.getSortedColumns()) {
                if (column.isMarkedForDelete())
                    continue;
                dataKey = column.name();
                n++;

                DecoratedKey dk = partitioner.decorateKey(dataKey);
                if (!range.right.equals(partitioner.getMinimumToken()) && range.right.compareTo(dk.token) < 0)
                    break outer;
                if (!range.contains(dk.token) || dataKey.equals(lastDataKey))
                    continue;

                // get the row columns requested, and additional columns for the expressions if necessary
                ColumnFamily data = getColumnFamily(new QueryFilter(dk, path, firstFilter));
                assert data != null : String.format(
                        "No data found for %s in %s:%s (original filter %s) from expression %s", firstFilter, dk,
                        path, dataFilter, expressionString(primary));
                logger.debug("fetched data row {}", data);
                if (extraFilter != null) {
                    // we might have gotten the expression columns in with the main data slice, but
                    // we can't know for sure until that slice is done.  So, we'll do the extra query
                    // if we go through and any expression columns are not present.
                    for (IndexExpression expr : clause.expressions) {
                        if (expr != primary && data.getColumn(expr.column_name) == null) {
                            data.addAll(getColumnFamily(new QueryFilter(dk, path, extraFilter)));
                            break;
                        }
                    }
                }

                if (satisfies(data, clause, primary)) {
                    logger.debug("row {} satisfies all clauses", data);
                    // cut the resultset back to what was requested, if necessary
                    if (firstFilter != dataFilter) {
                        ColumnFamily expandedData = data;
                        data = expandedData.cloneMeShallow();
                        IColumnIterator iter = dataFilter.getMemtableColumnIterator(expandedData, dk,
                                getComparator());
                        new QueryFilter(dk, path, dataFilter).collectCollatedColumns(data, iter, gcBefore());
                    }

                    rows.add(new Row(dk, data));
                }

                if (rows.size() == clause.count)
                    break outer;
            }
            if (n < clause.count || startKey.equals(dataKey))
                break;

            lastDataKey = startKey = dataKey;
        }

        return rows;
    }

    private String expressionString(IndexExpression expr) {
        return String.format("'%s.%s %s %s'", columnFamily, getComparator().getString(expr.column_name), expr.op,
                metadata.getColumn_metadata().get(expr.column_name).getValidator().getString(expr.value));
    }

    private IndexExpression highestSelectivityPredicate(IndexClause clause) {
        IndexExpression best = null;
        int bestMeanCount = Integer.MAX_VALUE;
        for (IndexExpression expression : clause.expressions) {
            ColumnFamilyStore cfs = getIndexedColumnFamilyStore(expression.column_name);
            if (cfs == null || !expression.op.equals(IndexOperator.EQ))
                continue;
            int columns = cfs.getMeanColumns();
            if (columns < bestMeanCount) {
                best = expression;
                bestMeanCount = columns;
            }
        }
        return best;
    }

    private static boolean satisfies(ColumnFamily data, IndexClause clause, IndexExpression first) {
        for (IndexExpression expression : clause.expressions) {
            // (we can skip "first" since we already know it's satisfied)
            if (expression == first)
                continue;
            // check column data vs expression
            IColumn column = data.getColumn(expression.column_name);
            if (column == null)
                return false;
            int v = data.metadata().getValueValidator(expression.column_name).compare(column.value(),
                    expression.value);
            if (!satisfies(v, expression.op))
                return false;
        }
        return true;
    }

    private static boolean satisfies(int comparison, IndexOperator op) {
        switch (op) {
        case EQ:
            return comparison == 0;
        case GTE:
            return comparison >= 0;
        case GT:
            return comparison > 0;
        case LTE:
            return comparison <= 0;
        case LT:
            return comparison < 0;
        default:
            throw new IllegalStateException();
        }
    }

    public AbstractType getComparator() {
        return metadata.comparator;
    }

    private void snapshotWithoutFlush(String snapshotName) {
        for (ColumnFamilyStore cfs : concatWithIndexes()) {
            for (SSTableReader ssTable : cfs.data.getSSTables()) {
                try {
                    // mkdir
                    File dataDirectory = ssTable.descriptor.directory.getParentFile();
                    String snapshotDirectoryPath = Table.getSnapshotPath(dataDirectory.getAbsolutePath(),
                            table.name, snapshotName);
                    FileUtils.createDirectory(snapshotDirectoryPath);

                    // hard links
                    ssTable.createLinks(snapshotDirectoryPath);
                    if (logger.isDebugEnabled())
                        logger.debug("Snapshot for " + table + " keyspace data file " + ssTable.getFilename()
                                + " created in " + snapshotDirectoryPath);
                } catch (IOException e) {
                    throw new IOError(e);
                }
            }
        }
    }

    /**
     * Take a snap shot of this columnfamily store.
     * 
     * @param snapshotName the name of the associated with the snapshot 
     */
    public void snapshot(String snapshotName) {
        try {
            forceBlockingFlush();
        } catch (ExecutionException e) {
            throw new RuntimeException(e);
        } catch (InterruptedException e) {
            throw new AssertionError(e);
        }

        snapshotWithoutFlush(snapshotName);
    }

    public boolean hasUnreclaimedSpace() {
        return data.getLiveSize() < data.getTotalSize();
    }

    public long getTotalDiskSpaceUsed() {
        return data.getTotalSize();
    }

    public long getLiveDiskSpaceUsed() {
        return data.getLiveSize();
    }

    public int getLiveSSTableCount() {
        return data.getSSTables().size();
    }

    /** raw cached row -- does not fetch the row if it is not present.  not counted in cache statistics.  */
    public ColumnFamily getRawCachedRow(DecoratedKey key) {
        return rowCache.getCapacity() == 0 ? null : rowCache.getInternal(key);
    }

    public void invalidateCachedRow(DecoratedKey key) {
        rowCache.remove(key);
    }

    public void forceMajorCompaction() throws InterruptedException, ExecutionException {
        CompactionManager.instance.performMajor(this);
    }

    public void invalidateRowCache() {
        rowCache.clear();
    }

    public void invalidateKeyCache() {
        keyCache.clear();
    }

    public int getRowCacheCapacity() {
        return rowCache.getCapacity();
    }

    public int getKeyCacheCapacity() {
        return keyCache.getCapacity();
    }

    public int getRowCacheSize() {
        return rowCache.size();
    }

    public int getKeyCacheSize() {
        return keyCache.size();
    }

    public static Iterable<ColumnFamilyStore> all() {
        Iterable<ColumnFamilyStore>[] stores = new Iterable[DatabaseDescriptor.getTables().size()];
        int i = 0;
        for (Table table : Table.all()) {
            stores[i++] = table.getColumnFamilyStores();
        }
        return Iterables.concat(stores);
    }

    public Iterable<DecoratedKey> allKeySamples() {
        Collection<SSTableReader> sstables = getSSTables();
        Iterable<DecoratedKey>[] samples = new Iterable[sstables.size()];
        int i = 0;
        for (SSTableReader sstable : sstables) {
            samples[i++] = sstable.getKeySamples();
        }
        return Iterables.concat(samples);
    }

    public Iterable<DecoratedKey> keySamples(Range range) {
        Collection<SSTableReader> sstables = getSSTables();
        Iterable<DecoratedKey>[] samples = new Iterable[sstables.size()];
        int i = 0;
        for (SSTableReader sstable : sstables) {
            samples[i++] = sstable.getKeySamples(range);
        }
        return Iterables.concat(samples);
    }

    /**
     * For testing.  no effort is made to clear historical memtables, nor for
     * thread safety
     */
    void clearUnsafe() {
        data.init();
    }

    /**
     * Truncate practically deletes the entire column family's data
     * @return a Future to the delete operation. Call the future's get() to make
     * sure the column family has been deleted
     */
    public Future<?> truncate() throws IOException {
        // snapshot will also flush, but we want to truncate the most possible, and anything in a flush written
        // after truncateAt won't be truncated.
        try {
            forceBlockingFlush();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        final long truncatedAt = System.currentTimeMillis();
        snapshot(Table.getTimestampedSnapshotName("before-truncate"));

        Runnable runnable = new WrappedRunnable() {
            public void runMayThrow() throws InterruptedException, IOException {
                // putting markCompacted on the commitlogUpdater thread ensures it will run
                // after any compactions that were in progress when truncate was called, are finished
                for (ColumnFamilyStore cfs : concatWithIndexes()) {
                    List<SSTableReader> truncatedSSTables = new ArrayList<SSTableReader>();
                    for (SSTableReader sstable : cfs.getSSTables()) {
                        if (!sstable.newSince(truncatedAt))
                            truncatedSSTables.add(sstable);
                    }
                    cfs.data.markCompacted(truncatedSSTables);
                }

                // Invalidate row cache
                invalidateRowCache();
            }
        };

        return postFlushExecutor.submit(runnable);
    }

    // if this errors out, we are in a world of hurt.
    public void renameSSTables(String newCfName) throws IOException {
        // complete as much of the job as possible.  Don't let errors long the way prevent as much renaming as possible
        // from happening.
        IOException mostRecentProblem = null;
        for (File existing : DefsTable.getFiles(table.name, columnFamily)) {
            try {
                String newFileName = existing.getName().replaceFirst("\\w+-", newCfName + "-");
                FileUtils.renameWithConfirm(existing, new File(existing.getParent(), newFileName));
            } catch (IOException ex) {
                mostRecentProblem = ex;
            }
        }
        if (mostRecentProblem != null)
            throw new IOException(
                    "One or more IOExceptions encountered while renaming files. Most recent problem is included.",
                    mostRecentProblem);

        for (ColumnFamilyStore indexedCfs : indexedColumns.values()) {
            indexedCfs.renameSSTables(indexedCfs.columnFamily.replace(columnFamily, newCfName));
        }
    }

    public long getBloomFilterFalsePositives() {
        return data.getBloomFilterFalsePositives();
    }

    public long getRecentBloomFilterFalsePositives() {
        return data.getRecentBloomFilterFalsePositives();
    }

    public double getBloomFilterFalseRatio() {
        return data.getBloomFilterFalseRatio();
    }

    public double getRecentBloomFilterFalseRatio() {
        return data.getRecentBloomFilterFalseRatio();
    }

    public SortedSet<ByteBuffer> getIndexedColumns() {
        return indexedColumns.keySet();
    }

    public ColumnFamilyStore getIndexedColumnFamilyStore(ByteBuffer column) {
        return indexedColumns.get(column);
    }

    public ColumnFamily newIndexedColumnFamily(ByteBuffer column) {
        return ColumnFamily.create(indexedColumns.get(column).metadata);
    }

    public DecoratedKey<LocalToken> getIndexKeyFor(ByteBuffer name, ByteBuffer value) {
        return indexedColumns.get(name).partitioner.decorateKey(value);
    }

    @Override
    public String toString() {
        return "ColumnFamilyStore(" + "table='" + table.name + '\'' + ", columnFamily='" + columnFamily + '\''
                + ')';
    }

    public void disableAutoCompaction() {
        minCompactionThreshold.set(0);
        maxCompactionThreshold.set(0);
    }

    /*
     JMX getters and setters for the Default<T>s.
       - get/set minCompactionThreshold
       - get/set maxCompactionThreshold
       - get     memsize
       - get     memops
       - get/set memtime
       - get/set rowCacheSavePeriodInSeconds
       - get/set keyCacheSavePeriodInSeconds
     */
    public int getMinimumCompactionThreshold() {
        return minCompactionThreshold.value();
    }

    public void setMinimumCompactionThreshold(int minCompactionThreshold) {
        if ((minCompactionThreshold > this.maxCompactionThreshold.value())
                && this.maxCompactionThreshold.value() != 0) {
            throw new RuntimeException("The min_compaction_threshold cannot be larger than the max.");
        }
        this.minCompactionThreshold.set(minCompactionThreshold);
    }

    public int getMaximumCompactionThreshold() {
        return maxCompactionThreshold.value();
    }

    public void setMaximumCompactionThreshold(int maxCompactionThreshold) {
        if (maxCompactionThreshold < this.minCompactionThreshold.value()) {
            throw new RuntimeException("The max_compaction_threshold cannot be smaller than the min.");
        }
        this.maxCompactionThreshold.set(maxCompactionThreshold);
    }

    public int getMemtableFlushAfterMins() {
        return memtime.value();
    }

    public void setMemtableFlushAfterMins(int time) {
        if (time <= 0) {
            throw new RuntimeException("MemtableFlushAfterMins must be greater than 0.");
        }
        this.memtime.set(time);
    }

    public int getMemtableThroughputInMB() {
        return memsize.value();
    }

    public void setMemtableThroughputInMB(int size) throws ConfigurationException {
        DatabaseDescriptor.validateMemtableThroughput(size);
        memsize.set(size);
    }

    public double getMemtableOperationsInMillions() {
        return memops.value();
    }

    public void setMemtableOperationsInMillions(double ops) throws ConfigurationException {
        DatabaseDescriptor.validateMemtableOperations(ops);
        memops.set(ops);
    }

    public int getRowCacheSavePeriodInSeconds() {
        return rowCacheSaveInSeconds.value();
    }

    public void setRowCacheSavePeriodInSeconds(int rcspis) {
        if (rcspis < 0) {
            throw new RuntimeException("RowCacheSavePeriodInSeconds must be non-negative.");
        }
        this.rowCacheSaveInSeconds.set(rcspis);
        scheduleCacheSaving(rowCacheSaveInSeconds.value(), keyCacheSaveInSeconds.value());
    }

    public int getKeyCacheSavePeriodInSeconds() {
        return keyCacheSaveInSeconds.value();
    }

    public void setKeyCacheSavePeriodInSeconds(int kcspis) {
        if (kcspis < 0) {
            throw new RuntimeException("KeyCacheSavePeriodInSeconds must be non-negative.");
        }
        this.keyCacheSaveInSeconds.set(kcspis);
        scheduleCacheSaving(rowCacheSaveInSeconds.value(), keyCacheSaveInSeconds.value());
    }
    // End JMX get/set.

    public long estimateKeys() {
        return data.estimatedKeys();
    }

    /**
     * Resizes the key and row caches based on the current key estimate.
     */
    public synchronized void updateCacheSizes() {
        long keys = estimateKeys();
        keyCache.updateCacheSize(keys);
        rowCache.updateCacheSize(keys);
    }

    public long[] getEstimatedRowSizeHistogram() {
        return data.getEstimatedRowSizeHistogram();
    }

    public long[] getEstimatedColumnCountHistogram() {
        return data.getEstimatedRowSizeHistogram();
    }

    /**
     * Check if index is already built for current store
     * @return true if built, false otherwise
     */
    public boolean isIndexBuilt() {
        return SystemTable.isIndexBuilt(table.name, columnFamily);
    }

    /**
     * Returns a list of the names of the built column indexes for current store
     * @return list of the index names
     */
    public List<String> getBuiltIndexes() {
        List<String> indexes = new ArrayList<String>();

        for (ColumnFamilyStore cfs : indexedColumns.values()) {
            if (cfs.isIndexBuilt()) {
                indexes.add(cfs.columnFamily); // store.columnFamily represents a name of the index
            }
        }

        return indexes;
    }

    /** true if this CFS contains secondary index data */
    public boolean isIndex() {
        return partitioner instanceof LocalPartitioner;
    }

    private String getParentColumnfamily() {
        assert isIndex();
        return columnFamily.split("\\.")[0];
    }

    /**
     * sets each cache's maximum capacity to 75% of its current size
     */
    public void reduceCacheSizes() {
        rowCache.reduceCacheSize();
        keyCache.reduceCacheSize();
    }

    private ByteBuffer intern(ByteBuffer name) {
        ByteBuffer internedName = internedNames.get(name);
        if (internedName == null) {
            internedName = ByteBufferUtil.clone(name);
            ByteBuffer concurrentName = internedNames.putIfAbsent(internedName, internedName);
            if (concurrentName != null)
                internedName = concurrentName;
        }
        return internedName;
    }

    public ByteBuffer internOrCopy(ByteBuffer name) {
        if (internedNames.size() >= INTERN_CUTOFF)
            return ByteBufferUtil.clone(name);

        return intern(name);
    }

    public ByteBuffer maybeIntern(ByteBuffer name) {
        if (internedNames.size() >= INTERN_CUTOFF)
            return name;

        return intern(name);
    }

    public SSTableWriter createFlushWriter(long estimatedRows, long estimatedSize, ReplayPosition context)
            throws IOException {
        return new SSTableWriter(getFlushPath(estimatedSize, Descriptor.CURRENT_VERSION), estimatedRows, metadata,
                partitioner, context);
    }

    public SSTableWriter createCompactionWriter(long estimatedRows, String location,
            Collection<SSTableReader> sstables) throws IOException {
        ReplayPosition rp = ReplayPosition.getReplayPosition(sstables);
        return new SSTableWriter(getTempSSTablePath(location), estimatedRows, metadata, partitioner, rp);
    }

    public Iterable<ColumnFamilyStore> concatWithIndexes() {
        return Iterables.concat(indexedColumns.values(), Collections.singleton(this));
    }

    public Set<Memtable> getMemtablesPendingFlush() {
        return data.getMemtablesPendingFlush();
    }
}