com.cloudera.impala.catalog.HdfsPartitionLocationCompressor.java Source code

Introduction

Here is the source code for com.cloudera.impala.catalog.HdfsPartitionLocationCompressor.java
Source

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package com.cloudera.impala.catalog;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import com.cloudera.impala.common.Pair;
import com.cloudera.impala.thrift.THdfsPartitionLocation;
import com.cloudera.impala.util.ListMap;
import com.google.common.base.Preconditions;

/**
 * Utility class for storing HdfsPartition locations in a compressed format.  Each
 * instance of this class is owned by a single HdfsTable instance.
 *
 * This class is not thread-safe by itself since it is only modified when the lock on an
 * HdfsTable object is held.
 *
 * TODO: Generalize this to compress other sets of Strings that are likely to share common
 * prefixes, like table locations.
 *
 */
class HdfsPartitionLocationCompressor {
    // Number of trailing directories that Location.decompose() strips from a partition
    // location to obtain its prefix. Mutable via setClusteringColumns().
    int numClusteringColumns_;

    // A bi-directional map between partition location prefixes and their compressed
    // representation, an int.
    private final ListMap<String> prefixMap_ = new ListMap<String>();

    // Construct an HdfsPartitionLocationCompressor with an empty prefix map.
    public HdfsPartitionLocationCompressor(int numClusteringColumns) {
        numClusteringColumns_ = numClusteringColumns;
    }

    // Construct an HdfsPartitionLocationCompressor whose bidirectional map 'prefixMap_'
    // is pre-filled with 'prefixes'.
    public HdfsPartitionLocationCompressor(int numClusteringColumns, ArrayList<String> prefixes) {
        numClusteringColumns_ = numClusteringColumns;
        prefixMap_.populate(prefixes);
    }

    // Sets the number of trailing directories treated as partition directories by
    // subsequently constructed Locations.
    public void setClusteringColumns(int numClusteringColumns) {
        numClusteringColumns_ = numClusteringColumns;
    }

    // Returns the list view of 'prefixMap_' as handed out by ListMap.getList().
    public List<String> getPrefixes() {
        return prefixMap_.getList();
    }

    // One direction of the map: returns the prefix associated with an index, or "" if the
    // index is -1. Indexes less than -1 or greater than prefixMap_.size()-1 are invalid
    // and cause an IndexOutOfBoundsException to be thrown (by
    // Preconditions.checkElementIndex).
    private String indexToPrefix(int i) {
        // Uncompressed locations are represented by -1:
        if (i == -1) {
            return "";
        }
        Preconditions.checkElementIndex(i, prefixMap_.size());
        return prefixMap_.getEntry(i);
    }

    // The other direction of the map: compress a location prefix to its index, adding it
    // to 'prefixMap_' if it is not already present.
    private int prefixToIndex(String s) {
        return prefixMap_.getIndex(s);
    }

    // A surrogate for THdfsPartitionLocation, which represents a partition's location
    // relative to its parent table's list of partition prefixes.
    //
    // This is a non-static inner class on purpose: a Location only stores an index, and
    // resolves it against the enclosing compressor's 'prefixMap_' in toString().
    public class Location {
        // 'prefix_index_' represents the portion of the partition's location that comes before
        // the last N directories, where N is the number of partitioning columns.
        // 'prefix_index_' is an index into HdfsPartitionLocationCompressor.this.prefixMap_.
        // 'suffix_' is the rest of the partition location.
        //
        // TODO: Since each partition stores the literal values for the partitioning columns,
        // we could also elide the column names and values from suffix_ when a partition is in
        // the canonical location "/partitioning_column_name_1=value_1/..."
        private final int prefix_index_;
        private final String suffix_;

        // Compresses 'location' by splitting it into prefix and suffix and registering the
        // prefix with the enclosing compressor. 'location' must not be null.
        public Location(String location) {
            Preconditions.checkNotNull(location);
            Pair<String, String> locationParts = decompose(location);
            prefix_index_ = HdfsPartitionLocationCompressor.this.prefixToIndex(locationParts.first);
            suffix_ = locationParts.second;
        }

        // Rebuilds a Location from its thrift form. The prefix index is taken as-is; it is
        // only validated against the enclosing compressor's map when toString() is called.
        public Location(THdfsPartitionLocation thrift) {
            Preconditions.checkNotNull(thrift);
            prefix_index_ = thrift.prefix_index;
            suffix_ = thrift.getSuffix();
        }

        // Returns the thrift form (prefix index, suffix) of this Location.
        public THdfsPartitionLocation toThrift() {
            return new THdfsPartitionLocation(prefix_index_, suffix_);
        }

        // Returns the full, uncompressed location: the prefix looked up in the enclosing
        // compressor followed by the suffix.
        @Override
        public String toString() {
            return HdfsPartitionLocationCompressor.this.indexToPrefix(prefix_index_) + suffix_;
        }

        // Hashes the uncompressed location, consistent with equals().
        @Override
        public int hashCode() {
            return toString().hashCode();
        }

        // Two Locations are equal when their uncompressed location strings have the same
        // contents. The comparison must use String.equals(): toString() creates a new String
        // via concatenation on every call, so a reference comparison (==) would report even
        // a Location compared with itself as unequal.
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            return (obj instanceof Location) && toString().equals(obj.toString());
        }

        // Decompose a location string by removing its last N directories, where N is the
        // number of clustering columns. The result is a Pair<String,String> where the first
        // String is the prefix and the second is the suffix. (In other words, their
        // concatenation equals the input.) If N is 0, the prefix is the entire input and the
        // suffix is empty. Otherwise, if the input does not contain enough '/' characters to
        // strip N directories, the prefix is empty and the suffix is the entire input.
        private Pair<String, String> decompose(String s) {
            Preconditions.checkNotNull(s);
            int numClusteringColumns = HdfsPartitionLocationCompressor.this.numClusteringColumns_;
            if (numClusteringColumns == 0) {
                return new Pair<String, String>(s, "");
            }
            // Iterate backwards over the input until we have passed 'numClusteringColumns'
            // directories. What is left is the prefix.
            int i = s.length() - 1;
            // If the string ends in '/', iterating past it does not pass a clustering column.
            if (i >= 0 && s.charAt(i) == '/') {
                --i;
            }
            for (; numClusteringColumns > 0 && i >= 0; --i) {
                if (s.charAt(i) == '/') {
                    --numClusteringColumns;
                }
            }
            // If we successfully removed all the partition directories, s.charAt(i+1) is '/'
            // and we can include it in the prefix.
            if (0 == numClusteringColumns) {
                ++i;
            }
            return new Pair<String, String>(s.substring(0, i + 1), s.substring(i + 1));
        }
    }
}