org.apache.blur.utils.TableShardCountCollapser.java Source code


Introduction

Here is the source code for org.apache.blur.utils.TableShardCountCollapser.java, a command-line utility that reduces the number of shards in an Apache Blur table by merging shard indexes together.

Source

package org.apache.blur.utils;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import static org.apache.blur.lucene.LuceneVersionConstant.LUCENE_VERSION;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.blur.lucene.codec.Blur024Codec;
import org.apache.blur.store.hdfs.HdfsDirectory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;

/**
 * Reduces the total number of shards in a table. The main use case is a bulk
 * indexing job where the number of reducers was increased to speed up
 * indexing, but the table itself needs fewer shards. This utility safely
 * collapses the indexes together, reducing the total number of shards in the
 * table.
 * 
 * For example, if you ran 1024 reducers but want only 128 shards in the
 * table, you could execute this utility after the bulk MapReduce job
 * finishes:
 * 
 * TableShardCountCollapser <hdfs path> 128
 * 
 * The result would be 128 shards in the table path.
 * 
 */
public class TableShardCountCollapser extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        // Let ToolRunner handle generic command-line options
        int res = ToolRunner.run(new Configuration(), new TableShardCountCollapser(), args);
        System.exit(res);
    }

    private Path path;

    @Override
    public int run(String[] args) throws Exception {
        // NOTE: the table should be disabled in Blur before this tool is
        // run; no check or prompt is performed here.

        Path path = new Path(args[0]);
        int count = Integer.parseInt(args[1]);
        setTablePath(path);
        collapseShardsTo(count);
        return 0;
    }

    public boolean validateCount(int count) throws IOException {
        return getCollapsePossibilities().contains(count);
    }

    public void setTablePath(Path path) {
        this.path = path;
    }

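    // Scans the table directory for shard folders and returns the shard
    // counts this table could be collapsed to: the proper divisors of the
    // current shard count.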
    public List<Integer> getCollapsePossibilities() throws IOException {
        FileSystem fileSystem = path.getFileSystem(getConf());
        FileStatus[] listStatus = fileSystem.listStatus(path);
        SortedSet<String> shards = new TreeSet<String>();
        for (FileStatus status : listStatus) {
            Path shardPath = status.getPath();
            if (shardPath.getName().startsWith(BlurConstants.SHARD_PREFIX)) {
                shards.add(shardPath.getName());
            }
        }
        validateShards(shards);
        List<Integer> result = getFactors(shards.size());
        return result;
    }

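    // Returns the divisors of size from 1 to size - 1. Only these values are
    // valid collapse targets, because each target shard must absorb the same
    // whole number of existing shards.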
    private List<Integer> getFactors(int size) {
        List<Integer> result = new ArrayList<Integer>();
        for (int i = 1; i < size; i++) {
            if (size % i == 0) {
                result.add(i);
            }
        }
        return result;
    }

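    // Verifies that the shard directories form a complete, contiguous
    // sequence (shard 0 through shard count - 1) as named by
    // ShardUtil.getShardName; anything else indicates an invalid table
    // layout.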
    private void validateShards(SortedSet<String> shards) {
        int count = shards.size();
        for (int i = 0; i < count; i++) {
            if (!shards.contains(ShardUtil.getShardName(i))) {
                throw new RuntimeException("Invalid table");
            }
        }
    }

    public void collapseShardsTo(int newShardCount) throws IOException {
        if (!validateCount(newShardCount)) {
            throw new RuntimeException("Count [" + newShardCount + "] is not valid, valid values are ["
                    + getCollapsePossibilities() + "]");
        }

        Path[] paths = getPaths();
        int numberOfShardsToMergePerPass = paths.length / newShardCount;
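        // Round-robin assignment: target shard i absorbs the shards at
        // indexes i + newShardCount, i + 2 * newShardCount, ..., which are
        // merged into it and then deleted.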
        for (int i = 0; i < newShardCount; i++) {
            System.out.println("Base Index [" + paths[i] + "]");
            IndexWriterConfig lconf = new IndexWriterConfig(LUCENE_VERSION, new KeywordAnalyzer());
            lconf.setCodec(new Blur024Codec());
            HdfsDirectory dir = new HdfsDirectory(getConf(), paths[i]);
            IndexWriter indexWriter = new IndexWriter(dir, lconf);
            Directory[] dirs = new Directory[numberOfShardsToMergePerPass - 1];
            Path[] pathsToDelete = new Path[numberOfShardsToMergePerPass - 1];
            for (int p = 1; p < numberOfShardsToMergePerPass; p++) {
                Path pathToMerge = paths[i + p * newShardCount];
                System.out.println("Merge [" + pathToMerge + "]");
                dirs[p - 1] = new HdfsDirectory(getConf(), pathToMerge);
                pathsToDelete[p - 1] = pathToMerge;
            }
            indexWriter.addIndexes(dirs);
            // Forces a rewrite of the index so that the symlinked files are
            // merged/rewritten.
            indexWriter.forceMerge(1);
            indexWriter.close();
            FileSystem fileSystem = path.getFileSystem(getConf());
            for (Path p : pathsToDelete) {
                fileSystem.delete(p, true);
            }
        }
    }

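    // Returns the shard directories under the table path in sorted order, so
    // that the array index matches the shard number for a valid table.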
    private Path[] getPaths() throws IOException {
        FileSystem fileSystem = path.getFileSystem(getConf());
        FileStatus[] listStatus = fileSystem.listStatus(path);
        SortedSet<Path> shards = new TreeSet<Path>();
        for (FileStatus status : listStatus) {
            Path shardPath = status.getPath();
            if (shardPath.getName().startsWith(BlurConstants.SHARD_PREFIX)) {
                shards.add(shardPath);
            }
        }
        return shards.toArray(new Path[shards.size()]);
    }

}
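
Usage

The class implements Hadoop's Tool interface, so it can be driven from the command line or programmatically. Below is a minimal programmatic sketch; the table path and target shard count are placeholders, and it assumes the table has already been disabled in Blur and that the Hadoop configuration on the classpath points at the correct HDFS cluster:

package org.apache.blur.utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class CollapseExample {

    public static void main(String[] args) throws Exception {
        // Hypothetical table location; replace with your own table path.
        Path tablePath = new Path("hdfs://namenode/blur/tables/mytable");

        TableShardCountCollapser collapser = new TableShardCountCollapser();
        collapser.setConf(new Configuration());
        collapser.setTablePath(tablePath);

        // Valid targets are the divisors of the current shard count,
        // excluding the current count itself.
        System.out.println("Possible counts: " + collapser.getCollapsePossibilities());

        // Collapse to 128 shards; throws if 128 is not a valid target.
        collapser.collapseShardsTo(128);
    }
}

Equivalently, it can be run from the command line via ToolRunner (the jar name here is illustrative):

hadoop jar blur-util.jar org.apache.blur.utils.TableShardCountCollapser <hdfs path> 128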