StorageEngineClient.HashMultiFileColumnStorageInputFormat.java Source code

Introduction

Here is the source code for StorageEngineClient.HashMultiFileColumnStorageInputFormat.java
Source

/**
* Tencent is pleased to support the open source community by making TDW available.
* Copyright (C) 2014 THL A29 Limited, a Tencent company. All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use 
* this file except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed 
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
* OF ANY KIND, either express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package StorageEngineClient;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MultiFileInputFormat;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

import FormatStorage1.IRecord;

/**
 * Input format that bundles every selected input file of a job into a single
 * {@link MultiFormatStorageSplit} read by a {@link MultiColumnStorageRecordReader}.
 *
 * <p>Input files are grouped by the part of their path that precedes the last
 * {@code "_idx"} marker. For each group only the largest file is consulted, and the
 * group's key (the path prefix, not the file itself) is what ends up in the split.
 *
 * @param <K> type parameter forwarded to {@link MultiColumnStorageRecordReader}
 * @param <V> type parameter forwarded to {@link MultiColumnStorageRecordReader}
 */
@SuppressWarnings("deprecation")
public class HashMultiFileColumnStorageInputFormat<K, V> extends MultiFileInputFormat<LongWritable, IRecord> {
    public static final Log LOG = LogFactory.getLog(HashMultiFileColumnStorageInputFormat.class);

    /** Marker whose last occurrence in a path separates the grouping key from the rest. */
    private static final String INDEX_MARKER = "_idx";

    public HashMultiFileColumnStorageInputFormat() {
    }

    /**
     * Creates the record reader for a split produced by {@link #getSplits(JobConf, int)}.
     *
     * @param split    the split to read; must be a {@link MultiFormatStorageSplit}
     * @param job      job configuration handed to the reader
     * @param reporter receives {@code split.toString()} as its status
     * @return a new {@link MultiColumnStorageRecordReader} over the split
     * @throws IOException declared by the interface; propagated from the reader's construction
     * @throws ClassCastException if {@code split} is not a {@link MultiFormatStorageSplit}
     */
    public RecordReader<LongWritable, IRecord> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException {
        reporter.setStatus(split.toString());

        return new MultiColumnStorageRecordReader<K, V>(job, (MultiFormatStorageSplit) split);
    }

    /**
     * Builds exactly one split containing one path per file group.
     *
     * <p>The split's hosts are those of the block at offset 0 of the first group file
     * that reports any block locations. If no file reports block locations (for
     * example, every candidate is empty) the split is still returned with all paths
     * and an empty host list, so no input is dropped.
     *
     * @param job       job configuration used to list the input files
     * @param numSplits ignored; this format always returns a single split
     * @return a one-element {@code MultiFormatStorageSplit[]}; the element has no paths
     *         and no hosts when there is no usable input
     * @throws IOException if listing the input or querying block locations fails
     */
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        HashMap<String, FileStatus> files = selectLargestFilePerKey(listStatus(job));

        List<Path> paths = new ArrayList<Path>(files.size());
        BlockLocation[] blkLocations = null;
        for (String filekey : files.keySet()) {
            paths.add(new Path(filekey));

            // Only one file is needed for locality, but keep probing until a file
            // actually reports block locations: an empty file yields none.
            if (blkLocations == null || blkLocations.length == 0) {
                FileStatus file = files.get(filekey);
                FileSystem fs = file.getPath().getFileSystem(job);
                blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
            }
        }

        if (paths.isEmpty()) {
            return new MultiFormatStorageSplit[] { new MultiFormatStorageSplit(new Path[0], new String[0]) };
        }

        String[] hosts;
        if (blkLocations == null || blkLocations.length == 0) {
            // No locality information available; keep the paths and let the split run anywhere.
            hosts = new String[0];
        } else {
            hosts = blkLocations[getBlockIndex(blkLocations, 0)].getHosts();
        }

        MultiFormatStorageSplit[] result = {
            new MultiFormatStorageSplit(paths.toArray(new Path[paths.size()]), hosts)
        };

        LOG.info("Total # of splits: " + result.length);
        return result;
    }

    /**
     * Groups files by the path prefix preceding the last {@code "_idx"} marker and
     * keeps, for each prefix, the file with the greatest length (the first one seen
     * wins on ties). Files whose path does not contain the marker cannot be assigned
     * a key and are skipped with a warning.
     */
    private static HashMap<String, FileStatus> selectLargestFilePerKey(FileStatus[] statuses) {
        HashMap<String, FileStatus> largest = new HashMap<String, FileStatus>();
        for (FileStatus candidate : statuses) {
            String filestr = candidate.getPath().toString();
            int markerPos = filestr.lastIndexOf(INDEX_MARKER);
            if (markerPos < 0) {
                // substring(0, -1) would throw; such a file does not follow the expected naming.
                LOG.warn("Skipping input file without '" + INDEX_MARKER + "' marker: " + filestr);
                continue;
            }

            String filekey = filestr.substring(0, markerPos);
            FileStatus current = largest.get(filekey);
            if (current == null || candidate.getLen() > current.getLen()) {
                largest.put(filekey, candidate);
            }
        }
        return largest;
    }
}