org.apache.carbondata.hive.MapredCarbonInputFormat.java Source code


Introduction

Here is the source code for org.apache.carbondata.hive.MapredCarbonInputFormat.java. This class is CarbonData's input format for Hive: it adapts the mapreduce-based CarbonTableInputFormat to the old mapred InputFormat API that Hive's execution engine uses, wraps Carbon splits as CarbonHiveInputSplits, and hands each split to a CarbonHiveRecordReader.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.carbondata.hive;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.carbondata.common.logging.LogServiceFactory;
import org.apache.carbondata.core.exception.InvalidConfigurationException;
import org.apache.carbondata.core.indexstore.BlockletDetailInfo;
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
import org.apache.carbondata.core.metadata.schema.SchemaReader;
import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
import org.apache.carbondata.core.scan.model.QueryModel;
import org.apache.carbondata.core.scan.model.QueryModelBuilder;
import org.apache.carbondata.core.util.DataTypeConverterImpl;
import org.apache.carbondata.core.util.ObjectSerializationUtil;
import org.apache.carbondata.hadoop.CarbonInputSplit;
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;
import org.apache.carbondata.hadoop.readsupport.CarbonReadSupport;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.InvalidPathException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;

/**
 * Hive-facing input format for CarbonData tables. Adapts the mapreduce-based
 * CarbonTableInputFormat to the mapred API used by Hive, and opts out of
 * Hive's split combining so Carbon's per-split blocklet metadata survives.
 */
public class MapredCarbonInputFormat extends CarbonTableInputFormat<ArrayWritable>
        implements InputFormat<Void, ArrayWritable>, CombineHiveInputFormat.AvoidSplitCombination {
    private static final String CARBON_TABLE = "mapreduce.input.carboninputformat.table";

    private static final Logger LOGGER = LogServiceFactory
            .getLogService(MapredCarbonInputFormat.class.getCanonicalName());

    /**
     * Reads the table schema from the physical schema file and caches the resulting
     * CarbonTable in the configuration under CARBON_TABLE.
     *
     * @param configuration job configuration carrying the input directories
     * @param paths         path of the split being read, used to select the matching
     *                      input directory (and hence the table store location)
     * @throws IOException                   if the schema file cannot be read
     * @throws InvalidConfigurationException if the database or table name cannot be
     *                                       resolved from the configuration
     */
    private static void populateCarbonTable(Configuration configuration, String paths)
            throws IOException, InvalidConfigurationException {
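        // match the incoming split path against the job's input directories; the
        // "file:" scheme prefix is stripped so local paths compare correctly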
        String dirs = configuration.get(INPUT_DIR, "");
        String[] inputPaths = StringUtils.split(dirs);
        if (inputPaths.length == 0) {
            throw new InvalidPathException("No input paths specified in job");
        }
        String validInputPath = null;
        if (paths != null) {
            for (String inputPath : inputPaths) {
                if (paths.startsWith(inputPath.replace("file:", ""))) {
                    validInputPath = inputPath;
                    break;
                }
            }
        }
        if (null != validInputPath) {
            AbsoluteTableIdentifier absoluteTableIdentifier = AbsoluteTableIdentifier.from(validInputPath,
                    getDatabaseName(configuration), getTableName(configuration));
            // read the schema file to get the absoluteTableIdentifier having the correct table id
            // persisted in the schema
            CarbonTable carbonTable = SchemaReader.readCarbonTableFromStore(absoluteTableIdentifier);
            configuration.set(CARBON_TABLE, ObjectSerializationUtil.convertObjectToString(carbonTable));
            setTableInfo(configuration, carbonTable.getTableInfo());
        } else {
            throw new InvalidPathException("No configured input path matches " + paths);
        }
    }

    private static CarbonTable getCarbonTable(Configuration configuration, String path)
            throws IOException, InvalidConfigurationException {
        populateCarbonTable(configuration, path);
        // populateCarbonTable cached the serialized table it read from the schema file
        String carbonTableStr = configuration.get(CARBON_TABLE);
        return (CarbonTable) ObjectSerializationUtil.convertStringToObject(carbonTableStr);
    }

    @Override
    public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
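        // compute splits via the new-API CarbonTableInputFormat, then wrap each one
        // in a mapred-compatible CarbonHiveInputSplit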
        org.apache.hadoop.mapreduce.JobContext jobContext = Job.getInstance(jobConf);
        List<org.apache.hadoop.mapreduce.InputSplit> splitList = super.getSplits(jobContext);
        InputSplit[] splits = new InputSplit[splitList.size()];
        CarbonInputSplit split;
        for (int i = 0; i < splitList.size(); i++) {
            split = (CarbonInputSplit) splitList.get(i);
            CarbonHiveInputSplit inputSplit = new CarbonHiveInputSplit(split.getSegmentId(), split.getPath(),
                    split.getStart(), split.getLength(), split.getLocations(), split.getNumberOfBlocklets(),
                    split.getVersion(), split.getBlockStorageIdMap());
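            // attach detail info so the reader can locate the block footer without
            // re-reading file metadata; min/max pruning is disabled for Hive reads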
            BlockletDetailInfo info = new BlockletDetailInfo();
            info.setBlockSize(split.getLength());
            info.setBlockFooterOffset(split.getDetailInfo().getBlockFooterOffset());
            info.setVersionNumber(split.getVersion().number());
            info.setUseMinMaxForPruning(false);
            inputSplit.setDetailInfo(info);
            splits[i] = inputSplit;
        }
        return splits;
    }

    @Override
    public RecordReader<Void, ArrayWritable> getRecordReader(InputSplit inputSplit, JobConf jobConf,
            Reporter reporter) throws IOException {
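        // the split path identifies which configured input directory (and thus which
        // table store) this reader should load the schema from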
        String path = null;
        if (inputSplit instanceof CarbonHiveInputSplit) {
            path = ((CarbonHiveInputSplit) inputSplit).getPath().toString();
        }
        QueryModel queryModel;
        try {
            queryModel = getQueryModel(jobConf, path);
        } catch (InvalidConfigurationException e) {
            LOGGER.error("Failed to create record reader: " + e.getMessage(), e);
            // surface the failure instead of returning null, which would NPE in callers
            throw new IOException(e);
        }
        CarbonReadSupport<ArrayWritable> readSupport = new CarbonDictionaryDecodeReadSupport<>();
        return new CarbonHiveRecordReader(queryModel, readSupport, inputSplit, jobConf);
    }

    private QueryModel getQueryModel(Configuration configuration, String path)
            throws IOException, InvalidConfigurationException {
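        // rebuild the CarbonTable from the schema persisted in the store, then build
        // a query model from Hive's projection and any pushed-down filter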
        CarbonTable carbonTable = getCarbonTable(configuration, path);
        AbsoluteTableIdentifier identifier = carbonTable.getAbsoluteTableIdentifier();
        String projectionString = getProjection(configuration, carbonTable,
                identifier.getCarbonTableIdentifier().getTableName());
        String[] projectionColumns = projectionString.split(",");
        return new QueryModelBuilder(carbonTable)
                .projectColumns(projectionColumns)
                .filterExpression(getFilterPredicates(configuration))
                .dataConverter(new DataTypeConverterImpl())
                .build();
    }

    /**
     * Returns the projection (a comma-separated column list) for the Carbon query.
     *
     * @param configuration job configuration, possibly carrying Hive's read-column list
     * @param carbonTable   table whose schema validates the requested columns
     * @param tableName     name of the table being queried
     * @return the validated projection, or all columns when none is requested
     */
    private String getProjection(Configuration configuration, CarbonTable carbonTable, String tableName) {
        // query plan includes projection column
        String projection = getColumnProjection(configuration);
        if (projection == null) {
            projection = configuration.get("hive.io.file.readcolumn.names");
        }
        List<CarbonColumn> carbonColumns = carbonTable.getCreateOrderColumn(tableName);
        List<String> carbonColumnNames = new ArrayList<>();
        StringBuilder allColumns = new StringBuilder();
        StringBuilder projectionColumns = new StringBuilder();
        for (CarbonColumn column : carbonColumns) {
            carbonColumnNames.add(column.getColName().toLowerCase());
            allColumns.append(column.getColName()).append(',');
        }

        if (projection != null && !projection.isEmpty()) {
            String[] columnNames = projection.split(",");
            // keep only the requested columns that actually exist in the table
            for (String col : columnNames) {
                if (carbonColumnNames.contains(col.toLowerCase())) {
                    projectionColumns.append(col).append(',');
                }
            }
            return projectionColumns.substring(0, projectionColumns.lastIndexOf(","));
        } else {
            // no projection requested (e.g. SELECT *): return all columns in create order
            return allColumns.substring(0, allColumns.lastIndexOf(","));
        }
    }

    @Override
    public boolean shouldSkipCombine(Path path, Configuration conf) throws IOException {
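        // Carbon splits carry per-split blocklet metadata, so Hive must not combine them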
        return true;
    }
}
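
Usage

The sketch below shows, at the old mapred API level, how a client could drive this input format end to end. It is a minimal, hypothetical example: the warehouse path and column names are placeholders, and it assumes the Carbon-specific table properties that getDatabaseName() and getTableName() read are already present in the JobConf, as Hive's Carbon storage handler would normally arrange.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

import org.apache.carbondata.hive.MapredCarbonInputFormat;

public class CarbonHiveReadSketch {
    public static void main(String[] args) throws Exception {
        JobConf jobConf = new JobConf();
        // hypothetical table location; Hive normally supplies this
        FileInputFormat.setInputPaths(jobConf,
                new Path("/user/hive/warehouse/carbon_db.db/sample_table"));
        // the column list Hive would project (hypothetical column names)
        jobConf.set("hive.io.file.readcolumn.names", "id,name");

        MapredCarbonInputFormat inputFormat = new MapredCarbonInputFormat();
        InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
        for (InputSplit split : splits) {
            RecordReader<Void, ArrayWritable> reader =
                    inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
            Void key = reader.createKey();
            ArrayWritable value = reader.createValue();
            while (reader.next(key, value)) {
                // each value holds one row, one Writable per projected column
                System.out.println(value);
            }
            reader.close();
        }
    }
}

Because shouldSkipCombine() returns true, Hive's CombineHiveInputFormat leaves these splits alone, so each CarbonHiveInputSplit is read by its own CarbonHiveRecordReader.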