Java tutorial: NotaQL's Spark-based HBase engine evaluator

The listing below is HBaseEngineEvaluator from the NotaQL project (package notaql.engines.hbase). It parses HBase-specific input and output paths, reads a table into a Spark RDD, evaluates a NotaQL transformation over it, and stores the result back into HBase.
/*
 * Copyright 2015 by Thomas Lottermann
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package notaql.engines.hbase;

import notaql.NotaQL;
import notaql.datamodel.*;
import notaql.engines.Engine;
import notaql.engines.EngineEvaluator;
import notaql.engines.hbase.datamodel.ValueConverter;
import notaql.engines.hbase.model.vdata.ColCountFunctionVData;
import notaql.engines.hbase.parser.path.HBaseInputPathParser;
import notaql.engines.hbase.parser.path.HBaseOutputPathParser;
import notaql.engines.hbase.parser.path.HBaseOutputPathVisitor;
import notaql.evaluation.SparkTransformationEvaluator;
import notaql.model.EvaluationException;
import notaql.model.Transformation;
import notaql.model.vdata.ConstructorVData;
import notaql.model.vdata.FunctionVData;
import notaql.parser.TransformationParser;
import notaql.parser.path.InputPathParser;
import notaql.parser.path.OutputPathParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapred.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.JobConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;
import java.util.stream.Collectors;

/**
 * This is the Spark implementation of HBase evaluation.
 * It uses the Hadoop connector of HBase in order to read and write data.
 *
 * NOTE: the assumption is that all input values are stored as strings.
 *
 * TODO: Allow more input formats. This could be done by introducing cast functions.
 */
public class HBaseEngineEvaluator implements EngineEvaluator {
    private final Engine engine;
    private final String tableId;
    private final boolean allDefault;

    private final static Logger logger = Logger.getLogger(HBaseEngineEvaluator.class.getName());

    private final TransformationParser parser;

    public HBaseEngineEvaluator(Engine engine, TransformationParser parser, Map<String, AtomValue<?>> params) {
        this.engine = engine;
        this.parser = parser;

        if (!params.keySet().contains("table_id"))
            throw new EvaluationException(
                    "HBase engine expects the following parameters on initialization: "
                            + engine.getArguments().stream().collect(Collectors.joining(", ")));

        this.tableId = params.get("table_id").getValue().toString();
        this.allDefault = (Boolean) params.getOrDefault("all_default", new BooleanValue(false)).getValue();
    }

    @Override
    public InputPathParser getInputPathParser() {
        return new HBaseInputPathParser(parser);
    }

    @Override
    public OutputPathParser getOutputPathParser() {
        return new HBaseOutputPathParser(parser);
    }

    @Override
    public JavaRDD<ObjectValue> evaluate(Transformation transformation) {
        final SparkTransformationEvaluator evaluator = new SparkTransformationEvaluator(transformation);

        JavaSparkContext sc = NotaQL.SparkFactory.getSparkContext();

        final Configuration conf = createConf();
        conf.set(TableInputFormat.INPUT_TABLE, tableId);

        final JavaPairRDD<ImmutableBytesWritable, Result> inputRDD = sc.newAPIHadoopRDD(
                conf, TableInputFormat.class, ImmutableBytesWritable.class,
                org.apache.hadoop.hbase.client.Result.class);

        // convert all rows in the RDD to the inner format
        final JavaRDD<Value> converted = inputRDD.map(t -> ValueConverter.convertToNotaQL(t._2));

        // filter the ones not fulfilling the input filter
        final JavaRDD<Value> filtered = converted.filter(v -> transformation.satisfiesInPredicate((ObjectValue) v));

        // process all input
        return evaluator.process(filtered);
    }

    @Override
    public void store(JavaRDD<ObjectValue> result) {
        if (NotaQL.prop.getProperty("log_output") != null && NotaQL.prop.getProperty("log_output").equals("true")) {
            final List<ObjectValue> collect = result.collect();
            collect.stream().forEach(t -> logger.info("Storing object: " + t.toString()));
        } else {
            logger.info("Storing objects.");
        }

        final Configuration conf = createConf();

        // create the table if it doesn't exist yet
        final HBaseAdmin admin;
        try {
            admin = new HBaseAdmin(conf);
            if (!admin.tableExists(tableId)) {
                logger.info("Table " + tableId + " does not exist yet.");
                final HTableDescriptor descriptor = new HTableDescriptor(TableName.valueOf(tableId));
                descriptor.addFamily(new HColumnDescriptor(HBaseOutputPathVisitor.DEFAULT_COL_FAMILY));
                admin.createTable(descriptor);
            }

            if (!allDefault) {
                final HTable table = new HTable(conf, tableId);
                final List<String> families = result.flatMap(ObjectValue::keySet).map(Step::getStep)
                        .filter(s -> !s.equals("_id") && !s.equals(HBaseOutputPathVisitor.DEFAULT_COL_FAMILY))
                        .distinct().collect();

                admin.disableTable(table.getTableName());
                families.forEach(s -> createFamily(s, table));
                admin.enableTable(table.getTableName());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        logger.info("Storing result.");

        JavaSparkContext sc = NotaQL.SparkFactory.getSparkContext();

        final JobConf jobConf = new JobConf(conf, this.getClass());
        jobConf.setOutputFormat(TableOutputFormat.class);
        jobConf.set(TableOutputFormat.OUTPUT_TABLE, tableId);

        JavaPairRDD<ImmutableBytesWritable, Put> output = result.filter(HBaseEngineEvaluator::hasCols)
                .mapToPair(o -> new Tuple2<>(new ImmutableBytesWritable(),
                        ValueConverter.convertFromNotaQL(o)));

        output.saveAsHadoopDataset(jobConf);

        logger.info("Stored result.");
    }

    private static boolean hasCols(ObjectValue o) {
        return o.toMap().entrySet().stream().filter(e -> e.getValue() instanceof ObjectValue)
                .anyMatch(e -> ((ObjectValue) e.getValue()).size() > 0);
    }

    public static void createFamily(String family, HTable table) {
        try {
            final HBaseAdmin admin = new HBaseAdmin(table.getConfiguration());

            // check if the column family exists
            boolean exists = false;
            for (HColumnDescriptor familyDescriptor : table.getTableDescriptor().getFamilies()) {
                if (Bytes.toString(familyDescriptor.getName()).equals(family)) {
                    exists = true;
                    break;
                }
            }

            // if not: add it
            if (!exists) {
                admin.addColumn(table.getTableName(), new HColumnDescriptor(family));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private Configuration createConf() {
        final String host = NotaQL.prop.getProperty("hbase_host", "localhost");

        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", host);
        conf.set("hbase.master", host + ":60000");
        return conf;
    }

    @Override
    public ConstructorVData getConstructor(String name) {
        return null;
    }

    @Override
    public FunctionVData getFunction(String name) {
        if (name.equals("COL_COUNT"))
            return new ColCountFunctionVData();
        return null;
    }
}
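The part worth lifting out of this class is the Spark/HBase wiring: evaluate() turns a table into a JavaPairRDD of (row key, Result) pairs via TableInputFormat, and store() writes Put objects back through TableOutputFormat with saveAsHadoopDataset. Below is a minimal standalone sketch of just the read path, with spark-core and matching HBase client artifacts assumed to be on the classpath. It assumes a local HBase whose ZooKeeper quorum is reachable at localhost; the class name ReadTableSketch, the table name "demo", and the printed summary are illustrative only and not part of NotaQL.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ReadTableSketch {
    public static void main(String[] args) {
        // hypothetical local setup: Spark in local mode, ZooKeeper on localhost
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("read-table-sketch").setMaster("local[*]"));

        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "localhost");
        conf.set(TableInputFormat.INPUT_TABLE, "demo"); // hypothetical table name

        // same call as in HBaseEngineEvaluator.evaluate():
        // one (row key, Result) pair per HBase row
        JavaPairRDD<ImmutableBytesWritable, Result> rows = sc.newAPIHadoopRDD(
                conf, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);

        // pull out the row keys as strings and count the rows
        JavaRDD<String> keys = rows.map(t -> Bytes.toString(t._1().get()));
        System.out.println("rows in demo: " + keys.count());

        sc.stop();
    }
}

The write path in store() is the mirror image: it builds a JobConf with TableOutputFormat as the output format and TableOutputFormat.OUTPUT_TABLE set to the target table, maps each result object to an (ImmutableBytesWritable, Put) pair, and calls saveAsHadoopDataset. Note that the class mixes the old mapred TableOutputFormat for writing with the new mapreduce TableInputFormat for reading; that split matches Spark's two Hadoop APIs, since saveAsHadoopDataset(JobConf) uses the old mapred interfaces while newAPIHadoopRDD uses the new mapreduce ones.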