org.eobjects.hadoopdatacleaner.mapreduce.hbase.HBaseTableReducerTest.java Source code

Java tutorial

Introduction

Here is the source code for org.eobjects.hadoopdatacleaner.mapreduce.hbase.HBaseTableReducerTest.java

Source

/**
 * DataCleaner (community edition)
 * Copyright (C) 2013 Human Inference
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.eobjects.hadoopdatacleaner.mapreduce.hbase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SortedMapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.apache.hadoop.mrunit.types.Pair;
import org.eobjects.analyzer.beans.StringAnalyzer;
import org.eobjects.analyzer.beans.transform.ConcatenatorTransformer;
import org.eobjects.analyzer.beans.transform.TokenizerTransformer;
import org.eobjects.analyzer.beans.valuedist.ValueDistributionAnalyzer;
import org.eobjects.analyzer.beans.writers.InsertIntoTableAnalyzer;
import org.eobjects.analyzer.configuration.AnalyzerBeansConfiguration;
import org.eobjects.analyzer.configuration.AnalyzerBeansConfigurationImpl;
import org.eobjects.analyzer.connection.Datastore;
import org.eobjects.analyzer.connection.DatastoreCatalog;
import org.eobjects.analyzer.connection.DatastoreCatalogImpl;
import org.eobjects.analyzer.connection.PojoDatastore;
import org.eobjects.analyzer.descriptors.Descriptors;
import org.eobjects.analyzer.descriptors.SimpleDescriptorProvider;
import org.eobjects.analyzer.job.AnalysisJob;
import org.eobjects.analyzer.job.builder.AnalysisJobBuilder;
import org.eobjects.analyzer.job.builder.AnalyzerJobBuilder;
import org.eobjects.analyzer.job.builder.TransformerJobBuilder;
import org.eobjects.hadoopdatacleaner.configuration.ConfigurationSerializer;
import org.eobjects.hadoopdatacleaner.tools.HBaseTool;
import org.apache.metamodel.pojo.ArrayTableDataProvider;
import org.apache.metamodel.pojo.TableDataProvider;
import org.apache.metamodel.util.SimpleTableDef;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * MRUnit-driven unit test for {@link HBaseTableReducer}.
 *
 * <p>
 * {@link #setUp()} builds an in-memory (POJO) datastore and an analysis job,
 * serializes both and hands them to the reducer through the driver's Hadoop
 * configuration. {@link #testReducer()} then feeds two rows to the reducer and
 * checks the {@link Put} mutations it emits.
 * </p>
 *
 * <p>
 * {@link #buildAnalyzerBeansConfiguration()} and
 * {@link #buildAnalysisJob(AnalyzerBeansConfiguration)} are public and static
 * so that they can be called without an instance of this test.
 * </p>
 */
public class HBaseTableReducerTest {

    private static final Logger logger = LoggerFactory.getLogger(HBaseTableReducerTest.class);

    private static final String DATASTORE_NAME = "countrycodes_hbase";
    private static final String SCHEMA_NAME = "countrycodes_schema";
    private static final String INPUT_TABLE_NAME = "countrycodes";
    private static final String OUTPUT_TABLE_NAME = "countrycodes_output";

    private static final String COLUMN_FAMILY = "mainFamily";
    private static final String COUNTRY_NAME_QUALIFIER = "country_name";

    // Column names are written as "<family>:<qualifier>".
    private static final String COUNTRY_NAME_COLUMN = COLUMN_FAMILY + ":" + COUNTRY_NAME_QUALIFIER;
    private static final String ISO2_COLUMN = COLUMN_FAMILY + ":iso2";
    private static final String ISO3_COLUMN = COLUMN_FAMILY + ":iso3";
    private static final String ISO2_ISO3_COLUMN = COLUMN_FAMILY + ":iso2_iso3";

    private static final String ISO_SEPARATOR = "_";

    private ReduceDriver<Text, SortedMapWritable, NullWritable, Mutation> reduceDriver;

    /**
     * Creates a fresh reduce driver around a new {@link HBaseTableReducer} and
     * stores the serialized datastores and analysis job XML in its
     * configuration under the keys defined by {@link HBaseTool}.
     */
    @Before
    public void setUp() {
        AnalyzerBeansConfiguration analyzerBeansConfiguration = buildAnalyzerBeansConfiguration();
        AnalysisJob analysisJob = buildAnalysisJob(analyzerBeansConfiguration);

        String serializedDatastores = ConfigurationSerializer
                .serializeAnalyzerBeansConfigurationDataStores(analyzerBeansConfiguration);
        String analysisJobXml = ConfigurationSerializer.serializeAnalysisJobToXml(analyzerBeansConfiguration,
                analysisJob);

        reduceDriver = ReduceDriver.newReduceDriver(new HBaseTableReducer());

        // NOTE(review): presumably the HBase serializations are needed so that
        // the driver can handle the emitted Mutation objects - confirm.
        reduceDriver.getConfiguration().set("io.serializations",
                "org.apache.hadoop.hbase.mapreduce.ResultSerialization,"
                        + "org.apache.hadoop.hbase.mapreduce.KeyValueSerialization,"
                        + "org.apache.hadoop.hbase.mapreduce.MutationSerialization,"
                        + "org.apache.hadoop.io.serializer.JavaSerialization,"
                        + "org.apache.hadoop.io.serializer.WritableSerialization");
        reduceDriver.getConfiguration().set(HBaseTool.ANALYZER_BEANS_CONFIGURATION_DATASTORES_KEY,
                serializedDatastores);
        reduceDriver.getConfiguration().set(HBaseTool.ANALYSIS_JOB_XML_KEY, analysisJobXml);
    }

    /**
     * Sends two rows (Denmark, Poland) to the reducer under a single analyzer
     * key and expects two outputs, in the same order, each being a {@link Put}
     * with exactly one country name cell holding the corresponding value.
     *
     * @throws IOException
     *             if running the reduce driver fails
     */
    @Test
    public void testReducer() throws IOException {
        List<SortedMapWritable> inputRows = new ArrayList<SortedMapWritable>();
        inputRows.add(buildInputRow("Denmark", "DK", "DNK"));
        inputRows.add(buildInputRow("Poland", "PL", "POL"));

        String inputAnalyzerKey = "Value distribution (" + COUNTRY_NAME_COLUMN + ")";

        reduceDriver.withInput(new Text(inputAnalyzerKey), inputRows);
        List<Pair<NullWritable, Mutation>> actualOutputs = reduceDriver.run();
        Assert.assertEquals(2, actualOutputs.size());

        assertCountryName("Denmark", actualOutputs.get(0));
        assertCountryName("Poland", actualOutputs.get(1));
    }

    /**
     * Builds one reducer input row containing the country name, both ISO codes
     * and their concatenation (joined by {@value #ISO_SEPARATOR}).
     *
     * @param countryName
     *            value of the country name column
     * @param iso2
     *            value of the iso2 column
     * @param iso3
     *            value of the iso3 column
     * @return the populated row
     */
    private static SortedMapWritable buildInputRow(String countryName, String iso2, String iso3) {
        SortedMapWritable row = new SortedMapWritable();
        row.put(new Text(COUNTRY_NAME_COLUMN), new Text(countryName));
        row.put(new Text(ISO2_COLUMN), new Text(iso2));
        row.put(new Text(ISO2_ISO3_COLUMN), new Text(iso2 + ISO_SEPARATOR + iso3));
        row.put(new Text(ISO3_COLUMN), new Text(iso3));
        return row;
    }

    /**
     * Asserts that the given reducer output is a {@link Put} holding exactly
     * one country name cell whose value equals the expected country name.
     *
     * @param expectedCountryName
     *            the value the cell must contain
     * @param output
     *            one key/value pair produced by the reduce driver
     */
    private static void assertCountryName(String expectedCountryName, Pair<NullWritable, Mutation> output) {
        Put put = (Put) output.getSecond();
        List<Cell> cells = put.get(Bytes.toBytes(COLUMN_FAMILY), Bytes.toBytes(COUNTRY_NAME_QUALIFIER));
        Assert.assertEquals(1, cells.size());

        String actualCountryName = Bytes.toString(CellUtil.cloneValue(cells.get(0)));
        logger.debug("Value: {}", actualCountryName);
        Assert.assertEquals(expectedCountryName, actualCountryName);
    }

    /**
     * Builds the configuration used by the test: a POJO datastore with two
     * empty tables (input and output, same three columns) plus a descriptor
     * provider with a handful of transformer and analyzer descriptors.
     *
     * @return the assembled configuration
     */
    public static AnalyzerBeansConfiguration buildAnalyzerBeansConfiguration() {
        List<TableDataProvider<?>> tableDataProviders = new ArrayList<TableDataProvider<?>>();
        tableDataProviders.add(newEmptyTable(INPUT_TABLE_NAME));
        tableDataProviders.add(newEmptyTable(OUTPUT_TABLE_NAME));
        Datastore datastore = new PojoDatastore(DATASTORE_NAME, SCHEMA_NAME, tableDataProviders);

        DatastoreCatalog datastoreCatalog = new DatastoreCatalogImpl(datastore);

        // NOTE(review): ValueDistributionAnalyzer, which buildAnalysisJob uses,
        // is not registered explicitly; presumably the "true" constructor
        // argument lets missing descriptors be resolved on demand - confirm.
        SimpleDescriptorProvider descriptorProvider = new SimpleDescriptorProvider(true);
        descriptorProvider.addTransformerBeanDescriptor(Descriptors.ofTransformer(ConcatenatorTransformer.class));
        descriptorProvider.addTransformerBeanDescriptor(Descriptors.ofTransformer(TokenizerTransformer.class));
        descriptorProvider.addAnalyzerBeanDescriptor(Descriptors.ofAnalyzer(InsertIntoTableAnalyzer.class));
        descriptorProvider.addAnalyzerBeanDescriptor(Descriptors.ofAnalyzer(StringAnalyzer.class));

        return new AnalyzerBeansConfigurationImpl().replace(datastoreCatalog).replace(descriptorProvider);
    }

    /**
     * Creates a data provider for an empty table with the country name, iso2
     * and iso3 columns. A new column-name array is allocated on every call so
     * that table definitions never share one.
     *
     * @param tableName
     *            name of the table to define
     * @return a provider backed by an empty row list
     */
    private static TableDataProvider<?> newEmptyTable(String tableName) {
        SimpleTableDef tableDef = new SimpleTableDef(tableName,
                new String[] { COUNTRY_NAME_COLUMN, ISO2_COLUMN, ISO3_COLUMN });
        return new ArrayTableDataProvider(tableDef, new ArrayList<Object[]>());
    }

    /**
     * Builds the analysis job used by the test: three source columns from the
     * input table, a concatenator joining iso2 and iso3 with
     * {@value #ISO_SEPARATOR} into an output column named
     * {@value #ISO2_ISO3_COLUMN}, and two value distribution analyzers (one on
     * the country name, one on iso2).
     *
     * @param configuration
     *            configuration containing the datastore the job reads from
     * @return the built job
     */
    public static AnalysisJob buildAnalysisJob(AnalyzerBeansConfiguration configuration) {
        AnalysisJobBuilder ajb = new AnalysisJobBuilder(configuration);
        try {
            ajb.setDatastore(DATASTORE_NAME);

            String columnPathPrefix = SCHEMA_NAME + "." + INPUT_TABLE_NAME + ".";
            ajb.addSourceColumns(columnPathPrefix + COUNTRY_NAME_COLUMN, columnPathPrefix + ISO2_COLUMN,
                    columnPathPrefix + ISO3_COLUMN);

            TransformerJobBuilder<ConcatenatorTransformer> concatenator = ajb
                    .addTransformer(ConcatenatorTransformer.class);
            concatenator.addInputColumns(ajb.getSourceColumnByName(ISO2_COLUMN));
            concatenator.addInputColumns(ajb.getSourceColumnByName(ISO3_COLUMN));
            concatenator.setConfiguredProperty("Separator", ISO_SEPARATOR);
            concatenator.getOutputColumns().get(0).setName(ISO2_ISO3_COLUMN);

            AnalyzerJobBuilder<ValueDistributionAnalyzer> countryNameDistribution = ajb
                    .addAnalyzer(ValueDistributionAnalyzer.class);
            countryNameDistribution.addInputColumn(ajb.getSourceColumnByName(COUNTRY_NAME_COLUMN));

            AnalyzerJobBuilder<ValueDistributionAnalyzer> iso2Distribution = ajb
                    .addAnalyzer(ValueDistributionAnalyzer.class);
            iso2Distribution.addInputColumn(ajb.getSourceColumnByName(ISO2_COLUMN));

            return ajb.toAnalysisJob();
        } finally {
            // Close the builder whether or not the job could be built.
            ajb.close();
        }
    }

}