com.cloudera.cdk.morphline.hadoop.rcfile.ReadRCFileTest.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.cdk.morphline.hadoop.rcfile.ReadRCFileTest.java.

Source

/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.cdk.morphline.hadoop.rcfile;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import com.cloudera.cdk.morphline.api.AbstractMorphlineTest;
import com.cloudera.cdk.morphline.api.Record;
import com.cloudera.cdk.morphline.base.Fields;
import com.google.common.collect.Lists;
import com.google.common.io.ByteArrayDataOutput;
import com.google.common.io.ByteStreams;
import com.google.common.io.Files;

public class ReadRCFileTest extends AbstractMorphlineTest {
    private static MiniDFSCluster cluster = null;
    private static FileSystem dfs = null;
    private Path testDirectory;
    private static final int NUM_RECORDS = 5;
    private static final int NUM_COLUMNS = 5;

    @BeforeClass
    public static void setupFS() throws IOException {
        final Configuration conf = new Configuration();
        cluster = new MiniDFSCluster.Builder(conf).build();
        dfs = cluster.getFileSystem();
    }

    @AfterClass
    public static void teardownFS() throws IOException {
        dfs = null;
        if (cluster != null) {
            cluster.shutdown();
            cluster = null;
        }
    }

    @Before
    public void setUp() throws Exception {
        super.setUp();
        testDirectory = new Path(Files.createTempDir().getAbsolutePath());
    }

    @After
    public void tearDown() throws Exception {
        super.tearDown();
        dfs.delete(testDirectory, true);
    }

    @Test
    public void testRCFileRowWise() throws Exception {
        morphline = createMorphline("test-morphlines/rcFileMorphlineRow");
        String rcFileName = "testRCFileRowWise.rc";
        List<Record> expected = setupRCFile(rcFileName, NUM_RECORDS, NUM_COLUMNS, true);
        Path inputFile = dfs.makeQualified(new Path(testDirectory, rcFileName));
        Record input = new Record();
        input.put(Fields.ATTACHMENT_NAME, inputFile.toString());
        input.put(Fields.ATTACHMENT_BODY, readPath(inputFile));
        startSession();
        assertEquals(1, collector.getNumStartEvents());
        assertTrue(morphline.process(input));
        assertTrue(areFieldsEqual(expected, collector.getRecords(), NUM_COLUMNS, NUM_RECORDS, true));
    }

    @Test
    public void testRCFileColumnWise() throws Exception {
        morphline = createMorphline("test-morphlines/rcFileMorphlineColumn");
        String rcFileName = "testRCFileColumnWise.rc";
        List<Record> expected = setupRCFile(rcFileName, NUM_RECORDS, NUM_COLUMNS, false);
        Path inputFile = dfs.makeQualified(new Path(testDirectory, rcFileName));
        Record input = new Record();
        input.put(Fields.ATTACHMENT_NAME, inputFile.toString());
        input.put(Fields.ATTACHMENT_BODY, readPath(inputFile));
        startSession();
        assertEquals(1, collector.getNumStartEvents());
        assertTrue(morphline.process(input));
        assertTrue(areFieldsEqual(expected, collector.getRecords(), NUM_COLUMNS, NUM_RECORDS, false));
    }

    private void createRCFile(final String fileName, final int numRecords, final int maxColumns)
            throws IOException {
        // Write the sequence file
        SequenceFile.Metadata metadata = getMetadataForRCFile();
        Configuration conf = new Configuration();
        conf.set(RCFile.COLUMN_NUMBER_CONF_STR, String.valueOf(maxColumns));
        Path inputFile = dfs.makeQualified(new Path(testDirectory, fileName));
        RCFile.Writer rcFileWriter = new RCFile.Writer(dfs, conf, inputFile, null, metadata, null);
        for (int row = 0; row < numRecords; row++) {
            BytesRefArrayWritable dataWrite = new BytesRefArrayWritable(maxColumns);
            dataWrite.resetValid(maxColumns);
            for (int column = 0; column < maxColumns; column++) {
                Text sampleText = new Text("ROW-NUM:" + row + ", COLUMN-NUM:" + column);
                ByteArrayDataOutput dataOutput = ByteStreams.newDataOutput();
                sampleText.write(dataOutput);
                dataWrite.set(column, new BytesRefWritable(dataOutput.toByteArray()));
            }
            rcFileWriter.append(dataWrite);
        }
        rcFileWriter.close();
    }

    private InputStream readPath(final Path inputFile) throws IOException {
        FileSystem fs = inputFile.getFileSystem(new Configuration());
        return fs.open(inputFile);
    }

    private List<Record> setupRCFile(final String fileName, final int numRecords, final int maxColumns,
            final boolean rowWise) throws IOException {
        createRCFile(fileName, numRecords, maxColumns);
        List<Record> expected = Lists.newArrayList();
        if (rowWise) {
            // Row wise expected records
            for (int row = 0; row < numRecords; row++) {
                Record record = new Record();
                for (int column = 0; column < maxColumns; column++) {
                    Text sampleText = new Text("ROW-NUM:" + row + ", COLUMN-NUM:" + column);
                    record.put("field" + (column + 1), sampleText);
                }
                expected.add(record);
            }
        } else {
            // Column wise expected records
            for (int column = 0; column < maxColumns; column++) {
                for (int row = 0; row < numRecords; row++) {
                    Record record = new Record();
                    Text sampleText = new Text("ROW-NUM:" + row + ", COLUMN-NUM:" + column);
                    record.put("field" + (column + 1), sampleText);
                    expected.add(record);
                }
            }
        }
        return expected;
    }

    private SequenceFile.Metadata getMetadataForRCFile() {
        return RCFile.createMetadata(new Text("metaField"), new Text("metaValue"));
    }

    private boolean areFieldsEqual(List<Record> expected, List<Record> actual, final int columnSize,
            final int rowSize, final boolean rowWiseCheck) {
        if (expected.size() != actual.size()) {
            return false;
        }

        if (rowWiseCheck) {
            for (int i = 0; i < actual.size(); i++) {
                Record currentExpected = expected.get(i);
                Record currentActual = actual.get(i);
                if (!areRecordColumnsEqual(currentActual, currentExpected, columnSize)) {
                    return false;
                }
            }
        } else {
            for (int i = 0; i < columnSize; i++) {
                String fieldName = "field" + (i + 1);
                for (int j = 0; j < rowSize; j++) {
                    Record currentExpected = expected.get((i * rowSize) + j);
                    Record currentActual = actual.get((i * rowSize) + j);
                    if (!isRecordColumnEqual(currentActual, currentExpected, fieldName)) {
                        return false;
                    }
                }
            }
        }
        return true;
    }

    private boolean areRecordColumnsEqual(final Record actual, final Record expected, final int columnSize) {
        for (int i = 0; i < columnSize; i++) {
            String fieldName = "field" + (i + 1);
            if (!isRecordColumnEqual(actual, expected, fieldName)) {
                return false;
            }
        }
        return true;
    }

    private boolean isRecordColumnEqual(final Record actual, final Record expected, final String fieldName) {
        return actual.get(fieldName).equals(expected.get(fieldName));
    }

}