org.apache.blur.mapreduce.lib.BlurOutputFormatTest.java Source code

Introduction

Here is the source code for org.apache.blur.mapreduce.lib.BlurOutputFormatTest.java, a JUnit suite that exercises BlurOutputFormat by running MapReduce indexing jobs against an in-process Hadoop MiniCluster.
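
The class stands up an in-process HDFS and MapReduce cluster, writes small CSV input files, runs indexing jobs through BlurOutputFormat, and then opens the resulting Lucene indexes with DirectoryReader to verify document counts. Every test repeats the same job wiring, sketched here in isolation; conf, input, tablePath, and output are placeholders for values each test builds for itself:

    Job job = Job.getInstance(conf, "blur index");
    job.setJarByClass(BlurOutputFormatTest.class);
    job.setMapperClass(CsvBlurMapper.class);
    job.setInputFormatClass(TextInputFormat.class);

    FileInputFormat.addInputPath(job, input);
    CsvBlurMapper.addColumns(job, "cf1", "col");

    TableDescriptor tableDescriptor = new TableDescriptor();
    tableDescriptor.setShardCount(1);
    tableDescriptor.setTableUri(tablePath.toString());
    tableDescriptor.setName("test");

    BlurOutputFormat.setupJob(job, tableDescriptor);
    BlurOutputFormat.setOutputPath(job, output);
    assertTrue(job.waitForCompletion(true));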

Source

package org.apache.blur.mapreduce.lib;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.Collection;
import java.util.TreeSet;

import org.apache.blur.MiniCluster;
import org.apache.blur.server.TableContext;
import org.apache.blur.store.buffer.BufferStore;
import org.apache.blur.store.hdfs.HdfsDirectory;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.blur.utils.JavaHome;
import org.apache.blur.utils.ShardUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.lucene.index.DirectoryReader;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

public class BlurOutputFormatTest {

    private static Configuration _conf = new Configuration();
    private static FileSystem _fileSystem;
    private static MiniCluster _miniCluster;

    private static Path _root;

    @BeforeClass
    public static void setupTest() throws Exception {
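        // Stand up an in-process HDFS instance and MR mini cluster under
        // ./target/tmp, then initialize the shared BufferStore.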
        JavaHome.checkJavaHome();
        File file = new File("./target/tmp/BlurOutputFormatTest_tmp");
        String pathStr = file.getAbsoluteFile().toURI().toString();
        String hdfsPath = pathStr + "/hdfs";
        System.setProperty("test.build.data", hdfsPath);
        System.setProperty("hadoop.log.dir", pathStr + "/hadoop_log");

        _miniCluster = new MiniCluster();
        _miniCluster.startDfs(hdfsPath);
        _fileSystem = _miniCluster.getFileSystem();
        _root = new Path(_fileSystem.getUri() + "/testroot");
        _miniCluster.startMrMiniCluster();
        _conf = _miniCluster.getMRConfiguration();

        BufferStore.initNewBuffer(128, 128 * 128);
    }

    @AfterClass
    public static void teardown() throws IOException {
        if (_miniCluster != null) {
            _miniCluster.stopMrMiniCluster();
            _miniCluster.shutdownDfs();
        }
        rm(new File("build"));
    }

    private static void rm(File file) {
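        // Recursively delete a local file or directory tree.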
        if (!file.exists()) {
            return;
        }
        if (file.isDirectory()) {
            for (File f : file.listFiles()) {
                rm(f);
            }
        }
        file.delete();
    }

    @Before
    public void setup() {
        TableContext.clear();
    }

    @Test
    public void testBlurOutputFormat() throws IOException, InterruptedException, ClassNotFoundException {
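        // Index two single-record rows from two input files and verify that the
        // lone shard ends up with exactly one committed task and two documents.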
        Path input = getInDir();
        Path output = getOutDir();
        _fileSystem.delete(input, true);
        _fileSystem.delete(output, true);
        writeRecordsFile(new Path(input, "part1"), 1, 1, 1, 1, "cf1");
        writeRecordsFile(new Path(input, "part2"), 1, 1, 2, 1, "cf1");

        Job job = Job.getInstance(_conf, "blur index");
        job.setJarByClass(BlurOutputFormatTest.class);
        job.setMapperClass(CsvBlurMapper.class);
        job.setInputFormatClass(TextInputFormat.class);

        FileInputFormat.addInputPath(job, input);
        CsvBlurMapper.addColumns(job, "cf1", "col");

        Path tablePath = new Path(new Path(_root, "table"), "test");

        TableDescriptor tableDescriptor = new TableDescriptor();
        tableDescriptor.setShardCount(1);
        tableDescriptor.setTableUri(tablePath.toString());
        tableDescriptor.setName("test");

        createShardDirectories(tablePath, 1);

        BlurOutputFormat.setupJob(job, tableDescriptor);
        BlurOutputFormat.setOutputPath(job, output);

        assertTrue(job.waitForCompletion(true));
        Counters ctrs = job.getCounters();
        System.out.println("Counters: " + ctrs);

        Path path = new Path(output, ShardUtil.getShardName(0));
        dump(path, _conf);
        Collection<Path> commitedTasks = getCommitedTasks(path);
        assertEquals(1, commitedTasks.size());
        DirectoryReader reader = DirectoryReader.open(new HdfsDirectory(_conf, commitedTasks.iterator().next()));
        assertEquals(2, reader.numDocs());
        reader.close();
    }

    private Path getOutDir() {
        return new Path(_root, "out");
    }

    private Path getInDir() {
        return new Path(_root, "in");
    }

    private void dump(Path path, Configuration conf) throws IOException {
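        // Recursively print every path under the given root, for debugging.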
        FileSystem fileSystem = path.getFileSystem(conf);
        System.out.println(path);
        if (!fileSystem.isFile(path)) {
            FileStatus[] listStatus = fileSystem.listStatus(path);
            for (FileStatus fileStatus : listStatus) {
                dump(fileStatus.getPath(), conf);
            }
        }
    }

    private Collection<Path> getCommitedTasks(Path path) throws IOException {
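        // Each successfully committed task leaves a directory whose name ends
        // in ".commit" under the shard path.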
        Collection<Path> result = new TreeSet<Path>();
        FileSystem fileSystem = path.getFileSystem(_conf);
        FileStatus[] listStatus = fileSystem.listStatus(path);
        for (FileStatus fileStatus : listStatus) {
            Path p = fileStatus.getPath();
            if (fileStatus.isDir() && p.getName().endsWith(".commit")) {
                result.add(p);
            }
        }
        return result;
    }

    @Test
    public void testBlurOutputFormatOverFlowTest()
            throws IOException, InterruptedException, ClassNotFoundException {
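        // Index 80,000 records through a single shard with local indexing on
        // and in-flight optimization off, exercising the buffer overflow path.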
        Path input = getInDir();
        Path output = getOutDir();
        _fileSystem.delete(input, true);
        _fileSystem.delete(output, true);
        // 1500 * 50 = 75,000
        writeRecordsFile(new Path(input, "part1"), 1, 50, 1, 1500, "cf1");
        // 100 * 50 = 5,000
        writeRecordsFile(new Path(input, "part2"), 1, 50, 2000, 100, "cf1");

        Job job = Job.getInstance(_conf, "blur index");
        job.setJarByClass(BlurOutputFormatTest.class);
        job.setMapperClass(CsvBlurMapper.class);
        job.setInputFormatClass(TextInputFormat.class);

        FileInputFormat.addInputPath(job, input);
        CsvBlurMapper.addColumns(job, "cf1", "col");

        Path tablePath = new Path(new Path(_root, "table"), "test");

        TableDescriptor tableDescriptor = new TableDescriptor();
        tableDescriptor.setShardCount(1);
        tableDescriptor.setTableUri(tablePath.toString());
        tableDescriptor.setName("test");

        createShardDirectories(tablePath, 1);

        BlurOutputFormat.setupJob(job, tableDescriptor);
        BlurOutputFormat.setOutputPath(job, output);
        BlurOutputFormat.setIndexLocally(job, true);
        BlurOutputFormat.setOptimizeInFlight(job, false);

        assertTrue(job.waitForCompletion(true));
        Counters ctrs = job.getCounters();
        System.out.println("Counters: " + ctrs);

        Path path = new Path(output, ShardUtil.getShardName(0));
        Collection<Path> commitedTasks = getCommitedTasks(path);
        assertEquals(1, commitedTasks.size());

        DirectoryReader reader = DirectoryReader.open(new HdfsDirectory(_conf, commitedTasks.iterator().next()));
        assertEquals(80000, reader.numDocs());
        reader.close();
    }

    @Test
    public void testBlurOutputFormatOverFlowMultipleReducersTest()
            throws IOException, InterruptedException, ClassNotFoundException {
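        // Same 80,000-record load, but split across two shards with local
        // indexing disabled and a small heap-size-bounded document buffer.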
        Path input = getInDir();
        Path output = getOutDir();
        _fileSystem.delete(input, true);
        _fileSystem.delete(output, true);
        // 1500 * 50 = 75,000
        writeRecordsFile(new Path(input, "part1"), 1, 50, 1, 1500, "cf1");
        // 100 * 50 = 5,000
        writeRecordsFile(new Path(input, "part2"), 1, 50, 2000, 100, "cf1");

        Job job = Job.getInstance(_conf, "blur index");
        job.setJarByClass(BlurOutputFormatTest.class);
        job.setMapperClass(CsvBlurMapper.class);
        job.setInputFormatClass(TextInputFormat.class);

        FileInputFormat.addInputPath(job, input);
        CsvBlurMapper.addColumns(job, "cf1", "col");

        Path tablePath = new Path(new Path(_root, "table"), "test");

        TableDescriptor tableDescriptor = new TableDescriptor();
        tableDescriptor.setShardCount(2);
        tableDescriptor.setTableUri(tablePath.toString());
        tableDescriptor.setName("test");

        createShardDirectories(output, 2);

        BlurOutputFormat.setupJob(job, tableDescriptor);
        BlurOutputFormat.setOutputPath(job, output);
        BlurOutputFormat.setIndexLocally(job, false);
        BlurOutputFormat.setDocumentBufferStrategy(job, DocumentBufferStrategyHeapSize.class);
        BlurOutputFormat.setMaxDocumentBufferHeapSize(job, 128 * 1024);

        assertTrue(job.waitForCompletion(true));
        Counters ctrs = job.getCounters();
        System.out.println("Counters: " + ctrs);

        long total = 0;
        for (int i = 0; i < tableDescriptor.getShardCount(); i++) {
            Path path = new Path(output, ShardUtil.getShardName(i));
            Collection<Path> commitedTasks = getCommitedTasks(path);
            assertEquals(1, commitedTasks.size());

            DirectoryReader reader = DirectoryReader
                    .open(new HdfsDirectory(_conf, commitedTasks.iterator().next()));
            total += reader.numDocs();
            reader.close();
        }
        assertEquals(80000, total);

    }

    @Test
    public void testBlurOutputFormatOverFlowMultipleReducersWithReduceMultiplierTest()
            throws IOException, InterruptedException, ClassNotFoundException {
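        // Seven shards with a reducer multiplier of 2, so each shard should
        // receive committed output from at least two reduce tasks.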
        Path input = getInDir();
        Path output = getOutDir();
        _fileSystem.delete(input, true);
        _fileSystem.delete(output, true);

        // 1500 * 50 = 75,000
        writeRecordsFile(new Path(input, "part1"), 1, 50, 1, 1500, "cf1");
        // 100 * 50 = 5,000
        writeRecordsFile(new Path(input, "part2"), 1, 50, 2000, 100, "cf1");

        Job job = Job.getInstance(_conf, "blur index");
        job.setJarByClass(BlurOutputFormatTest.class);
        job.setMapperClass(CsvBlurMapper.class);
        job.setInputFormatClass(TextInputFormat.class);

        FileInputFormat.addInputPath(job, input);
        CsvBlurMapper.addColumns(job, "cf1", "col");

        Path tablePath = new Path(new Path(_root, "table"), "test");

        TableDescriptor tableDescriptor = new TableDescriptor();
        tableDescriptor.setShardCount(7);
        tableDescriptor.setTableUri(tablePath.toString());
        tableDescriptor.setName("test");

        createShardDirectories(output, 7);

        BlurOutputFormat.setupJob(job, tableDescriptor);
        BlurOutputFormat.setOutputPath(job, output);
        int multiple = 2;
        BlurOutputFormat.setReducerMultiplier(job, multiple);

        assertTrue(job.waitForCompletion(true));
        Counters ctrs = job.getCounters();
        System.out.println("Counters: " + ctrs);

        long total = 0;
        for (int i = 0; i < tableDescriptor.getShardCount(); i++) {
            Path path = new Path(output, ShardUtil.getShardName(i));
            Collection<Path> commitedTasks = getCommitedTasks(path);
            assertTrue(commitedTasks.size() >= multiple);
            for (Path p : commitedTasks) {
                DirectoryReader reader = DirectoryReader.open(new HdfsDirectory(_conf, p));
                total += reader.numDocs();
                reader.close();
            }
        }
        assertEquals(80000, total);

    }

    @Test(expected = IllegalArgumentException.class)
    public void testBlurOutputFormatValidateReducerCount()
            throws IOException, InterruptedException, ClassNotFoundException {
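        // Setting the reducer count by hand to a value that conflicts with the
        // multiplier times the shard count should fail the job on submit.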

        Path input = getInDir();
        Path output = getOutDir();
        _fileSystem.delete(input, true);
        _fileSystem.delete(output, true);
        writeRecordsFile(new Path(input, "part1"), 1, 1, 1, 1, "cf1");
        writeRecordsFile(new Path(input, "part2"), 1, 1, 2, 1, "cf1");

        Job job = Job.getInstance(_conf, "blur index");
        job.setJarByClass(BlurOutputFormatTest.class);
        job.setMapperClass(CsvBlurMapper.class);
        job.setInputFormatClass(TextInputFormat.class);

        FileInputFormat.addInputPath(job, input);
        CsvBlurMapper.addColumns(job, "cf1", "col");

        Path tablePath = new Path(new Path(_root, "table"), "test");

        TableDescriptor tableDescriptor = new TableDescriptor();
        tableDescriptor.setShardCount(1);
        tableDescriptor.setTableUri(tablePath.toString());
        tableDescriptor.setName("test");

        createShardDirectories(getOutDir(), 1);

        BlurOutputFormat.setupJob(job, tableDescriptor);
        BlurOutputFormat.setOutputPath(job, output);
        BlurOutputFormat.setReducerMultiplier(job, 2);
        job.setNumReduceTasks(4);
        job.submit();

    }

    // @TODO this test fails sometimes due to issues in the MR MiniCluster,
    // so it is disabled.
    // @Test
    public void testBlurOutputFormatCleanupDuringJobKillTest()
            throws IOException, InterruptedException, ClassNotFoundException {
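        // Kill the job once the reduce phase passes 70% and verify that every
        // shard output directory is cleaned up afterwards.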
        Path input = getInDir();
        Path output = getOutDir();
        _fileSystem.delete(input, true);
        _fileSystem.delete(output, true);
        // 1500 * 50 = 75,000
        writeRecordsFile(new Path(input, "part1"), 1, 50, 1, 1500, "cf1");
        // 100 * 5000 = 500,000
        writeRecordsFile(new Path(input, "part2"), 1, 5000, 2000, 100, "cf1");

        Job job = Job.getInstance(_conf, "blur index");
        job.setJarByClass(BlurOutputFormatTest.class);
        job.setMapperClass(CsvBlurMapper.class);
        job.setInputFormatClass(TextInputFormat.class);

        FileInputFormat.addInputPath(job, input);
        CsvBlurMapper.addColumns(job, "cf1", "col");

        Path tablePath = new Path(new Path(_root, "table"), "test");

        TableDescriptor tableDescriptor = new TableDescriptor();
        tableDescriptor.setShardCount(2);
        tableDescriptor.setTableUri(tablePath.toString());
        tableDescriptor.setName("test");

        createShardDirectories(getOutDir(), 2);

        BlurOutputFormat.setupJob(job, tableDescriptor);
        BlurOutputFormat.setOutputPath(job, output);
        BlurOutputFormat.setIndexLocally(job, false);

        job.submit();
        boolean killCalled = false;
        while (!job.isComplete()) {
            Thread.sleep(1000);
            System.out.printf("Killed [" + killCalled + "] Map [%f] Reduce [%f]%n", job.mapProgress() * 100,
                    job.reduceProgress() * 100);
            if (job.reduceProgress() > 0.7 && !killCalled) {
                job.killJob();
                killCalled = true;
            }
        }

        assertFalse(job.isSuccessful());

        for (int i = 0; i < tableDescriptor.getShardCount(); i++) {
            Path path = new Path(output, ShardUtil.getShardName(i));
            FileSystem fileSystem = path.getFileSystem(job.getConfiguration());
            FileStatus[] listStatus = fileSystem.listStatus(path);
            assertEquals(toString(listStatus), 0, listStatus.length);
        }
    }

    private String toString(FileStatus[] listStatus) {
        if (listStatus == null || listStatus.length == 0) {
            return "";
        }
        StringBuilder s = new StringBuilder();
        for (FileStatus fileStatus : listStatus) {
            if (s.length() > 0) {
                s.append(',');
            }
            s.append(fileStatus.getPath());
        }
        return s.toString();
    }

    public static String readFile(Path file) throws IOException {
        DataInputStream f = _fileSystem.open(file);
        BufferedReader b = new BufferedReader(new InputStreamReader(f));
        StringBuilder result = new StringBuilder();
        String line = b.readLine();
        while (line != null) {
            result.append(line);
            result.append('\n');
            line = b.readLine();
        }
        b.close();
        return result.toString();
    }

    private Path writeRecordsFile(Path file, int startingRowId, int numberOfRows, int startRecordId,
            int numberOfRecords, String family) throws IOException {
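        // Emit numberOfRows x numberOfRecords CSV lines of the form
        // "rowId,recordId,family,valuetoindex".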
        _fileSystem.delete(file, false);
        DataOutputStream f = _fileSystem.create(file);
        PrintWriter writer = new PrintWriter(f);
        for (int row = 0; row < numberOfRows; row++) {
            for (int record = 0; record < numberOfRecords; record++) {
                writer.println(getRecord(row + startingRowId, record + startRecordId, family));
            }
        }
        writer.close();
        return file;
    }

    private void createShardDirectories(Path outDir, int shardCount) throws IOException {
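        // Pre-create one directory per shard under the given path.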
        _fileSystem.mkdirs(outDir);
        for (int i = 0; i < shardCount; i++) {
            _fileSystem.mkdirs(new Path(outDir, ShardUtil.getShardName(i)));
        }
    }

    private String getRecord(int rowId, int recordId, String family) {
        return rowId + "," + recordId + "," + family + ",valuetoindex";
    }
}