Java tutorial: reading HDFS input with Hazelcast Jet's readHdfsP

The listing below is the ReadHdfsPTest class from the Hazelcast Jet Hadoop module. It is a parameterized JUnit test that writes four small input files (plain text or SequenceFile, depending on the parameter), reads them back through the readHdfsP source into a two-vertex DAG, and checks what arrives at the sink under three different key/value mapper variants.
/*
 * Copyright (c) 2008-2019, Hazelcast, Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.jet.hadoop.impl;

import com.hazelcast.core.IList;
import com.hazelcast.jet.JetInstance;
import com.hazelcast.jet.Util;
import com.hazelcast.jet.core.DAG;
import com.hazelcast.jet.core.Vertex;
import com.hazelcast.jet.function.BiFunctionEx;
import com.hazelcast.jet.impl.util.ExceptionUtil;
import com.hazelcast.test.HazelcastParametersRunnerFactory;
import com.hazelcast.test.annotation.ParallelTest;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.SequenceFile.Writer.Option;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.junit.Before;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.Future;
import java.util.stream.IntStream;

import static com.hazelcast.jet.core.Edge.between;
import static com.hazelcast.jet.core.processor.SinkProcessors.writeListP;
import static com.hazelcast.jet.hadoop.HdfsProcessors.readHdfsP;
import static com.hazelcast.jet.impl.util.Util.uncheckRun;
import static java.lang.Integer.parseInt;
import static java.util.stream.IntStream.range;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

@RunWith(Parameterized.class)
@Parameterized.UseParametersRunnerFactory(HazelcastParametersRunnerFactory.class)
@Category(ParallelTest.class)
public class ReadHdfsPTest extends HdfsTestSupport {

    // Four lines of the form "key-i value-i"; every test file contains all of them.
    private static final String[] ENTRIES = range(0, 4)
            .mapToObj(i -> "key-" + i + " value-" + i + '\n')
            .toArray(String[]::new);

    @Parameterized.Parameter
    public Class<? extends InputFormat> inputFormatClass;

    @Parameterized.Parameter(1)
    public EMapperType mapperType;

    private JobConf jobConf;
    private JetInstance instance;
    private Set<Path> paths = new HashSet<>();

    // Each input format is exercised with all three mapper variants.
    @Parameterized.Parameters(name = "inputFormatClass={0}, mapper={1}")
    public static Collection<Object[]> parameters() {
        return Arrays.asList(
                new Object[]{TextInputFormat.class, EMapperType.DEFAULT},
                new Object[]{TextInputFormat.class, EMapperType.CUSTOM},
                new Object[]{TextInputFormat.class, EMapperType.CUSTOM_WITH_NULLS},
                new Object[]{SequenceFileInputFormat.class, EMapperType.DEFAULT},
                new Object[]{SequenceFileInputFormat.class, EMapperType.CUSTOM},
                new Object[]{SequenceFileInputFormat.class, EMapperType.CUSTOM_WITH_NULLS}
        );
    }

    @Before
    public void setup() throws IOException {
        // Start a Jet member and register the freshly written files as job input.
        instance = createJetMember();
        jobConf = new JobConf();
        jobConf.setInputFormat(inputFormatClass);
        writeToFile();
        for (Path path : paths) {
            FileInputFormat.addInputPath(jobConf, path);
        }
    }

    @Test
    public void testReadHdfs() {
        // Two-vertex DAG: readHdfsP emits mapped (key, value) items,
        // writeListP collects them into the IList named "sink".
        DAG dag = new DAG();
        Vertex source = dag.newVertex("source", readHdfsP(jobConf, mapperType.mapper))
                           .localParallelism(4);
        Vertex sink = dag.newVertex("sink", writeListP("sink"))
                         .localParallelism(1);
        dag.edge(between(source, sink));

        Future<Void> future = instance.newJob(dag).getFuture();
        assertCompletesEventually(future);

        IList list = instance.getList("sink");
        assertEquals(expectedSinkSize(), list.size());
        assertTrue(list.get(0).toString().contains("value"));
    }

    // 4 files * 4 entries = 16 items; the null-returning mapper drops the odd-keyed half.
    private int expectedSinkSize() {
        return mapperType == EMapperType.CUSTOM_WITH_NULLS ? 8 : 16;
    }

    private void writeToFile() throws IOException {
        Configuration conf = new Configuration();
        LocalFileSystem local = FileSystem.getLocal(conf);

        IntStream.range(0, 4).mapToObj(i -> createPath()).forEach(path -> uncheckRun(() -> {
            paths.add(path);
            if (SequenceFileInputFormat.class.equals(inputFormatClass)) {
                writeToSequenceFile(conf, path);
            } else {
                writeToTextFile(local, path);
            }
        }));
    }

    private static void writeToTextFile(LocalFileSystem local, Path path) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(local.create(path)))) {
            for (String value : ENTRIES) {
                writer.write(value);
                writer.flush();
            }
        }
    }

    private static void writeToSequenceFile(Configuration conf, Path path) throws IOException {
        IntWritable key = new IntWritable();
        Text value = new Text();
        Option fileOption = Writer.file(path);
        Option keyClassOption = Writer.keyClass(key.getClass());
        Option valueClassOption = Writer.valueClass(value.getClass());
        try (Writer writer = SequenceFile.createWriter(conf, fileOption, keyClassOption, valueClassOption)) {
            for (int i = 0; i < ENTRIES.length; i++) {
                key.set(i);
                value.set(ENTRIES[i]);
                writer.append(key, value);
            }
        }
    }

    private Path createPath() {
        try {
            String fileName = Files.createTempFile(getClass().getName(), null).toString();
            return new Path(fileName);
        } catch (IOException e) {
            throw ExceptionUtil.sneakyThrow(e);
        }
    }

    // DEFAULT emits Map.Entry(key, value); CUSTOM emits only the value text;
    // CUSTOM_WITH_NULLS keeps entries whose digit is even (returning null drops the item).
    private enum EMapperType {
        DEFAULT(Util::entry),
        CUSTOM((k, v) -> v.toString()),
        CUSTOM_WITH_NULLS((k, v) ->
                parseInt(v.toString().substring(4, 5)) % 2 == 0 ? v.toString() : null);

        private final BiFunctionEx<?, Text, ?> mapper;

        EMapperType(BiFunctionEx<?, Text, ?> mapper) {
            this.mapper = mapper;
        }
    }
}
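For orientation outside the JUnit harness, here is a minimal sketch of the same two-vertex pipeline as a standalone program. Everything not shown in the test above is an assumption: the ReadHdfsExample class and package name, the placeholder input path /path/to/input, and the use of Jet.newJetInstance() and Jet.shutdownAll() in place of the harness's createJetMember().

package com.hazelcast.jet.hadoop.tutorial; // hypothetical package for this sketch

import com.hazelcast.jet.Jet;
import com.hazelcast.jet.JetInstance;
import com.hazelcast.jet.Util;
import com.hazelcast.jet.core.DAG;
import com.hazelcast.jet.core.Vertex;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

import static com.hazelcast.jet.core.Edge.between;
import static com.hazelcast.jet.core.processor.SinkProcessors.writeListP;
import static com.hazelcast.jet.hadoop.HdfsProcessors.readHdfsP;

public class ReadHdfsExample {
    public static void main(String[] args) throws Exception {
        // Configure the Hadoop job: an input format plus at least one input path.
        JobConf jobConf = new JobConf();
        jobConf.setInputFormat(TextInputFormat.class);
        FileInputFormat.addInputPath(jobConf, new Path("/path/to/input")); // placeholder path

        // Same DAG shape as testReadHdfs(): HDFS source -> IList sink.
        // Util::entry is the DEFAULT mapper from the test, emitting Map.Entry items.
        DAG dag = new DAG();
        Vertex source = dag.newVertex("source", readHdfsP(jobConf, Util::entry));
        Vertex sink = dag.newVertex("sink", writeListP("sink"));
        dag.edge(between(source, sink));

        JetInstance jet = Jet.newJetInstance();
        try {
            // Batch job: the future completes when all input has been read.
            jet.newJob(dag).getFuture().get();
            System.out.println(jet.getList("sink").size() + " items read");
        } finally {
            Jet.shutdownAll();
        }
    }
}

The same JobConf should also work against a real cluster: register an hdfs:// URI with FileInputFormat.addInputPath (with the cluster's Hadoop configuration on the classpath) instead of a local path, as the test does with its temp files.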