io.druid.segment.loading.HdfsDataSegmentFinderTest.java Source code

Introduction

Here is the source code for io.druid.segment.loading.HdfsDataSegmentFinderTest.java.
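
The test exercises io.druid.storage.hdfs.HdfsDataSegmentFinder against an in-process MiniDFSCluster. It lays out five segments (four intervals, the last split into two numbered shards) as descriptor.json/index.zip pairs, verifies that findSegments discovers all of them, checks that the descriptor files are rewritten only when the updateDescriptor flag is set, and covers two failure modes: a missing index.zip and an ObjectMapper that cannot deserialize the shard spec.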

Source

/*
 * Licensed to Metamarkets Group Inc. (Metamarkets) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Metamarkets licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package io.druid.segment.loading;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.jsontype.NamedType;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.druid.jackson.DefaultObjectMapper;
import io.druid.storage.hdfs.HdfsDataSegmentFinder;
import io.druid.timeline.DataSegment;
import io.druid.timeline.partition.NumberedShardSpec;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.joda.time.Interval;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.Set;

/**
 * Tests {@link HdfsDataSegmentFinder} against an in-process HDFS mini-cluster.
 */
public class HdfsDataSegmentFinderTest {

    private static final ObjectMapper mapper = new DefaultObjectMapper();
    private static final String DESCRIPTOR_JSON = "descriptor.json";
    private static final String INDEX_ZIP = "index.zip";
    private static final DataSegment SEGMENT_1 = DataSegment.builder().dataSource("wikipedia")
            .interval(new Interval("2013-08-31T00:00:00.000Z/2013-09-01T00:00:00.000Z"))
            .version("2015-10-21T22:07:57.074Z")
            .loadSpec(ImmutableMap.<String, Object>of("type", "hdfs", "path",
                    "hdfs://abc.com:1234/somewhere/index.zip"))
            .dimensions(ImmutableList.of("language", "page")).metrics(ImmutableList.of("count")).build();

    private static final DataSegment SEGMENT_2 = DataSegment.builder(SEGMENT_1)
            .interval(new Interval("2013-09-01T00:00:00.000Z/2013-09-02T00:00:00.000Z")).build();

    private static final DataSegment SEGMENT_3 = DataSegment.builder(SEGMENT_1)
            .interval(new Interval("2013-09-02T00:00:00.000Z/2013-09-03T00:00:00.000Z"))
            .version("2015-10-22T22:07:57.074Z").build();

    private static final DataSegment SEGMENT_4_0 = DataSegment.builder(SEGMENT_1)
            .interval(new Interval("2013-09-02T00:00:00.000Z/2013-09-03T00:00:00.000Z"))
            .shardSpec(new NumberedShardSpec(0, 2)).build();

    private static final DataSegment SEGMENT_4_1 = DataSegment.builder(SEGMENT_1)
            .interval(new Interval("2013-09-02T00:00:00.000Z/2013-09-03T00:00:00.000Z"))
            .shardSpec(new NumberedShardSpec(1, 2)).build();

    private static MiniDFSCluster miniCluster;
    private static File hdfsTmpDir;
    private static URI uriBase;
    private static Configuration conf;
    private static FileSystem fs;

    private Path dataSourceDir;
    private Path descriptor1;
    private Path descriptor2;
    private Path descriptor3;
    private Path descriptor4_0;
    private Path descriptor4_1;
    private Path indexZip1;
    private Path indexZip2;
    private Path indexZip3;
    private Path indexZip4_0;
    private Path indexZip4_1;

    @BeforeClass
    public static void setupStatic() throws IOException {
        mapper.registerSubtypes(new NamedType(NumberedShardSpec.class, "numbered"));

        // createTempFile is used only to reserve a unique path; the file is deleted so that
        // MiniDFSCluster can create its base directory at that location
        hdfsTmpDir = File.createTempFile("hdfsDataSource", "dir");
        hdfsTmpDir.deleteOnExit();
        if (!hdfsTmpDir.delete()) {
            throw new IOException(String.format("Unable to delete hdfsTmpDir [%s]", hdfsTmpDir.getAbsolutePath()));
        }
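        // spin up a single-node HDFS mini-cluster rooted at the temp directory; every path
        // in the test resolves against this cluster's URI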
        conf = new Configuration(true);
        conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsTmpDir.getAbsolutePath());
        miniCluster = new MiniDFSCluster.Builder(conf).build();
        uriBase = miniCluster.getURI();
        fs = miniCluster.getFileSystem();
    }

    @AfterClass
    public static void tearDownStatic() {
        if (miniCluster != null) {
            miniCluster.shutdown(true);
        }
    }

    @Before
    public void setUp() throws IOException {
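        // lay out a dataSource/interval/version/partitionNum tree, with a descriptor.json
        // and an index.zip at each leaf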
        dataSourceDir = new Path(new Path(uriBase), "/usr/dataSource");
        descriptor1 = new Path(dataSourceDir, "interval1/v1/0/" + DESCRIPTOR_JSON);
        descriptor2 = new Path(dataSourceDir, "interval2/v1/0/" + DESCRIPTOR_JSON);
        descriptor3 = new Path(dataSourceDir, "interval3/v2/0/" + DESCRIPTOR_JSON);
        descriptor4_0 = new Path(dataSourceDir, "interval4/v1/0/" + DESCRIPTOR_JSON);
        descriptor4_1 = new Path(dataSourceDir, "interval4/v1/1/" + DESCRIPTOR_JSON);
        indexZip1 = new Path(descriptor1.getParent(), INDEX_ZIP);
        indexZip2 = new Path(descriptor2.getParent(), INDEX_ZIP);
        indexZip3 = new Path(descriptor3.getParent(), INDEX_ZIP);
        indexZip4_0 = new Path(descriptor4_0.getParent(), INDEX_ZIP);
        indexZip4_1 = new Path(descriptor4_1.getParent(), INDEX_ZIP);

        // serialize each fixture segment to its descriptor.json
        mapper.writeValue(fs.create(descriptor1), SEGMENT_1);
        mapper.writeValue(fs.create(descriptor2), SEGMENT_2);
        mapper.writeValue(fs.create(descriptor3), SEGMENT_3);
        mapper.writeValue(fs.create(descriptor4_0), SEGMENT_4_0);
        mapper.writeValue(fs.create(descriptor4_1), SEGMENT_4_1);

        // pair each descriptor with an (empty) index.zip so the finder sees a complete segment
        create(indexZip1);
        create(indexZip2);
        create(indexZip3);
        create(indexZip4_0);
        create(indexZip4_1);
    }

    private void create(Path path) throws IOException {
        // create an empty placeholder file standing in for a real index.zip
        try (FSDataOutputStream os = fs.create(path)) {
        }
    }

    @Test
    public void testFindSegments() throws Exception {
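        // first pass: discover segments without rewriting their descriptor files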
        final HdfsDataSegmentFinder hdfsDataSegmentFinder = new HdfsDataSegmentFinder(conf, mapper);

        final Set<DataSegment> segments = hdfsDataSegmentFinder.findSegments(dataSourceDir.toString(), false);

        Assert.assertEquals(5, segments.size());

        DataSegment updatedSegment1 = null;
        DataSegment updatedSegment2 = null;
        DataSegment updatedSegment3 = null;
        DataSegment updatedSegment4_0 = null;
        DataSegment updatedSegment4_1 = null;
        for (DataSegment dataSegment : segments) {
            if (dataSegment.getIdentifier().equals(SEGMENT_1.getIdentifier())) {
                updatedSegment1 = dataSegment;
            } else if (dataSegment.getIdentifier().equals(SEGMENT_2.getIdentifier())) {
                updatedSegment2 = dataSegment;
            } else if (dataSegment.getIdentifier().equals(SEGMENT_3.getIdentifier())) {
                updatedSegment3 = dataSegment;
            } else if (dataSegment.getIdentifier().equals(SEGMENT_4_0.getIdentifier())) {
                updatedSegment4_0 = dataSegment;
            } else if (dataSegment.getIdentifier().equals(SEGMENT_4_1.getIdentifier())) {
                updatedSegment4_1 = dataSegment;
            } else {
                Assert.fail("Unexpected segment");
            }
        }

        Assert.assertEquals(descriptor1.toUri().getPath(), getDescriptorPath(updatedSegment1));
        Assert.assertEquals(descriptor2.toUri().getPath(), getDescriptorPath(updatedSegment2));
        Assert.assertEquals(descriptor3.toUri().getPath(), getDescriptorPath(updatedSegment3));
        Assert.assertEquals(descriptor4_0.toUri().getPath(), getDescriptorPath(updatedSegment4_0));
        Assert.assertEquals(descriptor4_1.toUri().getPath(), getDescriptorPath(updatedSegment4_1));

        final String serializedSegment1 = mapper.writeValueAsString(updatedSegment1);
        final String serializedSegment2 = mapper.writeValueAsString(updatedSegment2);
        final String serializedSegment3 = mapper.writeValueAsString(updatedSegment3);
        final String serializedSegment4_0 = mapper.writeValueAsString(updatedSegment4_0);
        final String serializedSegment4_1 = mapper.writeValueAsString(updatedSegment4_1);

        // since updateDescriptor was not enabled, the descriptor.json files still contain stale load specs
        Assert.assertNotEquals(serializedSegment1, readContent(descriptor1));
        Assert.assertNotEquals(serializedSegment2, readContent(descriptor2));
        Assert.assertNotEquals(serializedSegment3, readContent(descriptor3));
        Assert.assertNotEquals(serializedSegment4_0, readContent(descriptor4_0));
        Assert.assertNotEquals(serializedSegment4_1, readContent(descriptor4_1));

        // enable updateDescriptor so that each descriptor.json is rewritten to reflect the new loadSpec
        final Set<DataSegment> segments2 = hdfsDataSegmentFinder.findSegments(dataSourceDir.toString(), true);

        Assert.assertEquals(segments, segments2);
        Assert.assertEquals(serializedSegment1, readContent(descriptor1));
        Assert.assertEquals(serializedSegment2, readContent(descriptor2));
        Assert.assertEquals(serializedSegment3, readContent(descriptor3));
        Assert.assertEquals(serializedSegment4_0, readContent(descriptor4_0));
        Assert.assertEquals(serializedSegment4_1, readContent(descriptor4_1));
    }

    @Test(expected = SegmentLoadingException.class)
    public void testFindSegmentsFail() throws Exception {
        // remove one of the index.zip files while keeping its descriptor.json
        fs.delete(indexZip4_1, false);

        final HdfsDataSegmentFinder hdfsDataSegmentFinder = new HdfsDataSegmentFinder(conf, mapper);
        hdfsDataSegmentFinder.findSegments(dataSourceDir.toString(), false);
    }

    @Test(expected = SegmentLoadingException.class)
    public void testFindSegmentsFail2() throws SegmentLoadingException {
        // will fail to deserialize descriptor.json because this plain DefaultObjectMapper doesn't recognize NumberedShardSpec
        final HdfsDataSegmentFinder hdfsDataSegmentFinder = new HdfsDataSegmentFinder(conf,
                new DefaultObjectMapper());
        try {
            hdfsDataSegmentFinder.findSegments(dataSourceDir.toString(), false);
        } catch (SegmentLoadingException e) {
            Assert.assertTrue(e.getCause() instanceof IOException);
            throw e;
        }
    }

    private String getDescriptorPath(DataSegment segment) {
        // the descriptor.json sits next to the index.zip referenced by the segment's load spec
        final Path indexZip = new Path(String.valueOf(segment.getLoadSpec().get("path")));
        return indexZip.getParent().toString() + "/" + DESCRIPTOR_JSON;
    }

    private String readContent(Path descriptor) throws IOException {
        // try-with-resources closes the stream even if reading fails
        try (FSDataInputStream is = fs.open(descriptor)) {
            return IOUtils.toString(is);
        }
    }
}
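
Usage note

Outside of a test, the finder is driven the same way. A minimal sketch, assuming a Configuration that points at a real HDFS cluster (the hdfs://namenode:8020/... path below is a placeholder) and a mapper with the relevant shard-spec subtypes registered:

    Configuration conf = new Configuration();
    ObjectMapper mapper = new DefaultObjectMapper();
    // register the shard spec subtype, just as the test does in setupStatic()
    mapper.registerSubtypes(new NamedType(NumberedShardSpec.class, "numbered"));

    HdfsDataSegmentFinder finder = new HdfsDataSegmentFinder(conf, mapper);
    // pass true to rewrite stale descriptor.json files with the discovered loadSpec;
    // findSegments throws SegmentLoadingException on a missing index.zip or an unparsable descriptor
    Set<DataSegment> segments = finder.findSegments("hdfs://namenode:8020/druid/segments/wikipedia", true);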