io.druid.indexer.HadoopDruidIndexerConfigTest.java Source code

Introduction

Here is the source code for io.druid.indexer.HadoopDruidIndexerConfigTest.java.
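The tests cover segment output path construction for HDFS and for non-HDFS (local) filesystems, and hashed bucket selection when hash-based numbered shard specs are used.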

Source

/*
 * Druid - a distributed column store.
 * Copyright 2012 - 2015 Metamarkets Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.druid.indexer;

import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.metamx.common.Granularity;
import io.druid.data.input.MapBasedInputRow;
import io.druid.granularity.QueryGranularity;
import io.druid.jackson.DefaultObjectMapper;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.segment.indexing.DataSchema;
import io.druid.segment.indexing.granularity.UniformGranularitySpec;
import io.druid.timeline.partition.HashBasedNumberedShardSpec;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.junit.Assert;
import org.junit.Test;

import java.util.Arrays;
import java.util.List;

/**
 * Unit tests for {@link HadoopDruidIndexerConfig}: segment output path construction and hashed bucket selection.
 */
public class HadoopDruidIndexerConfigTest {
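    // Shared JSON mapper; it registers itself as an injectable value so that deserialized objects
    // that declare an injected ObjectMapper receive this same instance.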
    private static final ObjectMapper jsonMapper;
    static {
        jsonMapper = new DefaultObjectMapper();
        jsonMapper.setInjectableValues(new InjectableValues.Std().addValue(ObjectMapper.class, jsonMapper));
    }

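    /**
     * Deserializes {@code s} into {@code klass}, serializes the result back to JSON, and
     * deserializes it once more, exercising both directions of the mapping.
     */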
    public static <T> T jsonReadWriteRead(String s, Class<T> klass) {
        try {
            return jsonMapper.readValue(jsonMapper.writeValueAsBytes(jsonMapper.readValue(s, klass)), klass);
        } catch (Exception e) {
            throw Throwables.propagate(e);
        }
    }

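    // HDFS does not allow ':' in path names; the expected path below shows the colons in the
    // version replaced with '_' and the bucket interval rendered without colons.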
    @Test
    public void shouldMakeHDFSCompliantSegmentOutputPath() {
        HadoopIngestionSpec schema;

        try {
            schema = jsonReadWriteRead("{\n" + "    \"dataSchema\": {\n" + "        \"dataSource\": \"source\",\n"
                    + "        \"metricsSpec\": [],\n" + "        \"granularitySpec\": {\n"
                    + "            \"type\": \"uniform\",\n" + "            \"segmentGranularity\": \"hour\",\n"
                    + "            \"intervals\": [\"2012-07-10/P1D\"]\n" + "        }\n" + "    },\n"
                    + "    \"ioConfig\": {\n" + "        \"type\": \"hadoop\",\n"
                    + "        \"segmentOutputPath\": \"hdfs://server:9100/tmp/druid/datatest\"\n" + "    }\n"
                    + "}", HadoopIngestionSpec.class);
        } catch (Exception e) {
            throw Throwables.propagate(e);
        }

        HadoopDruidIndexerConfig cfg = new HadoopDruidIndexerConfig(
                schema.withTuningConfig(schema.getTuningConfig().withVersion("some:brand:new:version")));

        Bucket bucket = new Bucket(4711, new DateTime(2012, 7, 10, 5, 30), 4712);
        Path path = JobHelper.makeSegmentOutputPath(new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
                new DistributedFileSystem(), cfg.getSchema().getDataSchema().getDataSource(),
                cfg.getSchema().getTuningConfig().getVersion(),
                cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
                bucket.partitionNum);
        Assert.assertEquals(
                "hdfs://server:9100/tmp/druid/datatest/source/20120710T050000.000Z_20120710T060000.000Z/some_brand_new_version/4712",
                path.toString());
    }

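    // On a non-HDFS (local) filesystem no sanitizing is needed: colons in the data source,
    // version, and interval are preserved in the resulting path.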
    @Test
    public void shouldMakeDefaultSegmentOutputPathIfNotHDFS() {
        final HadoopIngestionSpec schema;

        try {
            schema = jsonReadWriteRead(
                    "{\n" + "    \"dataSchema\": {\n" + "        \"dataSource\": \"the:data:source\",\n"
                            + "        \"metricsSpec\": [],\n" + "        \"granularitySpec\": {\n"
                            + "            \"type\": \"uniform\",\n"
                            + "            \"segmentGranularity\": \"hour\",\n"
                            + "            \"intervals\": [\"2012-07-10/P1D\"]\n" + "        }\n" + "    },\n"
                            + "    \"ioConfig\": {\n" + "        \"type\": \"hadoop\",\n"
                            + "        \"segmentOutputPath\": \"/tmp/dru:id/data:test\"\n" + "    }\n" + "}",
                    HadoopIngestionSpec.class);
        } catch (Exception e) {
            throw Throwables.propagate(e);
        }

        HadoopDruidIndexerConfig cfg = new HadoopDruidIndexerConfig(
                schema.withTuningConfig(schema.getTuningConfig().withVersion("some:brand:new:version")));

        Bucket bucket = new Bucket(4711, new DateTime(2012, 7, 10, 5, 30), 4712);
        Path path = JobHelper.makeSegmentOutputPath(new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
                new LocalFileSystem(), cfg.getSchema().getDataSchema().getDataSource(),
                cfg.getSchema().getTuningConfig().getVersion(),
                cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
                bucket.partitionNum);
        Assert.assertEquals(
                "file:/tmp/dru:id/data:test/the:data:source/2012-07-10T05:00:00.000Z_2012-07-10T06:00:00.000Z/some:brand:new:version/4712",
                path.toString());

    }

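    // All rows that share the same dimensions and fall into the same truncated (minute) timestamp
    // should be assigned to the same hashed partition.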
    @Test
    public void testHashedBucketSelection() {
        List<HadoopyShardSpec> specs = Lists.newArrayList();
        final int partitionCount = 10;
        for (int i = 0; i < partitionCount; i++) {
            specs.add(new HadoopyShardSpec(
                    new HashBasedNumberedShardSpec(i, partitionCount, new DefaultObjectMapper()), i));
        }

        HadoopIngestionSpec spec = new HadoopIngestionSpec(
                new DataSchema("foo", null, new AggregatorFactory[0],
                        new UniformGranularitySpec(Granularity.MINUTE, QueryGranularity.MINUTE,
                                ImmutableList.of(new Interval("2010-01-01/P1D"))),
                        jsonMapper),
                new HadoopIOConfig(ImmutableMap.<String, Object>of("paths", "bar", "type", "static"), null, null),
                new HadoopTuningConfig(null, null, null,
                        ImmutableMap.of(new DateTime("2010-01-01T01:00:00"), specs), null, null, false, false,
                        false, false, null, false, false, false, null, null, false));
        HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromSpec(spec);
        final List<String> dims = Arrays.asList("diM1", "dIM2");
        final ImmutableMap<String, Object> values = ImmutableMap.<String, Object>of("Dim1", "1", "DiM2", "2",
                "dim1", "3", "dim2", "4");
        final long timestamp = new DateTime("2010-01-01T01:00:01").getMillis();
        final Bucket expectedBucket = config.getBucket(new MapBasedInputRow(timestamp, dims, values)).get();
        final long nextBucketTimestamp = QueryGranularity.MINUTE.next(QueryGranularity.MINUTE.truncate(timestamp));
        // check that all rows with the same set of dims and the same truncated timestamp hash to the same bucket
        for (int i = 0; timestamp + i < nextBucketTimestamp; i++) {
            Assert.assertEquals(expectedBucket.partitionNum,
                    config.getBucket(new MapBasedInputRow(timestamp + i, dims, values)).get().partitionNum);
        }

    }
}