/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.source.extractor;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.Path;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormat;
import org.junit.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.SourceState;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.DatePartitionedAvroFileSource;
import org.apache.gobblin.source.workunit.Extract.TableType;
import org.apache.gobblin.source.workunit.MultiWorkUnit;
import org.apache.gobblin.source.workunit.WorkUnit;
import org.apache.gobblin.stream.RecordEnvelope;
import org.apache.gobblin.writer.AvroDataWriterBuilder;
import org.apache.gobblin.writer.DataWriter;
import org.apache.gobblin.writer.DataWriterBuilder;
import org.apache.gobblin.writer.Destination;
import org.apache.gobblin.writer.PartitionedDataWriter;
import org.apache.gobblin.writer.WriterOutputFormat;
import org.apache.gobblin.writer.partitioner.TimeBasedAvroWriterPartitioner;
import org.apache.gobblin.writer.partitioner.TimeBasedWriterPartitioner;


/**
 * Unit tests for {@link DatePartitionedAvroFileExtractor}.
 *
 * @author Lorand Bendig
 */
@Test(groups = { "gobblin.source.extractor" })
public class DatePartitionedAvroFileExtractorTest {

  private static final String SIMPLE_CLASS_NAME = DatePartitionedAvroFileExtractorTest.class.getSimpleName();

  private static final String TEST_ROOT_DIR = "/tmp/" + SIMPLE_CLASS_NAME + "-test";
  private static final String STAGING_DIR = TEST_ROOT_DIR + Path.SEPARATOR + "staging";
  private static final String OUTPUT_DIR = TEST_ROOT_DIR + Path.SEPARATOR + "job-output";
  private static final String FILE_NAME = SIMPLE_CLASS_NAME + "-name.avro";

  private static final String PARTITION_COLUMN_NAME = "timestamp";
  private static final String PREFIX = "minutes";
  private static final String SUFFIX = "test";
  private static final String SOURCE_ENTITY = "testsource";
  private static final String DATE_PATTERN = "yyyy/MM/dd/HH_mm";

  private static final int RECORD_SIZE = 4;

  private static final String AVRO_SCHEMA =
      "{" +
        "\"type\" : \"record\"," +
        "\"name\" : \"User\"," +
        "\"namespace\" : \"example.avro\"," +
        "\"fields\" : [" +
          "{" +
            "\"name\" : \"" + PARTITION_COLUMN_NAME + "\"," +
            "\"type\" : \"long\"" +
          "}" +
        "]" +
      "}";

  private Schema schema;
  private DataWriter<GenericRecord> writer;

  private DateTime startDateTime;
  private long[] recordTimestamps = new long[RECORD_SIZE];

  private static final DateTimeZone TZ = DateTimeZone.forID(ConfigurationKeys.PST_TIMEZONE_NAME);

  @BeforeClass
  public void setUp() throws IOException {

    this.schema = new Schema.Parser().parse(AVRO_SCHEMA);

    // set up datetime objects
    DateTime now = new DateTime(TZ).minusHours(6);
    this.startDateTime =
        new DateTime(now.getYear(), now.getMonthOfYear(), now.getDayOfMonth(), now.getHourOfDay(), 30, 0, TZ);

    // create records: the first one at startDateTime, the remaining ones 4 hours later,
    // each shifted by 1 minute so that every record lands in its own minute partition
    DateTime recordDt = startDateTime;
    recordTimestamps[0] = recordDt.getMillis();
    recordDt = recordDt.plusHours(4);
    for (int i = 1; i < RECORD_SIZE; i++) {
      recordDt = recordDt.plusMinutes(1);
      recordTimestamps[i] = recordDt.getMillis();
    }

    // create dummy data partitioned by minutes
    State state = new State();
    state.setProp(TimeBasedAvroWriterPartitioner.WRITER_PARTITION_COLUMNS, PARTITION_COLUMN_NAME);
    state.setProp(ConfigurationKeys.WRITER_BUFFER_SIZE, ConfigurationKeys.DEFAULT_BUFFER_SIZE);
    state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, ConfigurationKeys.LOCAL_FS_URI);
    state.setProp(ConfigurationKeys.WRITER_STAGING_DIR, STAGING_DIR);
    state.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, OUTPUT_DIR);
    state.setProp(ConfigurationKeys.WRITER_FILE_PATH, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.WRITER_FILE_NAME, FILE_NAME);
    state.setProp(TimeBasedWriterPartitioner.WRITER_PARTITION_PATTERN, DATE_PATTERN);
    state.setProp(TimeBasedWriterPartitioner.WRITER_PARTITION_PREFIX, PREFIX);
    state.setProp(TimeBasedWriterPartitioner.WRITER_PARTITION_SUFFIX, SUFFIX);
    state.setProp(ConfigurationKeys.WRITER_PARTITIONER_CLASS, TimeBasedAvroWriterPartitioner.class.getName());

    DataWriterBuilder<Schema, GenericRecord> builder = new AvroDataWriterBuilder()
        .writeTo(Destination.of(Destination.DestinationType.HDFS, state))
        .writeInFormat(WriterOutputFormat.AVRO)
        .withWriterId("writer-1")
        .withSchema(this.schema)
        .withBranches(1)
        .forBranch(0);

    this.writer = new PartitionedDataWriter<Schema, GenericRecord>(builder, state);

    GenericRecordBuilder genericRecordBuilder = new GenericRecordBuilder(this.schema);
    for (int i = 0; i < RECORD_SIZE; i++) {
      genericRecordBuilder.set(PARTITION_COLUMN_NAME, recordTimestamps[i]);
      this.writer.writeEnvelope(new RecordEnvelope<>(genericRecordBuilder.build()));
    }

    this.writer.close();
    this.writer.commit();
  }
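
  /*
   * Note: the tests below each run a DatePartitionedAvroFileSource over the minute-partitioned data
   * written in setUp(): 4 records, the first roughly 6 hours old and the remaining three roughly
   * 2 hours old, each in its own yyyy/MM/dd/HH_mm partition. The min watermark is set to one minute
   * before the first record, so all partitions are eligible on the initial pull.
   */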

  @Test
  public void testJobStateNotCopiedToWorkUnit() {

    DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource();

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    state.setProp("date.partitioned.source.min.watermark.value",
        DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1)));
    state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    state.setProp("date.partitioned.source.partition.prefix", PREFIX);
    state.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    String dummyKey = "dummy.job.config";
    state.setProp(dummyKey, "dummy");

    List<WorkUnit> workunits = source.getWorkunits(state);

    Assert.assertEquals(workunits.size(), 4);
    for (WorkUnit wu : workunits) {
      if (wu instanceof MultiWorkUnit) {
        for (WorkUnit workUnit : ((MultiWorkUnit) wu).getWorkUnits()) {
          Assert.assertFalse(workUnit.contains(dummyKey));
        }
      } else {
        Assert.assertFalse(wu.contains(dummyKey));
      }
    }
  }

  @Test
  public void testReadPartitionsByMinute() throws IOException, DataRecordException {

    DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource();

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    state.setProp("date.partitioned.source.min.watermark.value",
        DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1)));
    state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    state.setProp("date.partitioned.source.partition.prefix", PREFIX);
    state.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    // Read data partitioned by minutes, i.e. each workunit is assigned records under the same
    // yyyy/MM/dd/HH_mm directory
    List<WorkUnit> workunits = source.getWorkunits(state);

    Assert.assertEquals(workunits.size(), 4);
    verifyWorkUnits(workunits);
  }
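
  /*
   * Note: lead time keeps the freshest partitions out of a run. With lead_time.size = 3 and
   * lead_time.granularity = HOUR, partitions newer than (now - 3 hours) are skipped here and
   * presumably only become eligible on a later execution.
   */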

  @Test
  public void testReadPartitionsByMinuteWithLeadtime() throws IOException, DataRecordException {

    DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource();

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    state.setProp("date.partitioned.source.min.watermark.value",
        DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1)));
    state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    state.setProp("date.partitioned.source.partition.prefix", PREFIX);
    state.setProp("date.partitioned.source.partition.suffix", SUFFIX);
    state.setProp("date.partitioned.source.partition.lead_time.size", "3");
    state.setProp("date.partitioned.source.partition.lead_time.granularity", "HOUR");

    /*
     * Since the lead time is 3 hours, only the first WorkUnit (whose records are 6 hours old;
     * the rest are 2 hours old) should get picked up
     */
    List<WorkUnit> workunits = source.getWorkunits(state);

    Assert.assertEquals(workunits.size(), 1);
    verifyWorkUnits(workunits, workunits.size());
  }

  @Test
  public void testWorksNoPrefix() throws IOException, DataRecordException {

    DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource();

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY,
        OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY + Path.SEPARATOR + PREFIX);
    state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    state.setProp("date.partitioned.source.min.watermark.value",
        DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1)));
    state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    state.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    // Read data partitioned by minutes, i.e. each workunit is assigned records under the same
    // yyyy/MM/dd/HH_mm directory
    List<WorkUnit> workunits = source.getWorkunits(state);

    Assert.assertEquals(workunits.size(), 4);
    verifyWorkUnits(workunits);
  }

  private void verifyWorkUnits(List<WorkUnit> workunits) throws IOException, DataRecordException {
    verifyWorkUnits(workunits, RECORD_SIZE);
  }

  private void verifyWorkUnits(List<WorkUnit> workunits, int expectedSize) throws DataRecordException, IOException {
    for (int i = 0; i < expectedSize; i++) {
      WorkUnit workUnit = ((MultiWorkUnit) workunits.get(i)).getWorkUnits().get(0);
      WorkUnitState wuState = new WorkUnitState(workunits.get(i), new State());
      wuState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
      wuState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL,
          workUnit.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
      try (DatePartitionedAvroFileExtractor extractor = new DatePartitionedAvroFileExtractor(wuState)) {
        GenericRecord record = extractor.readRecord(null);
        Assert.assertEquals(recordTimestamps[i], record.get(PARTITION_COLUMN_NAME));
        Assert.assertEquals(recordTimestamps[i],
            workUnit.getPropAsLong(ConfigurationKeys.WORK_UNIT_DATE_PARTITION_KEY));
      }
    }
  }

  @AfterClass
  public void tearDown() throws IOException {
    this.writer.close();
    FileUtils.deleteDirectory(new File(TEST_ROOT_DIR));
  }
}