Java tutorial

This tutorial walks through OrcCompactionTaskTest, an end-to-end TestNG test of Apache Gobblin's MapReduce-based ORC compaction. The test writes two small ORC input files into a "minutely" directory (including one duplicated record), runs the compaction flow through EmbeddedGobblin, and then verifies that a single deduplicated output file appears in the corresponding "hourly" directory.
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.compaction.mapreduce;

import com.google.common.collect.ImmutableList;
import com.google.common.io.Files;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.FilenameUtils;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.runtime.api.JobExecutionResult;
import org.apache.gobblin.runtime.embedded.EmbeddedGobblin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcMapreduceRecordReader;
import org.apache.orc.mapreduce.OrcMapreduceRecordWriter;
import org.testng.Assert;
import org.testng.annotations.Test;

import static org.apache.gobblin.compaction.mapreduce.AvroCompactionTaskTest.*;
import static org.apache.gobblin.compaction.mapreduce.CompactorOutputCommitter.*;


public class OrcCompactionTaskTest {

  @Test
  public void basicTest() throws Exception {
    File basePath = Files.createTempDir();
    basePath.deleteOnExit();

    // Input lives under a "minutely" run directory; the compacted output is
    // expected under the corresponding "hourly" directory.
    String minutelyPath = "Identity/MemberAccount/minutely/2017/04/03/10/20_30/run_2017-04-03-10-20";
    String hourlyPath = "Identity/MemberAccount/hourly/2017/04/03/10/";
    File jobDir = new File(basePath, minutelyPath);
    Assert.assertTrue(jobDir.mkdirs());

    // Write some ORC files for compaction here.
    TypeDescription schema = TypeDescription.fromString("struct<i:int,j:int>");

    // orcStruct_0 and orcStruct_1 carry identical values, so compaction
    // should deduplicate them down to a single output record.
    OrcStruct orcStruct_0 = (OrcStruct) OrcStruct.createValue(schema);
    orcStruct_0.setFieldValue("i", new IntWritable(1));
    orcStruct_0.setFieldValue("j", new IntWritable(2));

    OrcStruct orcStruct_1 = (OrcStruct) OrcStruct.createValue(schema);
    orcStruct_1.setFieldValue("i", new IntWritable(1));
    orcStruct_1.setFieldValue("j", new IntWritable(2));

    OrcStruct orcStruct_2 = (OrcStruct) OrcStruct.createValue(schema);
    orcStruct_2.setFieldValue("i", new IntWritable(2));
    orcStruct_2.setFieldValue("j", new IntWritable(3));

    OrcStruct orcStruct_3 = (OrcStruct) OrcStruct.createValue(schema);
    orcStruct_3.setFieldValue("i", new IntWritable(4));
    orcStruct_3.setFieldValue("j", new IntWritable(5));

    File file_0 = new File(jobDir, "file_0");
    File file_1 = new File(jobDir, "file_1");
    writeOrcRecordsInFile(new Path(file_0.getAbsolutePath()), schema, ImmutableList.of(orcStruct_0, orcStruct_2));
    writeOrcRecordsInFile(new Path(file_1.getAbsolutePath()), schema, ImmutableList.of(orcStruct_1, orcStruct_3));

    // Verify execution.
    // Overwrite the job configurator factory key.
    String extensionFileName = "orcavro";
    EmbeddedGobblin embeddedGobblin = createEmbeddedGobblin("basic", basePath.getAbsolutePath().toString())
        .setConfiguration(CompactionJobConfigurator.COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS_KEY,
            TestCompactionOrcJobConfigurator.Factory.class.getName())
        .setConfiguration(COMPACTION_OUTPUT_EXTENSION, extensionFileName);
    JobExecutionResult execution = embeddedGobblin.run();
    Assert.assertTrue(execution.isSuccessful());

    // Result verification: exactly one output file with the configured extension.
    File outputDir = new File(basePath, hourlyPath);
    FileSystem fs = FileSystem.getLocal(new Configuration());
    List<FileStatus> statuses = new ArrayList<>();
    for (FileStatus status : fs.listStatus(new Path(outputDir.getAbsolutePath()), new PathFilter() {
      @Override
      public boolean accept(Path path) {
        return FilenameUtils.isExtension(path.getName(), extensionFileName);
      }
    })) {
      statuses.add(status);
    }

    Assert.assertEquals(statuses.size(), 1);
    List<OrcStruct> result = readOrcFile(statuses.get(0).getPath());
    // The two duplicate input records collapse into one, leaving three rows.
    Assert.assertEquals(result.size(), 3);
    Assert.assertEquals(result.get(0).getFieldValue("i"), new IntWritable(1));
    Assert.assertEquals(result.get(0).getFieldValue("j"), new IntWritable(2));
    Assert.assertEquals(result.get(1).getFieldValue("i"), new IntWritable(2));
    Assert.assertEquals(result.get(1).getFieldValue("j"), new IntWritable(3));
    Assert.assertEquals(result.get(2).getFieldValue("i"), new IntWritable(4));
    Assert.assertEquals(result.get(2).getFieldValue("j"), new IntWritable(5));
  }

  /**
   * Read an output ORC compacted file into memory.
   */
  public List<OrcStruct> readOrcFile(Path orcFilePath)
      throws IOException, InterruptedException {
    ReaderImpl orcReader = new ReaderImpl(orcFilePath, new OrcFile.ReaderOptions(new Configuration()));
    Reader.Options options = new Reader.Options().schema(orcReader.getSchema());
    OrcMapreduceRecordReader recordReader = new OrcMapreduceRecordReader(orcReader, options);
    List<OrcStruct> result = new ArrayList<>();

    // The record reader reuses its value object, so deep-copy each record
    // before keeping a reference to it.
    while (recordReader.nextKeyValue()) {
      result.add(copyIntOrcStruct((OrcStruct) recordReader.getCurrentValue()));
    }
    return result;
  }

  private OrcStruct copyIntOrcStruct(OrcStruct record) {
    OrcStruct result = new OrcStruct(record.getSchema());
    for (int i = 0; i < record.getNumFields(); i++) {
      IntWritable newCopy = new IntWritable(((IntWritable) record.getFieldValue(i)).get());
      result.setFieldValue(i, newCopy);
    }
    return result;
  }

  /**
   * Write a list of ORC records into a file at the given path.
   */
  public void writeOrcRecordsInFile(Path path, TypeDescription schema, List<OrcStruct> orcStructs)
      throws Exception {
    Configuration configuration = new Configuration();
    OrcFile.WriterOptions options = OrcFile.writerOptions(configuration).setSchema(schema);

    Writer writer = OrcFile.createWriter(path, options);
    OrcMapreduceRecordWriter recordWriter = new OrcMapreduceRecordWriter(writer);
    for (OrcStruct orcRecord : orcStructs) {
      recordWriter.write(NullWritable.get(), orcRecord);
    }
    recordWriter.close(new TaskAttemptContextImpl(configuration, new TaskAttemptID()));
  }

  /**
   * A test configurator that pins the compaction job to a single reducer,
   * so the compacted output lands in exactly one file.
   */
  private static class TestCompactionOrcJobConfigurator extends CompactionOrcJobConfigurator {
    public static class Factory implements CompactionJobConfigurator.ConfiguratorFactory {
      @Override
      public TestCompactionOrcJobConfigurator createConfigurator(State state) throws IOException {
        return new TestCompactionOrcJobConfigurator(state);
      }
    }

    @Override
    protected void setNumberOfReducers(Job job) throws IOException {
      job.setNumReduceTasks(1);
    }

    public TestCompactionOrcJobConfigurator(State state) throws IOException {
      super(state);
    }
  }
}
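To experiment with the ORC read/write helpers outside of the Gobblin test harness, here is a minimal, self-contained round-trip sketch using the same org.apache.orc mapreduce bindings as the test above. It is an illustration, not part of the test: the class name OrcRoundTripExample and the temporary file location are placeholders, and it goes through the public OrcFile.createReader factory rather than the ReaderImpl constructor the test calls directly.

import java.io.File;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcMapreduceRecordReader;
import org.apache.orc.mapreduce.OrcMapreduceRecordWriter;

public class OrcRoundTripExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    TypeDescription schema = TypeDescription.fromString("struct<i:int,j:int>");

    // Placeholder location: a fresh temp path, deleted so the ORC writer can create it.
    File tmp = File.createTempFile("orc-round-trip", ".orc");
    tmp.delete();
    tmp.deleteOnExit();
    Path path = new Path(tmp.getAbsolutePath());

    // Write one record through the same mapreduce record-writer wrapper
    // that writeOrcRecordsInFile uses in the test.
    Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf).setSchema(schema));
    OrcMapreduceRecordWriter recordWriter = new OrcMapreduceRecordWriter(writer);
    OrcStruct record = (OrcStruct) OrcStruct.createValue(schema);
    record.setFieldValue("i", new IntWritable(1));
    record.setFieldValue("j", new IntWritable(2));
    recordWriter.write(NullWritable.get(), record);
    recordWriter.close(new TaskAttemptContextImpl(conf, new TaskAttemptID()));

    // Read the record back. The reader reuses its value object, so consume
    // (here: print) each record inside the loop rather than storing references.
    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    OrcMapreduceRecordReader recordReader =
        new OrcMapreduceRecordReader(reader, new Reader.Options().schema(reader.getSchema()));
    while (recordReader.nextKeyValue()) {
      System.out.println(recordReader.getCurrentValue());
    }
    recordReader.close();
  }
}

Because OrcMapreduceRecordReader hands back the same OrcStruct instance on every call to nextKeyValue(), any record you want to retain after the loop must be deep-copied first, which is exactly why the test's copyIntOrcStruct helper exists.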