Java tutorial: InputFormatTestUtil, test helpers for Hoodie's Hadoop InputFormat (package com.uber.hoodie.hadoop)
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.hadoop;

import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.common.util.SchemaTestUtil;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.UUID;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.avro.AvroParquetWriter;
import org.junit.rules.TemporaryFolder;

/**
 * Helpers for laying out Hoodie datasets, commits, and parquet data files on the local
 * filesystem so that the Hadoop InputFormat tests can run against them.
 */
public class InputFormatTestUtil {

  private static String TEST_WRITE_TOKEN = "1-0-1";

  public static File prepareDataset(TemporaryFolder basePath, int numberOfFiles,
      String commitNumber) throws IOException {
    basePath.create();
    HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.getRoot().toString());
    // Create empty data files named after the given commit, under a single day partition.
    File partitionPath = basePath.newFolder("2016", "05", "01");
    for (int i = 0; i < numberOfFiles; i++) {
      File dataFile = new File(partitionPath,
          FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, "fileid" + i));
      dataFile.createNewFile();
    }
    return partitionPath;
  }

  public static void simulateUpdates(File directory, final String originalCommit,
      int numberOfFilesUpdated, String newCommit, boolean randomize) throws IOException {
    // Pick the data files written by originalCommit and rewrite a subset of them under newCommit.
    List<File> dataFiles = Arrays.asList(directory.listFiles(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        String commitTs = FSUtils.getCommitTime(name);
        return originalCommit.equals(commitTs);
      }
    }));
    if (randomize) {
      Collections.shuffle(dataFiles);
    }
    List<File> toUpdateList = dataFiles.subList(0, Math.min(numberOfFilesUpdated, dataFiles.size()));
    for (File file : toUpdateList) {
      String fileId = FSUtils.getFileId(file.getName());
      File dataFile = new File(directory, FSUtils.makeDataFileName(newCommit, TEST_WRITE_TOKEN, fileId));
      dataFile.createNewFile();
    }
  }

  public static void commit(TemporaryFolder basePath, String commitNumber) throws IOException {
    // create the commit
    new File(basePath.getRoot().toString() + "/.hoodie/", commitNumber + ".commit").createNewFile();
  }

  public static void deltaCommit(TemporaryFolder basePath, String commitNumber) throws IOException {
    // create the delta commit
    new File(basePath.getRoot().toString() + "/.hoodie/", commitNumber + ".deltacommit").createNewFile();
  }

  public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) {
    // Switch the table to incremental scan mode, set the start commit, and bound the number of
    // commits to pull, using the per-table property names defined in HoodieHiveUtil.
    String modePropertyName =
        String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE);

    String startCommitTimestampName =
        String.format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.set(startCommitTimestampName, startCommit);

    String maxCommitPulls =
        String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.setInt(maxCommitPulls, numberOfCommitsToPull);
  }

  public static Schema readSchema(String location) throws IOException {
    return new Schema.Parser().parse(InputFormatTestUtil.class.getResourceAsStream(location));
  }

  public static File prepareParquetDataset(TemporaryFolder basePath, Schema schema, int numberOfFiles,
      int numberOfRecords, String commitNumber) throws IOException {
    basePath.create();
    HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.getRoot().toString());
    File partitionPath = basePath.newFolder("2016", "05", "01");
    createData(schema, partitionPath, numberOfFiles, numberOfRecords, commitNumber);
    return partitionPath;
  }

  public static File prepareSimpleParquetDataset(TemporaryFolder basePath, Schema schema,
      int numberOfFiles, int numberOfRecords, String commitNumber) throws Exception {
    basePath.create();
    HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.getRoot().toString());
    File partitionPath = basePath.newFolder("2016", "05", "01");
    createSimpleData(schema, partitionPath, numberOfFiles, numberOfRecords, commitNumber);
    return partitionPath;
  }

  public static File prepareNonPartitionedParquetDataset(TemporaryFolder baseDir, Schema schema,
      int numberOfFiles, int numberOfRecords, String commitNumber) throws IOException {
    baseDir.create();
    HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), baseDir.getRoot().toString());
    // Non-partitioned layout: data files sit directly under the table base path.
    File basePath = baseDir.getRoot();
    createData(schema, basePath, numberOfFiles, numberOfRecords, commitNumber);
    return basePath;
  }

  private static void createData(Schema schema, File partitionPath, int numberOfFiles,
      int numberOfRecords, String commitNumber) throws IOException {
    AvroParquetWriter parquetWriter;
    for (int i = 0; i < numberOfFiles; i++) {
      String fileId = FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, "fileid" + i);
      File dataFile = new File(partitionPath, fileId);
      parquetWriter = new AvroParquetWriter(new Path(dataFile.getAbsolutePath()), schema);
      try {
        for (GenericRecord record : generateAvroRecords(schema, numberOfRecords, commitNumber, fileId)) {
          parquetWriter.write(record);
        }
      } finally {
        parquetWriter.close();
      }
    }
  }

  private static void createSimpleData(Schema schema, File partitionPath, int numberOfFiles,
      int numberOfRecords, String commitNumber) throws Exception {
    AvroParquetWriter parquetWriter;
    for (int i = 0; i < numberOfFiles; i++) {
      String fileId = FSUtils.makeDataFileName(commitNumber, "1", "fileid" + i);
      File dataFile = new File(partitionPath, fileId);
      parquetWriter = new AvroParquetWriter(new Path(dataFile.getAbsolutePath()), schema);
      try {
        List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, numberOfRecords);
        String commitTime = HoodieActiveTimeline.createNewCommitTime();
        // Rewrite each test record against a schema that carries the Hoodie metadata fields,
        // then stamp key, partition path, and commit time before writing it out.
        Schema hoodieFieldsSchema = HoodieAvroUtils.addMetadataFields(schema);
        for (IndexedRecord record : records) {
          GenericRecord p = HoodieAvroUtils.rewriteRecord((GenericRecord) record, hoodieFieldsSchema);
          p.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, UUID.randomUUID().toString());
          p.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, "0000/00/00");
          p.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitNumber);
          parquetWriter.write(p);
        }
      } finally {
        parquetWriter.close();
      }
    }
  }

  private static Iterable<? extends GenericRecord> generateAvroRecords(Schema schema,
      int numberOfRecords, String commitTime, String fileId) throws IOException {
    List<GenericRecord> records = new ArrayList<>(numberOfRecords);
    for (int i = 0; i < numberOfRecords; i++) {
      records.add(SchemaTestUtil.generateAvroRecordFromJson(schema, i, commitTime, fileId));
    }
    return records;
  }

  public static void simulateParquetUpdates(File directory, Schema schema, String originalCommit,
      int totalNumberOfRecords, int numberOfRecordsToUpdate, String newCommit) throws IOException {
    // Take the first parquet file in the directory and rewrite it under newCommit, updating the
    // commit metadata on the first numberOfRecordsToUpdate records.
    File fileToUpdate = directory.listFiles(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        return name.endsWith("parquet");
      }
    })[0];
    String fileId = FSUtils.getFileId(fileToUpdate.getName());
    File dataFile = new File(directory, FSUtils.makeDataFileName(newCommit, TEST_WRITE_TOKEN, fileId));
    AvroParquetWriter parquetWriter = new AvroParquetWriter(new Path(dataFile.getAbsolutePath()), schema);
    try {
      for (GenericRecord record : generateAvroRecords(schema, totalNumberOfRecords, originalCommit, fileId)) {
        if (numberOfRecordsToUpdate > 0) {
          // update this record
          record.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, newCommit);
          String oldSeqNo = (String) record.get(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD);
          record.put(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, oldSeqNo.replace(originalCommit, newCommit));
          numberOfRecordsToUpdate--;
        }
        parquetWriter.write(record);
      }
    } finally {
      parquetWriter.close();
    }
  }
}
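For context, the sketch below shows one way these helpers could be wired into a JUnit 4 test. It is a minimal sketch under stated assumptions, not code from the project: the class name ExampleIncrementalSetupTest, the test method name, and the commit strings "100"/"200" are illustrative, and the final assertion against the input format under test is left as a comment.

package com.uber.hoodie.hadoop;

import java.io.File;
import org.apache.hadoop.mapred.JobConf;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

public class ExampleIncrementalSetupTest { // hypothetical test class, for illustration only

  @Rule
  public TemporaryFolder basePath = new TemporaryFolder(); // prepareDataset() calls create() on it

  @Test
  public void pullsOnlyCommitsAfterStart() throws Exception {
    // Lay out 10 empty data files as commit "100", then rewrite 5 of them as commit "200".
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");
    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", true);
    InputFormatTestUtil.commit(basePath, "200");

    // Configure incremental mode: start after commit "100" and pull at most 1 newer commit.
    JobConf jobConf = new JobConf();
    InputFormatTestUtil.setupIncremental(jobConf, "100", 1);

    // A real test would now point the Hoodie input format under test at basePath.getRoot()
    // with this jobConf and assert that only the files rewritten by commit "200" come back.
  }
}

The commit strings are arbitrary timestamps; the helpers only require that the same string is used for the data file names and the corresponding .commit file in the .hoodie folder.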