// Java tutorial
/* * Copyright 2016 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.hydrator.plugin.batch.source; import co.cask.cdap.api.artifact.ArtifactVersion; import co.cask.cdap.api.data.format.StructuredRecord; import co.cask.cdap.api.dataset.lib.KeyValueTable; import co.cask.cdap.api.dataset.table.Scanner; import co.cask.cdap.api.dataset.table.Table; import co.cask.cdap.etl.api.batch.BatchSource; import co.cask.cdap.etl.batch.ETLBatchApplication; import co.cask.cdap.etl.batch.mapreduce.ETLMapReduce; import co.cask.cdap.etl.mock.batch.MockSink; import co.cask.cdap.etl.mock.test.HydratorTestBase; import co.cask.cdap.etl.proto.v2.ETLBatchConfig; import co.cask.cdap.etl.proto.v2.ETLPlugin; import co.cask.cdap.etl.proto.v2.ETLStage; import co.cask.cdap.proto.artifact.AppRequest; import co.cask.cdap.proto.artifact.ArtifactSummary; import co.cask.cdap.proto.id.ApplicationId; import co.cask.cdap.proto.id.ArtifactId; import co.cask.cdap.proto.id.NamespaceId; import co.cask.cdap.test.ApplicationManager; import co.cask.cdap.test.DataSetManager; import co.cask.cdap.test.MapReduceManager; import co.cask.cdap.test.TestConfiguration; import co.cask.hydrator.common.Constants; import com.google.common.collect.ImmutableMap; import org.apache.commons.io.FileUtils; import org.junit.AfterClass; import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import 
java.io.File; import java.net.URL; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; /** * Unit tests for {@link ExcelInputReader} class. */ public class ExcelInputReaderTest extends HydratorTestBase { private static final ArtifactVersion CURRENT_VERSION = new ArtifactVersion("3.4.0-SNAPSHOT"); private static final ArtifactId BATCH_APP_ARTIFACT_ID = NamespaceId.DEFAULT.artifact("etlbatch", CURRENT_VERSION.getVersion()); private static final ArtifactSummary ETLBATCH_ARTIFACT = new ArtifactSummary( BATCH_APP_ARTIFACT_ID.getArtifact(), BATCH_APP_ARTIFACT_ID.getVersion()); @ClassRule public static final TestConfiguration CONFIG = new TestConfiguration("explore.enabled", false); @ClassRule public static TemporaryFolder temporaryFolder = new TemporaryFolder(); private static File sourceFolder; private static String sourceFolderUri; private static String excelTestFileOne = "/civil_test_data_one.xlsx"; private static String excelTestFileTwo = "/civil_test_data_two.xlsx"; @BeforeClass public static void setupTest() throws Exception { setupBatchArtifacts(BATCH_APP_ARTIFACT_ID, ETLBatchApplication.class); // add artifact for batch sources and sinks addPluginArtifact(NamespaceId.DEFAULT.artifact("excelreader-plugins", "4.0.0"), BATCH_APP_ARTIFACT_ID, ExcelInputReader.class); sourceFolder = temporaryFolder.newFolder("ExcelInputReaderFolder"); sourceFolderUri = sourceFolder.toURI().toString(); } @AfterClass public static void tearDown() throws Exception { temporaryFolder.delete(); } @Before public void copyFiles() throws Exception { URL testFileUrl = this.getClass().getResource(excelTestFileOne); URL testTwofileUrl = this.getClass().getResource(excelTestFileTwo); FileUtils.copyFile(new File(testFileUrl.getFile()), new File(sourceFolder, excelTestFileOne)); FileUtils.copyFile(new File(testTwofileUrl.getFile()), new File(sourceFolder, excelTestFileTwo)); } @Test public void testExcelInputReader() throws Exception { 
Map<String, String> sourceProperties = new ImmutableMap.Builder<String, String>() .put(Constants.Reference.REFERENCE_NAME, "TestCase-testExcelInputReader") .put("filePath", sourceFolderUri).put("filePattern", ".*").put("sheetName", "Sheet1") .put("sheetNo", "-1").put("memoryTableName", "trackMemoryTable").put("reprocess", "false") .put("columnList", "").put("columnMapping", "").put("skipFirstRow", "false") .put("terminateIfEmptyRow", "false").put("rowsLimit", "").put("outputSchema", "A:string,B:string") .put("ifErrorRecord", "Ignore error and continue").put("errorDatasetName", "").build(); ETLStage source = new ETLStage("ExcelInputtest", new ETLPlugin("ExcelInputReader", BatchSource.PLUGIN_TYPE, sourceProperties, null)); String outputDatasetName = "output-testExcelInputReader"; ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputDatasetName)); ApplicationManager appManager = deployApp(source, sink, "ExcelInputReaderTests"); startMRFlow(appManager); DataSetManager<Table> outputManager = getDataset(outputDatasetName); List<StructuredRecord> output = MockSink.readOutput(outputManager); Map<String, String> nameIdMap = new HashMap<String, String>(); nameIdMap.put("john", "3.0"); nameIdMap.put("romy", "1.0"); nameIdMap.put("Paulo", "11.0"); nameIdMap.put("Ruskin", "10.0"); nameIdMap.put("Alan", "8.0"); nameIdMap.put("Bill", "13.0"); nameIdMap.put("Ada", "14.0"); nameIdMap.put("kelly", "9.0"); nameIdMap.put("name", "id"); Assert.assertEquals(nameIdMap.get(output.get(0).get("B")), output.get(0).get("A")); Assert.assertEquals(nameIdMap.get(output.get(1).get("B")), output.get(1).get("A")); Assert.assertEquals(nameIdMap.get(output.get(2).get("B")), output.get(2).get("A")); Assert.assertEquals(nameIdMap.get(output.get(3).get("B")), output.get(3).get("A")); Assert.assertEquals("Expected records", 9, output.size()); } @Test public void testWithReProcessTrue() throws Exception { Map<String, String> sourceProperties = new ImmutableMap.Builder<String, String>() 
.put(Constants.Reference.REFERENCE_NAME, "TestCase").put("filePath", sourceFolderUri) .put("filePattern", ".*").put("sheetName", "Sheet1").put("sheetNo", "-1") .put("memoryTableName", "trackMemoryTableWithReProcessedTrue").put("reprocess", "true") .put("columnList", "").put("columnMapping", "A:FirstColumn").put("skipFirstRow", "false") .put("terminateIfEmptyRow", "false").put("rowsLimit", "10").put("outputSchema", "A:string") .put("ifErrorRecord", "Ignore error and continue").put("errorDatasetName", "").build(); ETLStage source = new ETLStage("ExcelInputtest", new ETLPlugin("ExcelInputReader", BatchSource.PLUGIN_TYPE, sourceProperties, null)); String outputDatasetName = "output-batchsourcetest"; ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputDatasetName)); ApplicationManager appManager = deployApp(source, sink, "testWithReProcessedTrue"); DataSetManager<KeyValueTable> dataSetManager = getDataset("trackMemoryTableWithReProcessedTrue"); KeyValueTable keyValueTable = dataSetManager.get(); File testFile = new File(sourceFolder, excelTestFileTwo); keyValueTable.write(testFile.toURI().toString(), String.valueOf(System.currentTimeMillis())); dataSetManager.flush(); startMRFlow(appManager); DataSetManager<Table> outputManager = getDataset(outputDatasetName); List<StructuredRecord> output = MockSink.readOutput(outputManager); Assert.assertEquals("Expected records", 9, output.size()); Assert.assertNotNull(output.get(1).getSchema().getField("FirstColumn")); } @Test public void testWithReProcessedFalse() throws Exception { Map<String, String> sourceProperties = new ImmutableMap.Builder<String, String>() .put(Constants.Reference.REFERENCE_NAME, "TestCase").put("filePath", sourceFolderUri) .put("filePattern", ".*").put("sheetName", "Sheet1").put("sheetNo", "-1") .put("memoryTableName", "trackMemoryTableWithReProcessedFalse").put("reprocess", "false") .put("columnList", "").put("columnMapping", "").put("skipFirstRow", "false") .put("terminateIfEmptyRow", 
"false").put("rowsLimit", "10").put("outputSchema", "A:string,B:string") .put("ifErrorRecord", "Ignore error and continue").put("errorDatasetName", "").build(); ETLStage source = new ETLStage("ExcelInputtest", new ETLPlugin("ExcelInputReader", BatchSource.PLUGIN_TYPE, sourceProperties, null)); String outputDatasetName = "output-testWithReProcessedFalse"; ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputDatasetName)); ApplicationManager appManager = deployApp(source, sink, "testWithReProcessedTrue"); DataSetManager<KeyValueTable> dataSetManager = getDataset("trackMemoryTableWithReProcessedFalse"); KeyValueTable keyValueTable = dataSetManager.get(); File testFile = new File(sourceFolder, excelTestFileTwo); keyValueTable.write(testFile.toURI().toString(), String.valueOf(System.currentTimeMillis())); dataSetManager.flush(); startMRFlow(appManager); DataSetManager<Table> outputManager = getDataset(outputDatasetName); List<StructuredRecord> output = MockSink.readOutput(outputManager); Map<String, String> nameIdMap = new HashMap<String, String>(); nameIdMap.put("john", "3.0"); nameIdMap.put("romy", "1.0"); nameIdMap.put("name", "id"); Assert.assertEquals("Expected records", 3, output.size()); Assert.assertEquals(nameIdMap.get(output.get(0).get("B")), output.get(0).get("A")); Assert.assertEquals(nameIdMap.get(output.get(1).get("B")), output.get(1).get("A")); Assert.assertEquals(nameIdMap.get(output.get(2).get("B")), output.get(2).get("A")); } @Test public void testWithColumnsToBeExtracted() throws Exception { Map<String, String> sourceProperties = new ImmutableMap.Builder<String, String>() .put(Constants.Reference.REFERENCE_NAME, "TestCase").put("filePath", sourceFolderUri) .put("filePattern", ".*").put("sheetName", "Sheet1").put("sheetNo", "-1") .put("memoryTableName", "trackMemoryTableWithColumnsToBeExtracted").put("reprocess", "false") .put("columnList", "A,B").put("columnMapping", "A:FirstColumn").put("skipFirstRow", "true") .put("terminateIfEmptyRow", 
"false").put("rowsLimit", "").put("outputSchema", "") .put("ifErrorRecord", "Ignore error and continue").put("errorDatasetName", "").build(); ETLStage source = new ETLStage("ExcelInputtest", new ETLPlugin("ExcelInputReader", BatchSource.PLUGIN_TYPE, sourceProperties, null)); String outputDatasetName = "output-testWithColumnsToBeExtracted"; ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputDatasetName)); ApplicationManager appManager = deployApp(source, sink, "testWithColumnsToBeExtracted"); startMRFlow(appManager); DataSetManager<Table> outputManager = getDataset(outputDatasetName); List<StructuredRecord> output = MockSink.readOutput(outputManager); Assert.assertEquals("Expected records", 7, output.size()); Assert.assertNotNull(output.get(1).getSchema().getField("FirstColumn")); Assert.assertNotNull(output.get(1).getSchema().getFields().contains("B")); Map<String, String> nameIdMap = new HashMap<String, String>(); nameIdMap.put("john", "3.0"); nameIdMap.put("romy", "1.0"); nameIdMap.put("Paulo", "11.0"); nameIdMap.put("Ruskin", "10.0"); nameIdMap.put("Bill", "13.0"); nameIdMap.put("Ada", "14.0"); nameIdMap.put("kelly", "9.0"); Assert.assertEquals(nameIdMap.get(output.get(0).get("B")), output.get(0).get("FirstColumn")); Assert.assertEquals(nameIdMap.get(output.get(1).get("B")), output.get(1).get("FirstColumn")); Assert.assertEquals(nameIdMap.get(output.get(2).get("B")), output.get(2).get("FirstColumn")); } @Test public void testWithErrorRecord() throws Exception { Map<String, String> sourceProperties = new ImmutableMap.Builder<String, String>() .put(Constants.Reference.REFERENCE_NAME, "TestCase").put("filePath", sourceFolderUri) .put("filePattern", ".*").put("sheetName", "Sheet1").put("sheetNo", "-1") .put("memoryTableName", "trackMemoryTableWithOutputSchema").put("reprocess", "false") .put("columnList", "A,B,C").put("columnMapping", "A:FirstColumn").put("skipFirstRow", "false") .put("terminateIfEmptyRow", "false").put("rowsLimit", 
"").put("outputSchema", "A:double,C:double") .put("ifErrorRecord", "Write to error dataset").put("errorDatasetName", "error-dataset-table") .build(); ETLStage source = new ETLStage("ExcelInputtest", new ETLPlugin("ExcelInputReader", BatchSource.PLUGIN_TYPE, sourceProperties, null)); String outputDatasetName = "output-testWithOutputSchema"; ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputDatasetName)); ApplicationManager appManager = deployApp(source, sink, "testWithOutputSchema"); startMRFlow(appManager); DataSetManager<Table> outputManager = getDataset(outputDatasetName); List<StructuredRecord> output = MockSink.readOutput(outputManager); Assert.assertEquals("Expected records", 6, output.size()); DataSetManager<Table> errorTableManager = getDataset("error-dataset-table"); Table errorTable = errorTableManager.get(); Scanner scanner = errorTable.scan(null, null); int counter = 0; while (scanner.next() != null) { counter++; } Assert.assertEquals("Expected error records", 3, counter); } @Test(expected = IllegalStateException.class) public void testWithNoColumnListAndNoOutputSchema() throws Exception { Map<String, String> sourceProperties = new ImmutableMap.Builder<String, String>() .put(Constants.Reference.REFERENCE_NAME, "TestCase").put("filePath", sourceFolderUri) .put("filePattern", ".*").put("sheetName", "Sheet1").put("sheetNo", "-1") .put("memoryTableName", "trackMemoryTableWithNoColumnListAndOutputSchema").put("reprocess", "false") .put("columnList", "").put("columnMapping", "").put("skipFirstRow", "false") .put("terminateIfEmptyRow", "false").put("rowsLimit", "").put("outputSchema", "") .put("ifErrorRecord", "Ignore error and continue").put("errorDatasetName", "").build(); ETLStage source = new ETLStage("ExcelInputtest", new ETLPlugin("ExcelInputReader", BatchSource.PLUGIN_TYPE, sourceProperties, null)); String outputDatasetName = "output-testWithNoColumnListAndOutputSchema"; ETLStage sink = new ETLStage("sink", 
MockSink.getPlugin(outputDatasetName)); deployApp(source, sink, "testWithNoColumnListAndOutputSchema"); Assert.fail(); } @Test public void testWithTerminateIfEmptyRow() throws Exception { Map<String, String> sourceProperties = new ImmutableMap.Builder<String, String>() .put(Constants.Reference.REFERENCE_NAME, "TestCase").put("filePath", sourceFolderUri) .put("filePattern", ".*").put("sheetName", "Sheet1").put("sheetNo", "-1") .put("memoryTableName", "trackMemoryTableWithTerminateIfEmptyRow").put("reprocess", "false") .put("columnList", "").put("columnMapping", "").put("skipFirstRow", "false") .put("terminateIfEmptyRow", "true").put("rowsLimit", "") .put("outputSchema", "A:string,B:string,C:string,D:string,Romy:string") .put("ifErrorRecord", "Ignore error and continue").put("errorDatasetName", "").build(); ETLStage source = new ETLStage("ExcelInputtest", new ETLPlugin("ExcelInputReader", BatchSource.PLUGIN_TYPE, sourceProperties, null)); String outputDatasetName = "output-testWithTerminateIfEmptyRow"; ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputDatasetName)); ApplicationManager appManager = deployApp(source, sink, "testWithTerminateIfEmptyRow"); MapReduceManager mrManager = startMRFlow(appManager); Assert.assertEquals("Expected :", "FAILED", mrManager.getHistory().get(0).getStatus().name()); } @Test(expected = IllegalStateException.class) public void testWithNoErrorDataset() throws Exception { Map<String, String> sourceProperties = new ImmutableMap.Builder<String, String>() .put(Constants.Reference.REFERENCE_NAME, "TestCase").put("filePath", sourceFolderUri) .put("filePattern", ".*").put("sheetName", "").put("sheetNo", "1") .put("memoryTableName", "trackMemoryTableWithSkipFirstRow").put("reprocess", "false") .put("columnList", "A").put("columnMapping", "").put("skipFirstRow", "false") .put("terminateIfEmptyRow", "false").put("rowsLimit", "").put("outputSchema", "") .put("ifErrorRecord", "Write to error dataset").put("errorDatasetName", "").build(); 
ETLStage source = new ETLStage("ExcelInputtest", new ETLPlugin("ExcelInputReader", BatchSource.PLUGIN_TYPE, sourceProperties, null)); String outputDatasetName = "output-testWithSkipFirstRow"; ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputDatasetName)); deployApp(source, sink, "testWithSkipFirstRow"); Assert.fail(); } private ApplicationManager deployApp(ETLStage source, ETLStage sink, String appName) throws Exception { ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *").addStage(source).addStage(sink) .addConnection(source.getName(), sink.getName()).build(); AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(ETLBATCH_ARTIFACT, etlConfig); ApplicationId appId = NamespaceId.DEFAULT.app(appName); return deployApplication(appId.toId(), appRequest); } private MapReduceManager startMRFlow(ApplicationManager appManager) throws Exception { MapReduceManager mrManager = appManager.getMapReduceManager(ETLMapReduce.NAME); mrManager.start(); mrManager.waitForFinish(5, TimeUnit.MINUTES); return mrManager; } }