Java tutorial
/* * Copyright 2013 Cloudera Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.cloudera.cdk.data.filesystem; import com.cloudera.cdk.data.DatasetDescriptor; import com.cloudera.cdk.data.DatasetReader; import com.cloudera.cdk.data.TestDatasetReaders; import com.cloudera.cdk.data.TestHelpers; import org.apache.avro.AvroRuntimeException; import org.apache.avro.Schema; import org.apache.avro.SchemaBuilder; import org.apache.avro.generic.GenericData; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; import java.io.IOException; public class TestCSVFileReader extends TestDatasetReaders<GenericData.Record> { /* * OpenCSV notes: * - An empty unquoted field is passed as an empty string */ public static final String CSV_CONTENT = ("str,34,2.11,false\r\n" + "\"str,2\",,4,true\n" + "str3,\"\",null"); public static final String VALIDATOR_CSV_CONTENT = "id,string,even\n" + "0,a,true\n" + "1,b\n" + "2,c,true\n"; public static final String TSV_CONTENT = ("string\tinteger\tfloat\tbool\r" + "str\t34\t2.11\tfalse\r\n" + "\"str\t2\"\t\t4\ttrue\n" + "str3\t\"\"\tnull"); public static FileSystem localfs = null; public static Path csvFile = null; public static Path validatorFile = null; public static Path tsvFile = null; public static Schema STRINGS = SchemaBuilder.record("Strings").fields().name("string1").type().stringType() .noDefault().name("string2").type().stringType().noDefault().name("string3").type().stringType() .noDefault().name("string4").type().stringType().stringDefault("missing value").endRecord(); public static final Schema VALIDATOR_SCHEMA = SchemaBuilder.record("Validator").fields().name("id").type() .intType().noDefault().name("string").type().stringType().noDefault().name("even").type().booleanType() .booleanDefault(false).endRecord(); public static Schema BEAN_SCHEMA = SchemaBuilder.record(TestBean.class.getName()).fields().name("myString") .type().stringType().noDefault().name("myInt").type().intType().intDefault(0).name("myFloat").type() .floatType().noDefault().name("myBool").type().booleanType().booleanDefault(false).endRecord(); public static Schema SCHEMA = SchemaBuilder.record("Normal").fields().name("myString").type().stringType() .noDefault().name("myInt").type().intType().intDefault(0).name("myFloat").type().floatType().noDefault() .name("myBool").type().booleanType().booleanDefault(false).endRecord(); @BeforeClass public static void createCSVFiles() throws IOException { localfs = FileSystem.getLocal(new Configuration()); csvFile = new Path("target/temp.csv"); tsvFile = new Path("target/temp.tsv"); validatorFile = new Path("target/validator.csv"); FSDataOutputStream out = localfs.create(csvFile, true); out.writeBytes(CSV_CONTENT); out.close(); out = localfs.create(validatorFile, true); out.writeBytes(VALIDATOR_CSV_CONTENT); out.close(); out = localfs.create(tsvFile, true); out.writeBytes(TSV_CONTENT); out.close(); } @Override public DatasetReader<GenericData.Record> newReader() throws IOException { final DatasetDescriptor desc = new DatasetDescriptor.Builder().property("cdk.csv.lines-to-skip", "1") .schema(VALIDATOR_SCHEMA).build(); return new CSVFileReader<GenericData.Record>(localfs, validatorFile, desc); } @Override public int getTotalRecords() { return 3; } @Override public DatasetTestUtilities.RecordValidator<GenericData.Record> getValidator() { return new DatasetTestUtilities.RecordValidator<GenericData.Record>() { private static final String chars = "abcdef"; @Override public void validate(GenericData.Record record, int recordNum) { Assert.assertEquals(recordNum, record.get("id")); Assert.assertEquals(Character.toString(chars.charAt(recordNum)), record.get("string")); Assert.assertEquals((recordNum % 2) == 0, record.get("even")); } }; } @Test(expected = IllegalArgumentException.class) public void testRejectsNonRecordSchemas() { final DatasetDescriptor desc = new DatasetDescriptor.Builder() .schema(SchemaBuilder.array().items().stringType()).build(); new CSVFileReader(localfs, csvFile, desc); } @Test public void testStringSchema() { final DatasetDescriptor desc = new DatasetDescriptor.Builder().schema(STRINGS).build(); final CSVFileReader<GenericData.Record> reader = new CSVFileReader<GenericData.Record>(localfs, csvFile, desc); reader.open(); Assert.assertTrue(reader.hasNext()); GenericData.Record rec = reader.next(); Assert.assertEquals("str", rec.get(0)); Assert.assertEquals("34", rec.get(1)); Assert.assertEquals("2.11", rec.get(2)); Assert.assertEquals("false", rec.get(3)); Assert.assertTrue(reader.hasNext()); rec = reader.next(); Assert.assertEquals("str,2", rec.get(0)); Assert.assertEquals("", rec.get(1)); Assert.assertEquals("4", rec.get(2)); Assert.assertEquals("true", rec.get(3)); Assert.assertTrue(reader.hasNext()); rec = reader.next(); Assert.assertEquals("str3", rec.get(0)); Assert.assertEquals("", rec.get(1)); Assert.assertEquals("null", rec.get(2)); Assert.assertEquals("missing value", rec.get(3)); Assert.assertFalse(reader.hasNext()); } @Test public void testTSV() { final DatasetDescriptor desc = new DatasetDescriptor.Builder().property("cdk.csv.delimiter", "\t") .property("cdk.csv.lines-to-skip", "1").schema(STRINGS).build(); final CSVFileReader<GenericData.Record> reader = new CSVFileReader<GenericData.Record>(localfs, tsvFile, desc); reader.open(); Assert.assertTrue(reader.hasNext()); GenericData.Record rec = reader.next(); Assert.assertEquals("str", rec.get(0)); Assert.assertEquals("34", rec.get(1)); Assert.assertEquals("2.11", rec.get(2)); Assert.assertEquals("false", rec.get(3)); Assert.assertTrue(reader.hasNext()); rec = reader.next(); Assert.assertEquals("str\t2", rec.get(0)); Assert.assertEquals("", rec.get(1)); Assert.assertEquals("4", rec.get(2)); Assert.assertEquals("true", rec.get(3)); Assert.assertTrue(reader.hasNext()); rec = reader.next(); Assert.assertEquals("str3", rec.get(0)); Assert.assertEquals("", rec.get(1)); Assert.assertEquals("null", rec.get(2)); Assert.assertEquals("missing value", rec.get(3)); Assert.assertFalse(reader.hasNext()); } @Test public void testNormalSchema() { final DatasetDescriptor desc = new DatasetDescriptor.Builder().schema(SCHEMA).build(); final CSVFileReader<GenericData.Record> reader = new CSVFileReader<GenericData.Record>(localfs, csvFile, desc); reader.open(); Assert.assertTrue(reader.hasNext()); GenericData.Record rec = reader.next(); Assert.assertEquals("str", rec.get(0)); Assert.assertEquals(34, rec.get(1)); Assert.assertEquals(2.11f, rec.get(2)); Assert.assertEquals(false, rec.get(3)); Assert.assertTrue(reader.hasNext()); rec = reader.next(); Assert.assertEquals("str,2", rec.get(0)); Assert.assertEquals(0, rec.get(1)); Assert.assertEquals(4.0f, rec.get(2)); Assert.assertEquals(true, rec.get(3)); Assert.assertTrue(reader.hasNext()); TestHelpers.assertThrows("Should complain about missing default", AvroRuntimeException.class, new Runnable() { @Override public void run() { reader.next(); } }); Assert.assertFalse(reader.hasNext()); } @Test public void testReflectedRecords() { final DatasetDescriptor desc = new DatasetDescriptor.Builder().schema(BEAN_SCHEMA).build(); final CSVFileReader<TestBean> reader = new CSVFileReader<TestBean>(localfs, csvFile, desc); reader.open(); Assert.assertTrue(reader.hasNext()); TestBean bean = reader.next(); Assert.assertEquals("str", bean.myStr); Assert.assertEquals((Integer) 34, bean.myInt); Assert.assertEquals((Float) 2.11f, bean.myFloat); Assert.assertEquals(false, bean.myBool); Assert.assertTrue(reader.hasNext()); bean = reader.next(); Assert.assertEquals("str,2", bean.myStr); Assert.assertEquals((Integer) 0, bean.myInt); Assert.assertEquals((Float) 4.0f, bean.myFloat); Assert.assertEquals(true, bean.myBool); Assert.assertTrue(reader.hasNext()); TestHelpers.assertThrows("Should complain about missing default", AvroRuntimeException.class, new Runnable() { @Override public void run() { reader.next(); } }); Assert.assertFalse(reader.hasNext()); } }