/*******************************************************************************
 * Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package hydrograph.engine.cascading.scheme.parquet;

import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.CompositeTap;
import cascading.tap.Tap;
import cascading.tap.TapException;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import hydrograph.engine.cascading.scheme.hive.parquet.HiveParquetSchemeHelper;
import hydrograph.engine.cascading.scheme.hive.parquet.WritableFactory;
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import parquet.cascading.SchemaIntersection;
import parquet.hadoop.Footer;
import parquet.hadoop.ParquetInputFormat;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.mapred.Container;
import parquet.hadoop.mapred.DeprecatedParquetInputFormat;
import parquet.hadoop.mapred.DeprecatedParquetOutputFormat;
import parquet.schema.MessageType;

import java.io.IOException;
import java.util.List;

/**
 * A Cascading {@link Scheme} that converts Parquet groups into Cascading
 * tuples. If you provide it with source fields, it selectively materializes
 * only the columns for those fields; the field names must match the column
 * names in the Parquet schema. If you do not provide source fields, or use
 * {@link Fields#ALL} or {@link Fields#UNKNOWN}, the source fields are derived
 * from the Parquet schema. Currently, only primitive types are supported.
 *
 * TODO: allow nested fields in the Parquet schema to be flattened to a
 * top-level field in the Cascading tuple.
 */
@SuppressWarnings("rawtypes")
public class ParquetTupleScheme extends Scheme<JobConf, RecordReader, OutputCollector, Object[], Object[]> {

	private static final long serialVersionUID = 0L;
	private String parquetSchema;

	public ParquetTupleScheme() {
		super();
	}

	/**
	 * Constructor to use when this scheme acts as a sink. The given fields are
	 * registered for both the reading and the writing step, and the Parquet
	 * {@link MessageType} schema is derived from them; that schema string is
	 * parsed when the Parquet file is created.
	 *
	 * @param sinkFields
	 *            the fields to write (also used as the source fields)
	 * @param columnTypes
	 *            the column type names corresponding to the sink fields, used
	 *            by {@link HiveParquetSchemeHelper} to build the Parquet
	 *            schema
	 */
	public ParquetTupleScheme(Fields sinkFields, String[] columnTypes) {
		super(sinkFields, sinkFields);
		parquetSchema = HiveParquetSchemeHelper.getParquetSchemeMessage(getSinkFields(), columnTypes);
	}
	@Override
	public void sourceConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap,
			JobConf jobConf) {
		jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
		ParquetInputFormat.setReadSupportClass(jobConf, DataWritableReadSupport.class);
		// TupleReadSupport.setRequestedFields(jobConf, getSourceFields());
	}

	@Override
	public Fields retrieveSourceFields(FlowProcess<? extends JobConf> flowProcess, Tap tap) {
		MessageType schema = readSchema(flowProcess, tap);
		SchemaIntersection intersection = new SchemaIntersection(schema, getSourceFields());
		setSourceFields(intersection.getSourceFields());
		return getSourceFields();
	}

	private MessageType readSchema(FlowProcess<? extends JobConf> flowProcess, Tap tap) {
		try {
			Hfs hfs;
			if (tap instanceof CompositeTap)
				hfs = (Hfs) ((CompositeTap) tap).getChildTaps().next();
			else
				hfs = (Hfs) tap;
			List<Footer> footers = getFooters(flowProcess, hfs);
			if (footers.isEmpty()) {
				throw new TapException("Could not read Parquet metadata at " + hfs.getPath());
			} else {
				return footers.get(0).getParquetMetadata().getFileMetaData().getSchema();
			}
		} catch (IOException e) {
			throw new TapException(e);
		}
	}

	@SuppressWarnings({ "static-access", "unchecked" })
	private List<Footer> getFooters(FlowProcess<? extends JobConf> flowProcess, Hfs hfs) throws IOException {
		JobConf jobConf = flowProcess.getConfigCopy();
		DeprecatedParquetInputFormat format = new DeprecatedParquetInputFormat();
		format.addInputPath(jobConf, hfs.getPath());
		return format.getFooters(jobConf);
	}

	@SuppressWarnings("unchecked")
	@Override
	public boolean source(FlowProcess<? extends JobConf> fp, SourceCall<Object[], RecordReader> sc)
			throws IOException {
		Container<ArrayWritable> value = (Container<ArrayWritable>) sc.getInput().createValue();
		boolean hasNext = sc.getInput().next(null, value);
		if (!hasNext) {
			return false;
		}

		// Skip null records. Once next() has returned true the container
		// itself is non-null (the original `value == null` check could never
		// fire), so test the contained writable instead.
		if (value.get() == null) {
			return true;
		}

		Tuple tuple = WritableFactory.getTuple(value.get());
		sc.getIncomingEntry().setTuple(tuple);
		return true;
	}

	@Override
	public void sinkConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap,
			JobConf jobConf) {
		jobConf.setOutputFormat(DeprecatedParquetOutputFormat.class);
		jobConf.set(ParquetTupleWriter.PARQUET_CASCADING_SCHEMA, parquetSchema);
		ParquetOutputFormat.setWriteSupportClass(jobConf, ParquetTupleWriter.class);
	}

	@SuppressWarnings("unchecked")
	@Override
	public void sink(FlowProcess<? extends JobConf> fp, SinkCall<Object[], OutputCollector> sink) throws IOException {
		TupleEntry tuple = sink.getOutgoingEntry();
		OutputCollector outputCollector = sink.getOutput();
		outputCollector.collect(null, tuple);
	}
}
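
/*
 * A minimal usage sketch, not part of the original class: it shows how this
 * scheme is typically wired into Hfs taps on both the read and the write
 * side, following the usual Cascading pattern. The field names, column type
 * strings, and paths below are hypothetical, and the exact type-name strings
 * accepted by HiveParquetSchemeHelper are an assumption here.
 */
@SuppressWarnings("rawtypes")
class ParquetTupleSchemeUsageSketch {

	/** Source tap: with the no-arg constructor, source fields are derived from the Parquet schema. */
	static Tap sourceTap() {
		return new Hfs(new ParquetTupleScheme(), "/tmp/input/parquet"); // hypothetical path
	}

	/** Sink tap: the Parquet schema is built from the sink fields and their column types. */
	static Tap sinkTap() {
		Fields fields = new Fields("id", "name");      // hypothetical columns
		String[] columnTypes = { "bigint", "string" }; // assumed Hive-style type names
		ParquetTupleScheme scheme = new ParquetTupleScheme(fields, columnTypes);
		return new Hfs(scheme, "/tmp/output/parquet"); // hypothetical path
	}
}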