// Java tutorial example: Apache ORC MapReduce record writer.
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.orc.mapred; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector; import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.io.BinaryComparable; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.ByteWritable; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import 
org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.ShortWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
 * A MapReduce {@link RecordWriter} that writes Writable values into an ORC
 * file by translating them column-by-column into a {@link VectorizedRowBatch}
 * and flushing the batch to the underlying ORC {@link Writer} whenever it
 * fills up (and once more on {@link #close(Reporter)}).
 *
 * @param <V> the Writable value type being written (an OrcKey/OrcValue
 *            wrapper is unwrapped in {@link #write(NullWritable, Writable)})
 */
public class OrcMapredRecordWriter<V extends Writable>
    implements RecordWriter<NullWritable, V> {
  // Underlying ORC file writer that receives completed row batches.
  private final Writer writer;
  // Reusable row batch; filled row-by-row and reset after each flush.
  private final VectorizedRowBatch batch;
  // Schema of the file being written (taken from the writer).
  private final TypeDescription schema;
  // True when the top-level schema is a STRUCT: each value's fields are then
  // spread across batch.cols; otherwise the whole value goes into cols[0].
  private final boolean isTopStruct;

  /**
   * Wraps an already-configured ORC writer; the batch is sized by the
   * writer's schema via createRowBatch().
   */
  public OrcMapredRecordWriter(Writer writer) {
    this.writer = writer;
    schema = writer.getSchema();
    this.batch = schema.createRowBatch();
    isTopStruct = schema.getCategory() == TypeDescription.Category.STRUCT;
  }

  // Stores an integer-family value (boolean/byte/short/int/long/date) into a
  // LongColumnVector slot.
  static void setLongValue(ColumnVector vector, int row, long value) {
    ((LongColumnVector) vector).vector[row] = value;
  }

  // Stores a float/double value into a DoubleColumnVector slot.
  static void setDoubleValue(ColumnVector vector, int row, double value) {
    ((DoubleColumnVector) vector).vector[row] = value;
  }

  // Copies the full byte range of a Text/BytesWritable into a
  // BytesColumnVector slot.
  static void setBinaryValue(ColumnVector vector, int row,
                             BinaryComparable value) {
    ((BytesColumnVector) vector).setVal(row, value.getBytes(), 0,
        value.getLength());
  }

  // Copies at most maxLength bytes (used for VARCHAR truncation and the
  // CHAR trim path).
  static void setBinaryValue(ColumnVector vector, int row,
                             BinaryComparable value, int maxLength) {
    ((BytesColumnVector) vector).setVal(row, value.getBytes(), 0,
        Math.min(maxLength, value.getLength()));
  }

  // Per-thread buffer of space bytes used to pad CHAR values; grown on
  // demand in setCharValue. ThreadLocal avoids sharing a mutable array
  // across writer threads.
  private static final ThreadLocal<byte[]> SPACE_BUFFER =
      new ThreadLocal<byte[]>() {
        @Override
        protected byte[] initialValue() {
          byte[] result = new byte[100];
          Arrays.fill(result, (byte) ' ');
          return result;
        }
      };

  /**
   * Writes a CHAR(length) value: longer input is trimmed to {@code length}
   * bytes, shorter input is right-padded with spaces to exactly
   * {@code length}.
   * NOTE(review): lengths here are byte counts, not codepoints — multi-byte
   * UTF-8 CHAR data is trimmed/padded by bytes; presumably matches ORC's
   * CHAR semantics — confirm against the writer's type handling.
   */
  static void setCharValue(BytesColumnVector vector, int row, Text value,
                           int length) {
    // we need to trim or pad the string with spaces to required length
    int actualLength = value.getLength();
    if (actualLength >= length) {
      setBinaryValue(vector, row, value, length);
    } else {
      byte[] spaces = SPACE_BUFFER.get();
      if (length - actualLength > spaces.length) {
        // Grow the shared pad buffer to cover this padding width.
        spaces = new byte[length - actualLength];
        Arrays.fill(spaces, (byte) ' ');
        SPACE_BUFFER.set(spaces);
      }
      vector.setConcat(row, value.getBytes(), 0, actualLength, spaces, 0,
          length - actualLength);
    }
  }

  // Writes each field of an OrcStruct into the corresponding child column.
  // Iterates value.getNumFields() fields; fields beyond that count (if the
  // schema had more children) are left untouched.
  static void setStructValue(TypeDescription schema, StructColumnVector vector,
                             int row, OrcStruct value) {
    List<TypeDescription> children = schema.getChildren();
    for (int c = 0; c < value.getNumFields(); ++c) {
      setColumn(children.get(c), vector.fields[c], row, value.getFieldValue(c));
    }
  }

  // Writes a union value: records the (unsigned) tag and stores the payload
  // into the child column selected by that tag.
  static void setUnionValue(TypeDescription schema, UnionColumnVector vector,
                            int row, OrcUnion value) {
    List<TypeDescription> children = schema.getChildren();
    int tag = value.getTag() & 0xff;
    vector.tags[row] = tag;
    setColumn(children.get(tag), vector.fields[tag], row, value.getObject());
  }

  // Writes a list: the row's offset/length point into the shared child
  // vector, whose element count (childCount) grows by the list size.
  static void setListValue(TypeDescription schema, ListColumnVector vector,
                           int row, OrcList value) {
    TypeDescription elemType = schema.getChildren().get(0);
    vector.offsets[row] = vector.childCount;
    vector.lengths[row] = value.size();
    vector.childCount += vector.lengths[row];
    // Second arg: keep existing child data when this is not the first list.
    vector.child.ensureSize(vector.childCount, vector.offsets[row] != 0);
    for (int e = 0; e < vector.lengths[row]; ++e) {
      setColumn(elemType, vector.child, (int) vector.offsets[row] + e,
          (Writable) value.get(e));
    }
  }

  // Writes a map analogously to setListValue, filling the parallel keys and
  // values child vectors entry by entry.
  static void setMapValue(TypeDescription schema, MapColumnVector vector,
                          int row, OrcMap<?, ?> value) {
    TypeDescription keyType = schema.getChildren().get(0);
    TypeDescription valueType = schema.getChildren().get(1);
    vector.offsets[row] = vector.childCount;
    vector.lengths[row] = value.size();
    vector.childCount += vector.lengths[row];
    // Preserve already-written child data unless this is the first map.
    vector.keys.ensureSize(vector.childCount, vector.offsets[row] != 0);
    vector.values.ensureSize(vector.childCount, vector.offsets[row] != 0);
    int e = 0;
    for (Map.Entry<?, ?> entry : value.entrySet()) {
      setColumn(keyType, vector.keys, (int) vector.offsets[row] + e,
          (Writable) entry.getKey());
      setColumn(valueType, vector.values, (int) vector.offsets[row] + e,
          (Writable) entry.getValue());
      e += 1;
    }
  }

  /**
   * Stores one value into one column slot, dispatching on the schema
   * category. A null value marks the slot null (and clears the column's
   * noNulls flag); otherwise the value is cast to the Writable type matching
   * the category and written via the helpers above.
   *
   * @throws IllegalArgumentException for a schema category with no case here
   * @throws ClassCastException if the value's runtime type does not match
   *         the schema category (implicit from the casts)
   */
  public static void setColumn(TypeDescription schema, ColumnVector vector,
                               int row, Writable value) {
    if (value == null) {
      vector.noNulls = false;
      vector.isNull[row] = true;
    } else {
      switch (schema.getCategory()) {
        case BOOLEAN:
          setLongValue(vector, row, ((BooleanWritable) value).get() ? 1 : 0);
          break;
        case BYTE:
          setLongValue(vector, row, ((ByteWritable) value).get());
          break;
        case SHORT:
          setLongValue(vector, row, ((ShortWritable) value).get());
          break;
        case INT:
          setLongValue(vector, row, ((IntWritable) value).get());
          break;
        case LONG:
          setLongValue(vector, row, ((LongWritable) value).get());
          break;
        case FLOAT:
          setDoubleValue(vector, row, ((FloatWritable) value).get());
          break;
        case DOUBLE:
          setDoubleValue(vector, row, ((DoubleWritable) value).get());
          break;
        case STRING:
          setBinaryValue(vector, row, (Text) value);
          break;
        case CHAR:
          // CHAR is space-padded/trimmed to the declared max length.
          setCharValue((BytesColumnVector) vector, row, (Text) value,
              schema.getMaxLength());
          break;
        case VARCHAR:
          // VARCHAR is truncated to the declared max length, never padded.
          setBinaryValue(vector, row, (Text) value, schema.getMaxLength());
          break;
        case BINARY:
          setBinaryValue(vector, row, (BytesWritable) value);
          break;
        case DATE:
          // Dates are stored as days since epoch.
          setLongValue(vector, row, ((DateWritable) value).getDays());
          break;
        case TIMESTAMP:
          ((TimestampColumnVector) vector).set(row, (OrcTimestamp) value);
          break;
        case DECIMAL:
          ((DecimalColumnVector) vector).set(row, (HiveDecimalWritable) value);
          break;
        case STRUCT:
          setStructValue(schema, (StructColumnVector) vector, row,
              (OrcStruct) value);
          break;
        case UNION:
          setUnionValue(schema, (UnionColumnVector) vector, row,
              (OrcUnion) value);
          break;
        case LIST:
          setListValue(schema, (ListColumnVector) vector, row,
              (OrcList) value);
          break;
        case MAP:
          setMapValue(schema, (MapColumnVector) vector, row, (OrcMap) value);
          break;
        default:
          throw new IllegalArgumentException("Unknown type " + schema);
      }
    }
  }

  /**
   * Appends one record to the current batch, flushing the batch to the ORC
   * writer first when it is full. OrcKey/OrcValue shuffle wrappers are
   * unwrapped before writing.
   *
   * @param nullWritable ignored (keys are NullWritable)
   * @param v the record to write
   * @throws IOException if the underlying writer fails while flushing
   */
  @Override
  public void write(NullWritable nullWritable, V v) throws IOException {
    // if the batch is full, write it out.
    if (batch.size == batch.getMaxSize()) {
      writer.addRowBatch(batch);
      batch.reset();
    }

    // add the new row
    int row = batch.size++;
    // skip over the OrcKey or OrcValue
    if (v instanceof OrcKey) {
      v = (V) ((OrcKey) v).key;
    } else if (v instanceof OrcValue) {
      v = (V) ((OrcValue) v).value;
    }
    if (isTopStruct) {
      // Top-level struct: spread the record's fields over the batch columns.
      for (int f = 0; f < schema.getChildren().size(); ++f) {
        setColumn(schema.getChildren().get(f), batch.cols[f], row,
            ((OrcStruct) v).getFieldValue(f));
      }
    } else {
      // Non-struct top type: the whole record occupies the single column.
      setColumn(schema, batch.cols[0], row, v);
    }
  }

  /**
   * Flushes any buffered rows and closes the underlying ORC writer.
   *
   * @param reporter ignored
   * @throws IOException if the final flush or close fails
   */
  @Override
  public void close(Reporter reporter) throws IOException {
    if (batch.size != 0) {
      writer.addRowBatch(batch);
      batch.reset();
    }
    writer.close();
  }
}