Java tutorial: HiveRecordSet from the Presto Hive connector
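The listing below is the complete HiveRecordSet class (package com.facebook.presto.hive). It adapts a single Hive split to Presto's RecordSet SPI: the constructor works out which Hive columns must actually be read from the file (falling back to the first primitive column for count(*)-style queries), and cursor() opens a Hadoop RecordReader for the split and asks the configured HiveRecordCursorProvider instances to wrap it in a HiveRecordCursor. A hedged usage sketch follows the listing.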
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.hive;

import com.facebook.presto.hadoop.HadoopFileSystemCache;
import com.facebook.presto.hadoop.HadoopNative;
import com.facebook.presto.spi.PrestoException;
import com.facebook.presto.spi.RecordSet;
import com.facebook.presto.spi.type.Type;
import com.google.common.base.Optional;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.joda.time.DateTimeZone;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.Callable;

import static com.facebook.presto.hive.HiveColumnHandle.hiveColumnIndexGetter;
import static com.facebook.presto.hive.HiveColumnHandle.isPartitionKeyPredicate;
import static com.facebook.presto.hive.HiveColumnHandle.nativeTypeGetter;
import static com.facebook.presto.hive.HiveUtil.getInputFormat;
import static com.facebook.presto.hive.HiveUtil.getInputFormatName;
import static com.facebook.presto.hive.HiveUtil.getTableObjectInspector;
import static com.facebook.presto.hive.RetryDriver.retry;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.base.Predicates.not;
import static com.google.common.collect.Iterables.filter;
import static com.google.common.collect.Lists.transform;
import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_NULL_FORMAT;

public class HiveRecordSet
        implements RecordSet
{
    static {
        // load the native Hadoop libraries and install the Hadoop file system
        // cache once, before any split is opened
        HadoopNative.requireHadoopNative();
        HadoopFileSystemCache.initialize();
    }

    private final HiveSplit split;
    private final List<HiveColumnHandle> columns;
    private final List<Type> columnTypes;
    private final List<Integer> readHiveColumnIndexes;
    private final Path path;
    private final Configuration configuration;
    private final List<HiveRecordCursorProvider> cursorProviders;
    private final DateTimeZone timeZone;

    public HiveRecordSet(
            HdfsEnvironment hdfsEnvironment,
            HiveSplit split,
            List<HiveColumnHandle> columns,
            List<HiveRecordCursorProvider> cursorProviders,
            DateTimeZone timeZone)
    {
        this.split = checkNotNull(split, "split is null");
        this.columns = ImmutableList.copyOf(checkNotNull(columns, "columns is null"));
        this.columnTypes = ImmutableList.copyOf(Iterables.transform(columns, nativeTypeGetter()));
        this.cursorProviders = ImmutableList.copyOf(checkNotNull(cursorProviders, "cursor providers is null"));
        this.timeZone = checkNotNull(timeZone, "timeZone is null");

        // determine which hive columns we will read
        List<HiveColumnHandle> readColumns = ImmutableList.copyOf(filter(columns, not(isPartitionKeyPredicate())));
        if (readColumns.isEmpty()) {
            // for count(*) queries we will have "no" columns we want to read, but since hive doesn't
            // support reading no columns (it will read all columns instead), we must choose a single column
            HiveColumnHandle primitiveColumn = getFirstPrimitiveColumn(split.getClientId(), split.getSchema());
            readColumns = ImmutableList.of(primitiveColumn);
        }
        readHiveColumnIndexes = new ArrayList<>(transform(readColumns, hiveColumnIndexGetter()));

        this.path = new Path(split.getPath());
        this.configuration = hdfsEnvironment.getConfiguration(path);

        String nullSequence = split.getSchema().getProperty(SERIALIZATION_NULL_FORMAT);
        checkState(nullSequence == null || nullSequence.equals("\\N"),
                "Only '\\N' supported as null specifier, was '%s'", nullSequence);
    }

    @Override
    public List<Type> getColumnTypes()
    {
        return columnTypes;
    }

    @Override
    public HiveRecordCursor cursor()
    {
        // tell Hive which columns we would like to read; this lets Hive
        // optimize reading column-oriented files
        ColumnProjectionUtils.setReadColumnIDs(configuration, readHiveColumnIndexes);

        RecordReader<?, ?> recordReader = createRecordReader(split, configuration, path);

        // the first provider that recognizes the split's format supplies the cursor
        for (HiveRecordCursorProvider provider : cursorProviders) {
            Optional<HiveRecordCursor> cursor = provider.createHiveRecordCursor(split, recordReader, columns, timeZone);
            if (cursor.isPresent()) {
                return cursor.get();
            }
        }
        throw new RuntimeException("Configured cursor providers did not provide a cursor");
    }

    private static HiveColumnHandle getFirstPrimitiveColumn(String clientId, Properties schema)
    {
        int index = 0;
        for (StructField field : getTableObjectInspector(schema).getAllStructFieldRefs()) {
            if (field.getFieldObjectInspector().getCategory() == ObjectInspector.Category.PRIMITIVE) {
                PrimitiveObjectInspector inspector = (PrimitiveObjectInspector) field.getFieldObjectInspector();
                HiveType hiveType = HiveType.getSupportedHiveType(inspector.getPrimitiveCategory());
                return new HiveColumnHandle(clientId, field.getFieldName(), index, hiveType, index, false);
            }
            index++;
        }

        throw new IllegalStateException("Table doesn't have any PRIMITIVE columns");
    }

    private static RecordReader<?, ?> createRecordReader(HiveSplit split, Configuration configuration, Path wrappedPath)
    {
        final InputFormat<?, ?> inputFormat = getInputFormat(configuration, split.getSchema(), true);
        final JobConf jobConf = new JobConf(configuration);
        final FileSplit fileSplit = createFileSplit(wrappedPath, split.getStart(), split.getLength());

        // propagate serialization configuration to getRecordReader
        for (String name : split.getSchema().stringPropertyNames()) {
            if (name.startsWith("serialization.")) {
                jobConf.set(name, split.getSchema().getProperty(name));
            }
        }

        try {
            return retry().stopOnIllegalExceptions().run("createRecordReader", new Callable<RecordReader<?, ?>>()
            {
                @Override
                public RecordReader<?, ?> call()
                        throws IOException
                {
                    return inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
                }
            });
        }
        catch (Exception e) {
            throw new PrestoException(HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT.toErrorCode(),
                    String.format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s",
                            split.getPath(),
                            split.getStart(),
                            split.getLength(),
                            getInputFormatName(split.getSchema()),
                            e.getMessage()),
                    e);
        }
    }

    private static FileSplit createFileSplit(final Path path, long start, long length)
    {
        return new FileSplit(path,
                start, length, (String[]) null)
        {
            @Override
            public Path getPath()
            {
                // make sure our original path object is returned
                return path;
            }
        };
    }
}
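To see how the pieces fit together, here is a minimal sketch of consuming the record set. The helper name readFirstColumn is hypothetical; its parameters are assumed to be supplied by the surrounding connector machinery (split manager, metadata, configured cursor providers), and the cursor calls used (advanceNextPosition, isNull, getLong, close) are the standard Presto SPI RecordCursor operations. This is an illustration, not part of the class above.

// Hypothetical helper: all four parameters are assumed to come from the
// connector's split and metadata layers; column 0 is assumed to be a BIGINT.
static void readFirstColumn(
        HdfsEnvironment hdfsEnvironment,
        HiveSplit split,
        List<HiveColumnHandle> columns,
        List<HiveRecordCursorProvider> cursorProviders)
{
    HiveRecordSet recordSet = new HiveRecordSet(hdfsEnvironment, split, columns, cursorProviders, DateTimeZone.UTC);
    HiveRecordCursor cursor = recordSet.cursor();
    try {
        // iterate every row in the split, reading the first projected column
        while (cursor.advanceNextPosition()) {
            if (!cursor.isNull(0)) {
                long value = cursor.getLong(0); // process the value
            }
        }
    }
    finally {
        cursor.close();
    }
}

One design detail worth noting: createFileSplit returns an anonymous FileSplit subclass that overrides getPath() so the exact Path object resolved in the constructor (the one used to look up the Hadoop Configuration) is handed back to the input format, as the inline comment explains, rather than a copy reconstructed by the superclass.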