// Java tutorial (non-source annotation; kept as a comment so the file compiles)
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.blm.orc;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.ql.io.orc.OrcProto;

import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.*;

/**
 * Contains factory methods to read or write ORC files.
 */
public final class OrcFile {

  /** Magic bytes identifying an ORC file. */
  public static final String MAGIC = "ORC";

  /**
   * Create a version number for the ORC file format, so that we can add
   * non-forward compatible changes in the future. To make it easier for users
   * to understand the version numbers, we use the Hive release number that
   * first wrote that version of ORC files.
   *
   * Thus, if you add new encodings or other non-forward compatible changes
   * to ORC files, which prevent the old reader from reading the new format,
   * you should change these variable to reflect the next Hive release number.
   * Non-forward compatible changes should never be added in patch releases.
   *
   * Do not make any changes that break backwards compatibility, which would
   * prevent the new reader from reading ORC files generated by any released
   * version of Hive.
   */
  public static enum Version {
    V_0_11("0.11", 0, 11),
    V_0_12("0.12", 0, 12);

    /** The version that new files are written with by default. */
    public static final Version CURRENT = V_0_12;

    private final String name;
    private final int major;
    private final int minor;

    private Version(String name, int major, int minor) {
      this.name = name;
      this.major = major;
      this.minor = minor;
    }

    /**
     * Look up a version by its human readable name.
     * @param name the version name, e.g. "0.12"
     * @return the matching version
     * @throws IllegalArgumentException if the name matches no known version
     */
    public static Version byName(String name) {
      for (Version version : values()) {
        if (version.name.equals(name)) {
          return version;
        }
      }
      throw new IllegalArgumentException("Unknown ORC version " + name);
    }

    /**
     * Get the human readable name for the version.
     */
    public String getName() {
      return name;
    }

    /**
     * Get the major version number.
     */
    public int getMajor() {
      return major;
    }

    /**
     * Get the minor version number.
     */
    public int getMinor() {
      return minor;
    }
  }

  /**
   * Records the version of the writer in terms of which bugs have been fixed.
   * For bugs in the writer, but the old readers already read the new data
   * correctly, bump this version instead of the Version.
   */
  public static enum WriterVersion {
    ORIGINAL(0),
    HIVE_8732(1); // corrupted stripe/file maximum column statistics

    private final int id;

    public int getId() {
      return id;
    }

    private WriterVersion(int id) {
      this.id = id;
    }
  }

  /** Strategy the writer uses when choosing encodings: favor speed or size. */
  public static enum EncodingStrategy {
    SPEED, COMPRESSION;
  }

  /** Strategy the writer uses when compressing streams: favor speed or size. */
  public static enum CompressionStrategy {
    SPEED, COMPRESSION;
  }

  // Note : these string definitions for table properties are deprecated,
  // and retained only for backward compatibility, please do not add to
  // them, add to OrcTableProperties below instead
  @Deprecated
  public static final String COMPRESSION = "orc.compress";
  @Deprecated
  public static final String COMPRESSION_BLOCK_SIZE = "orc.compress.size";
  @Deprecated
  public static final String STRIPE_SIZE = "orc.stripe.size";
  @Deprecated
  public static final String ROW_INDEX_STRIDE = "orc.row.index.stride";
  @Deprecated
  public static final String ENABLE_INDEXES = "orc.create.index";
  @Deprecated
  public static final String BLOCK_PADDING = "orc.block.padding";

  /**
   * Enum container for all orc table properties.
   * If introducing a new orc-specific table property,
   * add it here.
   */
  public static enum OrcTableProperties {
    COMPRESSION("orc.compress"),
    COMPRESSION_BLOCK_SIZE("orc.compress.size"),
    STRIPE_SIZE("orc.stripe.size"),
    BLOCK_SIZE("orc.block.size"),
    ROW_INDEX_STRIDE("orc.row.index.stride"),
    ENABLE_INDEXES("orc.create.index"),
    BLOCK_PADDING("orc.block.padding"),
    ENCODING_STRATEGY("orc.encoding.strategy");

    private final String propName;

    OrcTableProperties(String propName) {
      this.propName = propName;
    }

    /** Get the table-property key string for this property. */
    public String getPropName() {
      return this.propName;
    }
  }

  // unused - no instances of this factory class
  private OrcFile() {
  }

  /**
   * Create an ORC file reader.
   * @param fs file system
   * @param path file name to read from
   * @return a new ORC file reader.
   * @throws IOException
   */
  public static Reader createReader(FileSystem fs, Path path)
      throws IOException {
    ReaderOptions opts = new ReaderOptions(new Configuration());
    opts.filesystem(fs);
    return new ReaderImpl(path, opts);
  }

  /**
   * Options for creating ORC file readers. Built fluently; all setters
   * return {@code this}.
   */
  public static class ReaderOptions {
    private final Configuration conf;
    private FileSystem filesystem;
    private ReaderImpl.FileMetaInfo fileMetaInfo;
    // read the whole file unless the caller limits the length
    private long maxLength = Long.MAX_VALUE;

    ReaderOptions(Configuration conf) {
      this.conf = conf;
    }

    ReaderOptions fileMetaInfo(ReaderImpl.FileMetaInfo info) {
      fileMetaInfo = info;
      return this;
    }

    public ReaderOptions filesystem(FileSystem fs) {
      this.filesystem = fs;
      return this;
    }

    public ReaderOptions maxLength(long val) {
      maxLength = val;
      return this;
    }

    Configuration getConfiguration() {
      return conf;
    }

    FileSystem getFilesystem() {
      return filesystem;
    }

    ReaderImpl.FileMetaInfo getFileMetaInfo() {
      return fileMetaInfo;
    }

    long getMaxLength() {
      return maxLength;
    }
  }

  /** Create a default set of reader options that can be modified. */
  public static ReaderOptions readerOptions(Configuration conf) {
    return new ReaderOptions(conf);
  }

  /**
   * Create an ORC file reader with the given options.
   * @param path file name to read from
   * @param options the reader options
   * @return a new ORC file reader
   * @throws IOException
   */
  public static Reader createReader(Path path,
                                    ReaderOptions options) throws IOException {
    return new ReaderImpl(path, options);
  }

  /** Context handed to {@link WriterCallback} methods. */
  public static interface WriterContext {
    Writer getWriter();
  }

  /** Listener invoked just before stripes and the file footer are written. */
  public static interface WriterCallback {
    public void preStripeWrite(WriterContext context) throws IOException;
    public void preFooterWrite(WriterContext context) throws IOException;
  }

  /**
   * Options for creating ORC file writers.
   */
  public static class WriterOptions {
    private final Configuration configuration;
    private FileSystem fileSystemValue = null;
    private ObjectInspector inspectorValue = null;
    private long stripeSizeValue;
    private long blockSizeValue;
    private int rowIndexStrideValue;
    private int bufferSizeValue;
    private boolean blockPaddingValue;
    private CompressionKind compressValue;
    private MemoryManager memoryManagerValue;
    private Version versionValue;
    private WriterCallback callback;
    private EncodingStrategy encodingStrategy;
    private CompressionStrategy compressionStrategy;
    private float paddingTolerance;

    WriterOptions(Configuration conf) {
      configuration = conf;
      memoryManagerValue = getMemoryManager(conf);
      // seed every option from the Hive configuration defaults
      stripeSizeValue = HiveConf.getLongVar(conf, HIVE_ORC_DEFAULT_STRIPE_SIZE);
      blockSizeValue = HiveConf.getLongVar(conf, HIVE_ORC_DEFAULT_BLOCK_SIZE);
      rowIndexStrideValue =
          HiveConf.getIntVar(conf, HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE);
      bufferSizeValue = HiveConf.getIntVar(conf, HIVE_ORC_DEFAULT_BUFFER_SIZE);
      blockPaddingValue =
          HiveConf.getBoolVar(conf, HIVE_ORC_DEFAULT_BLOCK_PADDING);
      compressValue =
          CompressionKind.valueOf(HiveConf.getVar(conf,
              HIVE_ORC_DEFAULT_COMPRESS));
      String versionName = HiveConf.getVar(conf, HIVE_ORC_WRITE_FORMAT);
      if (versionName == null) {
        versionValue = Version.CURRENT;
      } else {
        versionValue = Version.byName(versionName);
      }

      String enString =
          conf.get(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname);
      if (enString == null) {
        encodingStrategy = EncodingStrategy.SPEED;
      } else {
        encodingStrategy = EncodingStrategy.valueOf(enString);
      }

      String compString =
          conf.get(HiveConf.ConfVars.HIVE_ORC_COMPRESSION_STRATEGY.varname);
      if (compString == null) {
        compressionStrategy = CompressionStrategy.SPEED;
      } else {
        compressionStrategy = CompressionStrategy.valueOf(compString);
      }

      paddingTolerance =
          conf.getFloat(HiveConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.varname,
              HiveConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.defaultFloatVal);
    }

    /**
     * Provide the filesystem for the path, if the client has it available.
     * If it is not provided, it will be found from the path.
     */
    public WriterOptions fileSystem(FileSystem value) {
      fileSystemValue = value;
      return this;
    }

    /**
     * Set the stripe size for the file. The writer stores the contents of the
     * stripe in memory until this memory limit is reached and the stripe
     * is flushed to the HDFS file and the next stripe started.
     */
    public WriterOptions stripeSize(long value) {
      stripeSizeValue = value;
      return this;
    }

    /**
     * Set the file system block size for the file. For optimal performance,
     * set the block size to be multiple factors of stripe size.
     */
    public WriterOptions blockSize(long value) {
      blockSizeValue = value;
      return this;
    }

    /**
     * Set the distance between entries in the row index. The minimum value is
     * 1000 to prevent the index from overwhelming the data. If the stride is
     * set to 0, no indexes will be included in the file.
     */
    public WriterOptions rowIndexStride(int value) {
      rowIndexStrideValue = value;
      return this;
    }

    /**
     * The size of the memory buffers used for compressing and storing the
     * stripe in memory.
     */
    public WriterOptions bufferSize(int value) {
      bufferSizeValue = value;
      return this;
    }

    /**
     * Sets whether the HDFS blocks are padded to prevent stripes from
     * straddling blocks. Padding improves locality and thus the speed of
     * reading, but costs space.
     */
    public WriterOptions blockPadding(boolean value) {
      blockPaddingValue = value;
      return this;
    }

    /**
     * Sets the encoding strategy that is used to encode the data.
     */
    public WriterOptions encodingStrategy(EncodingStrategy strategy) {
      encodingStrategy = strategy;
      return this;
    }

    /**
     * Sets the compression strategy that is used to compress the data.
     * (Mirrors {@link #encodingStrategy}; previously the strategy could only
     * be set through the configuration.)
     */
    public WriterOptions compressionStrategy(CompressionStrategy strategy) {
      compressionStrategy = strategy;
      return this;
    }

    /**
     * Sets the tolerance for block padding as a percentage of stripe size.
     */
    public WriterOptions paddingTolerance(float value) {
      paddingTolerance = value;
      return this;
    }

    /**
     * Sets the generic compression that is used to compress the data.
     */
    public WriterOptions compress(CompressionKind value) {
      compressValue = value;
      return this;
    }

    /**
     * A required option that sets the object inspector for the rows. Used
     * to determine the schema for the file.
     */
    public WriterOptions inspector(ObjectInspector value) {
      inspectorValue = value;
      return this;
    }

    /**
     * Sets the version of the file that will be written.
     */
    public WriterOptions version(Version value) {
      versionValue = value;
      return this;
    }

    /**
     * Add a listener for when the stripe and file are about to be closed.
     * @param callback the object to be called when the stripe is closed
     * @return this options object, for chaining
     */
    public WriterOptions callback(WriterCallback callback) {
      this.callback = callback;
      return this;
    }

    /**
     * A package local option to set the memory manager.
     */
    WriterOptions memory(MemoryManager value) {
      memoryManagerValue = value;
      return this;
    }
  }

  /**
   * Create a default set of write options that can be modified.
   */
  public static WriterOptions writerOptions(Configuration conf) {
    return new WriterOptions(conf);
  }

  /**
   * Create an ORC file writer. This is the public interface for creating
   * writers going forward and new options will only be added to this method.
   * @param path filename to write to
   * @param opts the options
   * @return a new ORC file writer
   * @throws IOException
   */
  public static Writer createWriter(Path path,
                                    WriterOptions opts) throws IOException {
    FileSystem fs = opts.fileSystemValue == null
        ? path.getFileSystem(opts.configuration)
        : opts.fileSystemValue;
    return new WriterImpl(fs, path, opts.configuration, opts.inspectorValue,
        opts.stripeSizeValue, opts.compressValue, opts.bufferSizeValue,
        opts.rowIndexStrideValue, opts.memoryManagerValue,
        opts.blockPaddingValue, opts.versionValue, opts.callback,
        opts.encodingStrategy, opts.compressionStrategy,
        opts.paddingTolerance, opts.blockSizeValue);
  }

  /**
   * Create an ORC file writer. This method is provided for API backward
   * compatability with Hive 0.11.
   * @param fs file system
   * @param path filename to write to
   * @param inspector the ObjectInspector that inspects the rows
   * @param stripeSize the number of bytes in a stripe
   * @param compress how to compress the file
   * @param bufferSize the number of bytes to compress at once
   * @param rowIndexStride the number of rows between row index entries or
   *                       0 to suppress all indexes
   * @return a new ORC file writer
   * @throws IOException
   */
  public static Writer createWriter(FileSystem fs,
                                    Path path,
                                    Configuration conf,
                                    ObjectInspector inspector,
                                    long stripeSize,
                                    CompressionKind compress,
                                    int bufferSize,
                                    int rowIndexStride) throws IOException {
    return createWriter(path,
        writerOptions(conf)
            .fileSystem(fs)
            .inspector(inspector)
            .stripeSize(stripeSize)
            .compress(compress)
            .bufferSize(bufferSize)
            .rowIndexStride(rowIndexStride));
  }

  // lazily-created, process-wide memory manager shared by all writers;
  // note it is keyed off the first Configuration it sees
  private static MemoryManager memoryManager = null;

  private static synchronized MemoryManager getMemoryManager(
      Configuration conf) {
    if (memoryManager == null) {
      memoryManager = new MemoryManager(conf);
    }
    return memoryManager;
  }
}