List of usage examples for org.apache.hadoop.mapred.JobConf JobConf
public JobConf(Configuration conf)
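Every example on this page wraps an existing Hadoop Configuration in a JobConf so that job-level settings can be layered on top of the cluster settings. A minimal sketch of that pattern is shown below; the property values are hypothetical and only illustrate the idea.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;

public class JobConfExample {
    public static void main(String[] args) {
        // Start from a plain Hadoop Configuration (hypothetical filesystem setting)
        Configuration configuration = new Configuration();
        configuration.set("fs.defaultFS", "hdfs://namenode.example.com:8020");

        // JobConf(Configuration) copies the existing settings into a job configuration
        JobConf jobConf = new JobConf(configuration);

        // Job-specific properties can then be added without touching the original Configuration
        jobConf.set("mapreduce.job.name", "example-job");
        System.out.println(jobConf.get("fs.defaultFS"));
    }
}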
From source file:com.facebook.presto.hive.DwrfRecordCursorProvider.java
License:Apache License
@Override
public Optional<HiveRecordCursor> createHiveRecordCursor(String clientId, Configuration configuration,
        ConnectorSession session, Path path, long start, long length, Properties schema,
        List<HiveColumnHandle> columns, List<HivePartitionKey> partitionKeys,
        TupleDomain<HiveColumnHandle> tupleDomain, DateTimeZone hiveStorageTimeZone, TypeManager typeManager)
{
    @SuppressWarnings("deprecation")
    Deserializer deserializer = getDeserializer(schema);
    if (!(deserializer instanceof OrcSerde)) {
        return Optional.absent();
    }

    StructObjectInspector rowInspector = getTableObjectInspector(schema);
    if (!all(rowInspector.getAllStructFieldRefs(), isSupportedDwrfType())) {
        throw new IllegalArgumentException("DWRF does not support DATE type");
    }

    ReaderWriterProfiler.setProfilerOptions(configuration);

    RecordReader recordReader;
    try {
        FileSystem fileSystem = path.getFileSystem(configuration);
        Reader reader = OrcFile.createReader(fileSystem, path, new JobConf(configuration));
        boolean[] include = findIncludedColumns(reader.getTypes(), columns);
        recordReader = reader.rows(start, length, include);
    }
    catch (Exception e) {
        throw Throwables.propagate(e);
    }

    return Optional.<HiveRecordCursor>of(new DwrfHiveRecordCursor(recordReader, length, schema, partitionKeys,
            columns, hiveStorageTimeZone, DateTimeZone.forID(session.getTimeZoneKey().getId()), typeManager));
}
From source file:com.facebook.presto.hive.HiveClient.java
License:Apache License
@Override
public RecordSink getRecordSink(ConnectorOutputTableHandle tableHandle)
{
    HiveOutputTableHandle handle = checkType(tableHandle, HiveOutputTableHandle.class, "tableHandle");

    Path target = new Path(handle.getTemporaryPath(), randomUUID().toString());
    JobConf conf = new JobConf(hdfsEnvironment.getConfiguration(target));

    return new HiveRecordSink(handle, target, conf);
}
From source file:com.facebook.presto.hive.HiveMetadata.java
License:Apache License
private List<String> computeFileNamesForMissingBuckets(HiveStorageFormat storageFormat, Path targetPath,
        String filePrefix, int bucketCount, PartitionUpdate partitionUpdate)
{
    if (partitionUpdate.getFileNames().size() == bucketCount) {
        // fast path for common case
        return ImmutableList.of();
    }

    JobConf conf = new JobConf(hdfsEnvironment.getConfiguration(targetPath));
    String fileExtension = HiveWriterFactory.getFileExtension(conf, fromHiveStorageFormat(storageFormat));
    Set<String> fileNames = partitionUpdate.getFileNames().stream().collect(Collectors.toSet());

    ImmutableList.Builder<String> missingFileNamesBuilder = ImmutableList.builder();
    for (int i = 0; i < bucketCount; i++) {
        String fileName = HiveWriterFactory.computeBucketedFileName(filePrefix, i) + fileExtension;
        if (!fileNames.contains(fileName)) {
            missingFileNamesBuilder.add(fileName);
        }
    }
    List<String> missingFileNames = missingFileNamesBuilder.build();
    verify(fileNames.size() + missingFileNames.size() == bucketCount);
    return missingFileNames;
}
From source file:com.facebook.presto.hive.HiveMetadata.java
License:Apache License
private void createEmptyFile(Path path, Table table, Optional<Partition> partition, List<String> fileNames)
{
    JobConf conf = new JobConf(hdfsEnvironment.getConfiguration(path));

    Properties schema;
    StorageFormat format;
    if (partition.isPresent()) {
        schema = getHiveSchema(partition.get(), table);
        format = partition.get().getStorage().getStorageFormat();
    }
    else {
        schema = getHiveSchema(table);
        format = table.getStorage().getStorageFormat();
    }

    for (String fileName : fileNames) {
        writeEmptyFile(new Path(path, fileName), conf, schema, format.getSerDe(), format.getOutputFormat());
    }
}
From source file:com.facebook.presto.hive.HivePageSink.java
License:Apache License
public HivePageSink(String schemaName, String tableName, boolean isCreateTable,
        List<HiveColumnHandle> inputColumns, HiveStorageFormat tableStorageFormat,
        LocationHandle locationHandle, LocationService locationService, String filePrefix,
        HiveMetastore metastore, PageIndexerFactory pageIndexerFactory, TypeManager typeManager,
        HdfsEnvironment hdfsEnvironment, boolean respectTableFormat, int maxOpenPartitions,
        boolean immutablePartitions, JsonCodec<PartitionUpdate> partitionUpdateCodec)
{
    this.schemaName = requireNonNull(schemaName, "schemaName is null");
    this.tableName = requireNonNull(tableName, "tableName is null");

    requireNonNull(inputColumns, "inputColumns is null");

    this.tableStorageFormat = requireNonNull(tableStorageFormat, "tableStorageFormat is null");
    this.locationHandle = requireNonNull(locationHandle, "locationHandle is null");
    this.locationService = requireNonNull(locationService, "locationService is null");
    this.filePrefix = requireNonNull(filePrefix, "filePrefix is null");

    this.metastore = requireNonNull(metastore, "metastore is null");

    requireNonNull(pageIndexerFactory, "pageIndexerFactory is null");

    this.typeManager = requireNonNull(typeManager, "typeManager is null");
    this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
    this.respectTableFormat = respectTableFormat;
    this.maxOpenPartitions = maxOpenPartitions;
    this.immutablePartitions = immutablePartitions;
    this.partitionUpdateCodec = requireNonNull(partitionUpdateCodec, "partitionUpdateCodec is null");

    // divide input columns into partition and data columns
    ImmutableList.Builder<String> partitionColumnNames = ImmutableList.builder();
    ImmutableList.Builder<Type> partitionColumnTypes = ImmutableList.builder();
    ImmutableList.Builder<String> dataColumnNames = ImmutableList.builder();
    ImmutableList.Builder<Type> dataColumnTypes = ImmutableList.builder();
    for (HiveColumnHandle column : inputColumns) {
        if (column.isPartitionKey()) {
            partitionColumnNames.add(column.getName());
            partitionColumnTypes.add(typeManager.getType(column.getTypeSignature()));
        }
        else {
            dataColumnNames.add(column.getName());
            dataColumnTypes.add(typeManager.getType(column.getTypeSignature()));
        }
    }
    this.partitionColumnNames = partitionColumnNames.build();
    this.partitionColumnTypes = partitionColumnTypes.build();
    this.dataColumnNames = dataColumnNames.build();
    this.dataColumnTypes = dataColumnTypes.build();

    // determine the input index of the partition columns and data columns
    ImmutableList.Builder<Integer> partitionColumns = ImmutableList.builder();
    ImmutableList.Builder<Integer> dataColumns = ImmutableList.builder();
    // sample weight column is passed separately, so index must be calculated without this column
    List<HiveColumnHandle> inputColumnsWithoutSample = inputColumns.stream()
            .filter(column -> !column.getName().equals(SAMPLE_WEIGHT_COLUMN_NAME))
            .collect(toList());
    for (int inputIndex = 0; inputIndex < inputColumnsWithoutSample.size(); inputIndex++) {
        HiveColumnHandle column = inputColumnsWithoutSample.get(inputIndex);
        if (column.isPartitionKey()) {
            partitionColumns.add(inputIndex);
        }
        else {
            dataColumns.add(inputIndex);
        }
    }
    this.partitionColumns = Ints.toArray(partitionColumns.build());
    this.dataColumns = Ints.toArray(dataColumns.build());

    this.pageIndexer = pageIndexerFactory.createPageIndexer(this.partitionColumnTypes);

    // preallocate temp space for partition and data
    this.partitionRow = Arrays.asList(new Object[this.partitionColumnNames.size()]);
    this.dataRow = Arrays.asList(new Object[this.dataColumnNames.size()]);

    if (isCreateTable) {
        this.table = null;
        Optional<Path> writePath = locationService.writePathRoot(locationHandle);
        checkArgument(writePath.isPresent(), "CREATE TABLE must have a write path");
        conf = new JobConf(hdfsEnvironment.getConfiguration(writePath.get()));
    }
    else {
        Optional<Table> table = metastore.getTable(schemaName, tableName);
        if (!table.isPresent()) {
            throw new PrestoException(HIVE_INVALID_METADATA,
                    format("Table %s.%s was dropped during insert", schemaName, tableName));
        }
        this.table = table.get();
        Path hdfsEnvironmentPath = locationService.writePathRoot(locationHandle)
                .orElseGet(() -> locationService.targetPathRoot(locationHandle));
        conf = new JobConf(hdfsEnvironment.getConfiguration(hdfsEnvironmentPath));
    }
}
From source file:com.facebook.presto.hive.HiveRecordSet.java
License:Apache License
private static RecordReader<?, ?> createRecordReader(HiveSplit split, Configuration configuration, Path wrappedPath)
{
    final InputFormat<?, ?> inputFormat = getInputFormat(configuration, split.getSchema(), true);
    final JobConf jobConf = new JobConf(configuration);
    final FileSplit fileSplit = createFileSplit(wrappedPath, split.getStart(), split.getLength());

    // propagate serialization configuration to getRecordReader
    for (String name : split.getSchema().stringPropertyNames()) {
        if (name.startsWith("serialization.")) {
            jobConf.set(name, split.getSchema().getProperty(name));
        }
    }

    try {
        return retry().stopOnIllegalExceptions().run("createRecordReader", new Callable<RecordReader<?, ?>>() {
            @Override
            public RecordReader<?, ?> call()
                    throws IOException
            {
                return inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
            }
        });
    }
    catch (Exception e) {
        throw new PrestoException(HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT.toErrorCode(),
                String.format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s",
                        split.getPath(), split.getStart(), split.getLength(),
                        getInputFormatName(split.getSchema()), e.getMessage()), e);
    }
}
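The example above copies every "serialization." property from the table schema into the JobConf before asking the InputFormat for a RecordReader. A stripped-down sketch of just that step is shown here; the helper name and the sample property value are illustrative, not part of the original source.

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;

public class SerializationPropsExample {
    // Copy serialization-related table properties into a fresh job configuration
    static JobConf withSerializationProperties(Configuration configuration, Properties schema) {
        JobConf jobConf = new JobConf(configuration);
        for (String name : schema.stringPropertyNames()) {
            if (name.startsWith("serialization.")) {
                jobConf.set(name, schema.getProperty(name));
            }
        }
        return jobConf;
    }

    public static void main(String[] args) {
        Properties schema = new Properties();
        schema.setProperty("serialization.lib", "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe");
        JobConf jobConf = withSerializationProperties(new Configuration(), schema);
        System.out.println(jobConf.get("serialization.lib"));
    }
}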
From source file:com.facebook.presto.hive.HiveSplitIterable.java
License:Apache License
private void loadPartitionSplits(final HiveSplitQueue hiveSplitQueue, SuspendingExecutor suspendingExecutor)
        throws InterruptedException
{
    final Semaphore semaphore = new Semaphore(maxPartitionBatchSize);
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) {
        ImmutableList.Builder<ListenableFuture<Void>> futureBuilder = ImmutableList.builder();

        Iterator<String> nameIterator = partitionNames.iterator();
        for (Partition partition : partitions) {
            checkState(nameIterator.hasNext(), "different number of partitions and partition names!");
            semaphore.acquire();
            final String partitionName = nameIterator.next();
            final Properties schema = getPartitionSchema(table, partition);
            final List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition);

            Path path = new Path(getPartitionLocation(table, partition));
            final Configuration configuration = hdfsEnvironment.getConfiguration(path);
            final InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
            Path partitionPath = hdfsEnvironment.getFileSystemWrapper().wrap(path);

            FileSystem fs = partitionPath.getFileSystem(configuration);
            final LastSplitMarkingQueue markerQueue = new LastSplitMarkingQueue(hiveSplitQueue);

            if (inputFormat instanceof SymlinkTextInputFormat) {
                JobConf jobConf = new JobConf(configuration);
                FileInputFormat.setInputPaths(jobConf, partitionPath);
                InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
                for (InputSplit rawSplit : splits) {
                    FileSplit split = ((SymlinkTextInputFormat.SymlinkTextInputSplit) rawSplit).getTargetSplit();

                    // get the filesystem for the target path -- it may be a different hdfs instance
                    FileSystem targetFilesystem = split.getPath().getFileSystem(configuration);
                    FileStatus fileStatus = targetFilesystem.getFileStatus(split.getPath());
                    markerQueue.addToQueue(createHiveSplits(partitionName, fileStatus,
                            targetFilesystem.getFileBlockLocations(fileStatus, split.getStart(), split.getLength()),
                            split.getStart(), split.getLength(), schema, partitionKeys, false));
                }
                markerQueue.finish();
                continue;
            }

            ListenableFuture<Void> partitionFuture = new AsyncRecursiveWalker(fs, suspendingExecutor)
                    .beginWalk(partitionPath, new FileStatusCallback() {
                        @Override
                        public void process(FileStatus file, BlockLocation[] blockLocations)
                        {
                            if (bucket.isPresent() && !fileMatchesBucket(file.getPath().getName(), bucket.get())) {
                                return;
                            }

                            try {
                                boolean splittable = isSplittable(inputFormat,
                                        file.getPath().getFileSystem(configuration), file.getPath());

                                markerQueue.addToQueue(createHiveSplits(partitionName, file, blockLocations,
                                        0, file.getLen(), schema, partitionKeys, splittable));
                            }
                            catch (IOException e) {
                                hiveSplitQueue.fail(e);
                            }
                        }
                    });

            // release the semaphore when the partition finishes
            Futures.addCallback(partitionFuture, new FutureCallback<Void>() {
                @Override
                public void onSuccess(Void result)
                {
                    markerQueue.finish();
                    semaphore.release();
                }

                @Override
                public void onFailure(Throwable t)
                {
                    markerQueue.finish();
                    semaphore.release();
                }
            });

            futureBuilder.add(partitionFuture);
        }

        // when all partitions finish, mark the queue as finished
        Futures.addCallback(Futures.allAsList(futureBuilder.build()), new FutureCallback<List<Void>>() {
            @Override
            public void onSuccess(List<Void> result)
            {
                hiveSplitQueue.finished();
            }

            @Override
            public void onFailure(Throwable t)
            {
                hiveSplitQueue.fail(t);
            }
        });
    }
    catch (Throwable e) {
        hiveSplitQueue.fail(e);
        Throwables.propagateIfInstanceOf(e, Error.class);
    }
}
From source file:com.facebook.presto.hive.HiveSplitSourceProvider.java
License:Apache License
private void loadPartitionSplits(final HiveSplitSource hiveSplitSource, SuspendingExecutor suspendingExecutor,
        final ConnectorSession session)
{
    final Semaphore semaphore = new Semaphore(maxPartitionBatchSize);
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) {
        ImmutableList.Builder<ListenableFuture<Void>> futureBuilder = ImmutableList.builder();

        Iterator<String> nameIterator = partitionNames.iterator();
        for (Partition partition : partitions) {
            checkState(nameIterator.hasNext(), "different number of partitions and partition names!");
            final String partitionName = nameIterator.next();
            final Properties schema = getPartitionSchema(table, partition);
            final List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition);

            Path path = new Path(getPartitionLocation(table, partition));
            final Configuration configuration = hdfsEnvironment.getConfiguration(path);
            final InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);

            FileSystem fs = path.getFileSystem(configuration);

            if (inputFormat instanceof SymlinkTextInputFormat) {
                JobConf jobConf = new JobConf(configuration);
                FileInputFormat.setInputPaths(jobConf, path);
                InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
                for (InputSplit rawSplit : splits) {
                    FileSplit split = ((SymlinkTextInputFormat.SymlinkTextInputSplit) rawSplit).getTargetSplit();

                    // get the filesystem for the target path -- it may be a different hdfs instance
                    FileSystem targetFilesystem = split.getPath().getFileSystem(configuration);
                    FileStatus fileStatus = targetFilesystem.getFileStatus(split.getPath());
                    hiveSplitSource.addToQueue(createHiveSplits(partitionName, fileStatus,
                            targetFilesystem.getFileBlockLocations(fileStatus, split.getStart(), split.getLength()),
                            split.getStart(), split.getLength(), schema, partitionKeys, false, session));
                }
                continue;
            }

            // TODO: this is currently serial across all partitions and should be done in suspendingExecutor
            if (bucket.isPresent()) {
                Optional<FileStatus> bucketFile = getBucketFile(bucket.get(), fs, path);
                if (bucketFile.isPresent()) {
                    FileStatus file = bucketFile.get();
                    BlockLocation[] blockLocations = fs.getFileBlockLocations(file, 0, file.getLen());
                    boolean splittable = isSplittable(inputFormat, fs, file.getPath());

                    hiveSplitSource.addToQueue(createHiveSplits(partitionName, file, blockLocations,
                            0, file.getLen(), schema, partitionKeys, splittable, session));
                    continue;
                }
            }

            // Acquire semaphore so that we only have a fixed number of outstanding partitions being processed asynchronously
            // NOTE: there must not be any calls that throw in the space between acquiring the semaphore and setting the Future
            // callback to release it. Otherwise, we will need a try-finally block around this section.
            try {
                semaphore.acquire();
            }
            catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }

            ListenableFuture<Void> partitionFuture = createAsyncWalker(fs, suspendingExecutor).beginWalk(path,
                    new FileStatusCallback() {
                        @Override
                        public void process(FileStatus file, BlockLocation[] blockLocations)
                        {
                            try {
                                boolean splittable = isSplittable(inputFormat,
                                        file.getPath().getFileSystem(configuration), file.getPath());

                                hiveSplitSource.addToQueue(createHiveSplits(partitionName, file, blockLocations,
                                        0, file.getLen(), schema, partitionKeys, splittable, session));
                            }
                            catch (IOException e) {
                                hiveSplitSource.fail(e);
                            }
                        }
                    });

            // release the semaphore when the partition finishes
            Futures.addCallback(partitionFuture, new FutureCallback<Void>() {
                @Override
                public void onSuccess(Void result)
                {
                    semaphore.release();
                }

                @Override
                public void onFailure(Throwable t)
                {
                    semaphore.release();
                }
            });
            futureBuilder.add(partitionFuture);
        }

        // when all partitions finish, mark the queue as finished
        Futures.addCallback(Futures.allAsList(futureBuilder.build()), new FutureCallback<List<Void>>() {
            @Override
            public void onSuccess(List<Void> result)
            {
                hiveSplitSource.finished();
            }

            @Override
            public void onFailure(Throwable t)
            {
                hiveSplitSource.fail(t);
            }
        });
    }
    catch (Throwable e) {
        hiveSplitSource.fail(e);
        Throwables.propagateIfInstanceOf(e, Error.class);
    }
}
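Both split-loading examples above use the same JobConf idiom for symlink tables: wrap the Configuration, point FileInputFormat at the partition path, and ask the InputFormat for its splits. The following is a minimal, generic sketch of that idiom outside Presto; the path is hypothetical and TextInputFormat stands in for whatever InputFormat the table actually uses.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SplitListingExample {
    public static void main(String[] args) throws IOException {
        // Wrap the current Hadoop settings in a JobConf and point it at a (hypothetical) input path
        JobConf jobConf = new JobConf(new Configuration());
        FileInputFormat.setInputPaths(jobConf, new Path("hdfs://namenode.example.com/warehouse/table/part=1"));

        // TextInputFormat is only a stand-in for the table's real InputFormat
        TextInputFormat inputFormat = new TextInputFormat();
        inputFormat.configure(jobConf);

        // numSplits = 0 lets the input format choose the number of splits itself
        InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
        for (InputSplit split : splits) {
            System.out.println(split);
        }
    }
}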
From source file:com.facebook.presto.hive.HiveUtil.java
License:Apache License
static InputFormat<?, ?> getInputFormat(Configuration configuration, Properties schema, boolean symlinkTarget)
{
    String inputFormatName = getInputFormatName(schema);
    try {
        JobConf jobConf = new JobConf(configuration);

        Class<? extends InputFormat<?, ?>> inputFormatClass = getInputFormatClass(jobConf, inputFormatName);
        if (symlinkTarget && (inputFormatClass == SymlinkTextInputFormat.class)) {
            // symlink targets are always TextInputFormat
            inputFormatClass = TextInputFormat.class;
        }

        return ReflectionUtils.newInstance(inputFormatClass, jobConf);
    }
    catch (ClassNotFoundException | RuntimeException e) {
        throw new RuntimeException("Unable to create input format " + inputFormatName, e);
    }
}
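In the HiveUtil example the JobConf serves two roles: it is used to resolve the InputFormat class and it is the Configuration handed to ReflectionUtils.newInstance, so the instantiated format sees the job settings. A sketch of that instantiation step is below, with TextInputFormat standing in for the class that would normally come from the table's metadata.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class InputFormatInstantiationExample {
    public static void main(String[] args) {
        JobConf jobConf = new JobConf(new Configuration());

        // The real class would be looked up from table metadata; TextInputFormat is a stand-in here
        Class<? extends InputFormat<?, ?>> inputFormatClass = TextInputFormat.class;

        // ReflectionUtils passes the JobConf to the new instance (configure(JobConf) is called if supported)
        InputFormat<?, ?> inputFormat = ReflectionUtils.newInstance(inputFormatClass, jobConf);
        System.out.println(inputFormat.getClass().getName());
    }
}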
From source file:com.facebook.presto.hive.HiveWriterFactory.java
License:Apache License
public HiveWriterFactory(String schemaName, String tableName, boolean isCreateTable,
        List<HiveColumnHandle> inputColumns, HiveStorageFormat tableStorageFormat,
        HiveStorageFormat partitionStorageFormat, OptionalInt bucketCount, LocationHandle locationHandle,
        LocationService locationService, String filePrefix,
        HivePageSinkMetadataProvider pageSinkMetadataProvider, TypeManager typeManager,
        HdfsEnvironment hdfsEnvironment, boolean immutablePartitions, ConnectorSession session)
{
    this.schemaName = requireNonNull(schemaName, "schemaName is null");
    this.tableName = requireNonNull(tableName, "tableName is null");

    this.tableStorageFormat = requireNonNull(tableStorageFormat, "tableStorageFormat is null");
    this.partitionStorageFormat = requireNonNull(partitionStorageFormat, "partitionStorageFormat is null");
    this.locationHandle = requireNonNull(locationHandle, "locationHandle is null");
    this.locationService = requireNonNull(locationService, "locationService is null");
    this.filePrefix = requireNonNull(filePrefix, "filePrefix is null");

    this.pageSinkMetadataProvider = requireNonNull(pageSinkMetadataProvider, "pageSinkMetadataProvider is null");

    this.typeManager = requireNonNull(typeManager, "typeManager is null");
    this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
    this.immutablePartitions = immutablePartitions;

    // divide input columns into partition and data columns
    requireNonNull(inputColumns, "inputColumns is null");
    ImmutableList.Builder<String> partitionColumnNames = ImmutableList.builder();
    ImmutableList.Builder<Type> partitionColumnTypes = ImmutableList.builder();
    ImmutableList.Builder<DataColumn> dataColumns = ImmutableList.builder();
    for (HiveColumnHandle column : inputColumns) {
        HiveType hiveType = column.getHiveType();
        if (column.isPartitionKey()) {
            partitionColumnNames.add(column.getName());
            partitionColumnTypes.add(typeManager.getType(column.getTypeSignature()));
        }
        else {
            dataColumns.add(new DataColumn(column.getName(), typeManager.getType(column.getTypeSignature()), hiveType));
        }
    }
    this.partitionColumnNames = partitionColumnNames.build();
    this.partitionColumnTypes = partitionColumnTypes.build();
    this.dataColumns = dataColumns.build();

    if (isCreateTable) {
        this.table = null;
        Optional<Path> writePath = locationService.writePathRoot(locationHandle);
        checkArgument(writePath.isPresent(), "CREATE TABLE must have a write path");
        conf = new JobConf(hdfsEnvironment.getConfiguration(writePath.get()));
    }
    else {
        Optional<Table> table = pageSinkMetadataProvider.getTable();
        if (!table.isPresent()) {
            throw new PrestoException(HIVE_INVALID_METADATA,
                    format("Table %s.%s was dropped during insert", schemaName, tableName));
        }
        this.table = table.get();
        Path hdfsEnvironmentPath = locationService.writePathRoot(locationHandle)
                .orElseGet(() -> locationService.targetPathRoot(locationHandle));
        conf = new JobConf(hdfsEnvironment.getConfiguration(hdfsEnvironmentPath));
    }

    this.bucketCount = requireNonNull(bucketCount, "bucketCount is null");
    if (bucketCount.isPresent()) {
        checkArgument(bucketCount.getAsInt() < MAX_BUCKET_COUNT,
                "bucketCount must be smaller than " + MAX_BUCKET_COUNT);
    }

    this.session = requireNonNull(session, "session is null");
}