List of usage examples for org.apache.hadoop.fs Path getPathWithoutSchemeAndAuthority
public static Path getPathWithoutSchemeAndAuthority(Path path)
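For orientation, here is a minimal, self-contained sketch of what the method does before the project-specific examples below. The HDFS URI used here is a hypothetical illustration, not taken from any of the listed projects: the call returns a new Path containing only the path component, with the scheme (e.g. hdfs) and authority (host and port) removed.

import org.apache.hadoop.fs.Path;

public class GetPathWithoutSchemeAndAuthorityExample {
  public static void main(String[] args) {
    // Hypothetical fully qualified path with scheme ("hdfs") and authority ("namenode:8020")
    Path qualified = new Path("hdfs://namenode:8020/user/drill/data/file.parquet");

    // Strips the scheme and authority, keeping only the path component
    Path bare = Path.getPathWithoutSchemeAndAuthority(qualified);

    System.out.println(bare); // expected output: /user/drill/data/file.parquet
  }
}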
From source file:org.apache.drill.exec.store.ImplicitColumnExplorer.java
License:Apache License
/**
 * Compares selection root and actual file path to determine partition columns values.
 * Adds implicit file columns according to columns list.
 *
 * @return map with columns names as keys and their values
 */
public Map<String, String> populateImplicitColumns(FileWork work, String selectionRoot) {
  Map<String, String> implicitValues = Maps.newLinkedHashMap();
  if (selectionRoot != null) {
    String[] r = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot)).toString().split("/");
    Path path = Path.getPathWithoutSchemeAndAuthority(new Path(work.getPath()));
    String[] p = path.toString().split("/");
    if (p.length > r.length) {
      String[] q = ArrayUtils.subarray(p, r.length, p.length - 1);
      for (int a = 0; a < q.length; a++) {
        if (isStarQuery || selectedPartitionColumns.contains(a)) {
          implicitValues.put(partitionDesignator + a, q[a]);
        }
      }
    }
    // add implicit file columns
    for (Map.Entry<String, ImplicitFileColumns> entry : selectedImplicitColumns.entrySet()) {
      implicitValues.put(entry.getKey(), entry.getValue().getValue(path));
    }
  }
  return implicitValues;
}
From source file:org.apache.drill.exec.store.LocalSyncableFileSystem.java
License:Apache License
@Override
public FileStatus getFileStatus(Path path) throws IOException {
  File file = new File(Path.getPathWithoutSchemeAndAuthority(path).toString());
  return new FileStatus(file.length(), file.isDirectory(), 1, 0, file.lastModified(), path);
}
From source file:org.apache.drill.exec.store.parquet.metadata.Metadata.java
License:Apache License
/**
 * Get the metadata for a single file
 */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata,
    final FileStatus file, final FileSystem fs, boolean allColumns, Set<String> columnSet)
    throws IOException, InterruptedException {
  final ParquetMetadata metadata;
  final UserGroupInformation processUserUgi = ImpersonationUtil.getProcessUserUGI();
  final Configuration conf = new Configuration(fs.getConf());
  try {
    metadata = processUserUgi.doAs((PrivilegedExceptionAction<ParquetMetadata>) () -> {
      try (ParquetFileReader parquetFileReader = ParquetFileReader
          .open(HadoopInputFile.fromStatus(file, conf), readerConfig.toReadOptions())) {
        return parquetFileReader.getFooter();
      }
    });
  } catch (Exception e) {
    logger.error(
        "Exception while reading footer of parquet file [Details - path: {}, owner: {}] as process user {}",
        file.getPath(), file.getOwner(), processUserUgi.getShortUserName(), e);
    throw e;
  }

  MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, ColTypeInfo> colTypeInfoMap = new HashMap<>();
  schema.getPaths();
  for (String[] path : schema.getPaths()) {
    colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
  }

  List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(SchemaPath.STAR_COLUMN);
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility
      .detectCorruptDates(metadata, ALL_COLS, readerConfig.autoCorrectCorruptedDates());
  logger.debug("Contains corrupt dates: {}.", containsCorruptDates);

  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata_v3> columnMetadataList = new ArrayList<>();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);

      ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName,
          col.getPrimitiveType().getPrimitiveTypeName(), colTypeInfo.originalType, colTypeInfo.precision,
          colTypeInfo.scale, colTypeInfo.repetitionLevel, colTypeInfo.definitionLevel);

      if (parquetTableMetadata.columnTypeInfo == null) {
        parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
      }
      parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name),
          columnTypeMetadata);

      // Store column metadata only if allColumns is set to true or if the column belongs
      // to the subset of columns specified in the refresh command
      if (allColumns || columnSet == null || !allColumns && columnSet != null && columnSet.size() > 0
          && columnSet.contains(columnSchemaName.getRootSegmentPath())) {
        Statistics<?> stats = col.getStatistics();
        // Save the column schema info. We'll merge it into one list
        Object minValue = null;
        Object maxValue = null;
        long numNulls = -1;
        boolean statsAvailable = stats != null && !stats.isEmpty();
        if (statsAvailable) {
          if (stats.hasNonNullValue()) {
            minValue = stats.genericGetMin();
            maxValue = stats.genericGetMax();
            if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
                && columnTypeMetadata.originalType == OriginalType.DATE) {
              minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
              maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
            }
          }
          numNulls = stats.getNumNulls();
        }
        ColumnMetadata_v3 columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name,
            col.getPrimitiveType().getPrimitiveTypeName(), minValue, maxValue, numNulls);
        columnMetadataList.add(columnMetadata);
      }
      length += col.getTotalSize();
    }

    // DRILL-5009: Skip the RowGroup if it is empty
    // Note we still read the schema even if there are no values in the RowGroup
    if (rowGroup.getRowCount() == 0) {
      continue;
    }
    RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length,
        rowGroup.getRowCount(), getHostAffinity(file, fs, rowGroup.getStartingPos(), length),
        columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
  }
  Path path = Path.getPathWithoutSchemeAndAuthority(file.getPath());

  return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}
From source file:org.apache.drill.exec.store.parquet.metadata.Metadata.java
License:Apache License
/**
 * Read the parquet metadata from a file
 *
 * @param path to metadata file
 * @param dirsOnly true for {@link Metadata#METADATA_DIRECTORIES_FILENAME}
 *                 or false for {@link Metadata#METADATA_FILENAME} files reading
 * @param metaContext current metadata context
 */
private void readBlockMeta(Path path, boolean dirsOnly, MetadataContext metaContext, FileSystem fs) {
  Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
  Path metadataParentDir = Path.getPathWithoutSchemeAndAuthority(path.getParent());
  String metadataParentDirPath = metadataParentDir.toUri().getPath();
  ObjectMapper mapper = new ObjectMapper();

  final SimpleModule serialModule = new SimpleModule();
  serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
  serialModule.addKeyDeserializer(Metadata_V2.ColumnTypeMetadata_v2.Key.class,
      new Metadata_V2.ColumnTypeMetadata_v2.Key.DeSerializer());
  serialModule.addKeyDeserializer(ColumnTypeMetadata_v3.Key.class,
      new ColumnTypeMetadata_v3.Key.DeSerializer());

  AfterburnerModule module = new AfterburnerModule();
  module.setUseOptimizedBeanDeserializer(true);

  mapper.registerModule(serialModule);
  mapper.registerModule(module);
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
  try (InputStream is = fs.open(path)) {
    boolean alreadyCheckedModification;
    boolean newMetadata = false;
    alreadyCheckedModification = metaContext.getStatus(metadataParentDirPath);

    if (dirsOnly) {
      parquetTableMetadataDirs = mapper.readValue(is, ParquetTableMetadataDirs.class);
      if (timer != null) {
        logger.debug("Took {} ms to read directories from directory cache file",
            timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
      }
      parquetTableMetadataDirs.updateRelativePaths(metadataParentDirPath);
      if (!alreadyCheckedModification && tableModified(parquetTableMetadataDirs.getDirectories(), path,
          metadataParentDir, metaContext, fs)) {
        parquetTableMetadataDirs = (createMetaFilesRecursively(
            Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null)).getRight();
        newMetadata = true;
      }
    } else {
      parquetTableMetadata = mapper.readValue(is, ParquetTableMetadataBase.class);
      if (timer != null) {
        logger.debug("Took {} ms to read metadata from cache file", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
      }
      if (new MetadataVersion(parquetTableMetadata.getMetadataVersion())
          .compareTo(new MetadataVersion(3, 0)) >= 0) {
        ((ParquetTableMetadata_v3) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
      }
      if (!alreadyCheckedModification && tableModified(parquetTableMetadata.getDirectories(), path,
          metadataParentDir, metaContext, fs)) {
        // TODO change with current columns in existing metadata (auto refresh feature)
        parquetTableMetadata = (createMetaFilesRecursively(
            Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null)).getLeft();
        newMetadata = true;
      }

      // DRILL-5009: Remove the RowGroup if it is empty
      List<? extends ParquetFileMetadata> files = parquetTableMetadata.getFiles();
      for (ParquetFileMetadata file : files) {
        List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
        rowGroups.removeIf(r -> r.getRowCount() == 0);
      }
    }
    if (newMetadata) {
      // if new metadata files were created, invalidate the existing metadata context
      metaContext.clear();
    }
  } catch (IOException e) {
    logger.error("Failed to read '{}' metadata file", path, e);
    metaContext.setMetadataCacheCorrupted(true);
  }
}
From source file:org.apache.drill.exec.store.parquet.metadata.MetadataPathUtils.java
License:Apache License
/**
 * Constructs a relative path from the child full path and the base path,
 * or returns the child path if it is already relative.
 *
 * @param childPath full absolute path
 * @param baseDir base path (the part of the Path, which should be cut off from child path)
 * @return relative path
 */
public static Path relativize(Path baseDir, Path childPath) {
  Path fullPathWithoutSchemeAndAuthority = Path.getPathWithoutSchemeAndAuthority(childPath);
  Path basePathWithoutSchemeAndAuthority = Path.getPathWithoutSchemeAndAuthority(baseDir);

  // Since hadoop Path hasn't relativize() we use uri.relativize() to get relative path
  Path relativeFilePath = new Path(
      basePathWithoutSchemeAndAuthority.toUri().relativize(fullPathWithoutSchemeAndAuthority.toUri()));
  if (relativeFilePath.isAbsolute()) {
    throw new IllegalStateException(String.format("Path %s is not a subpath of %s.",
        basePathWithoutSchemeAndAuthority.toUri().getPath(),
        fullPathWithoutSchemeAndAuthority.toUri().getPath()));
  }
  return relativeFilePath;
}
From source file:org.apache.drill.exec.store.parquet.Metadata.java
License:Apache License
/**
 * Get the metadata for a single file
 *
 * @param file
 * @return
 * @throws IOException
 */
private ParquetFileMetadata_v2 getParquetFileMetadata_v2(ParquetTableMetadata_v2 parquetTableMetadata,
    FileStatus file) throws IOException {
  ParquetMetadata metadata = ParquetFileReader.readFooter(fs.getConf(), file);
  MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  schema.getPaths();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata_v2> rowGroupMetadataList = Lists.newArrayList();

  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata_v2> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata_v2 columnMetadata;

      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());

      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata_v2 columnTypeMetadata = new ColumnTypeMetadata_v2(columnName, col.getType(),
          originalTypeMap.get(columnSchemaName));
      if (parquetTableMetadata.columnTypeInfo == null) {
        parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
      }
      // Save the column schema info. We'll merge it into one list
      parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v2.Key(columnTypeMetadata.name),
          columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null
            && stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
        }
        columnMetadata = new ColumnMetadata_v2(columnTypeMetadata.name, col.getType(), mxValue,
            stats.getNumNulls());
      } else {
        columnMetadata = new ColumnMetadata_v2(columnTypeMetadata.name, col.getType(), null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata_v2 rowGroupMeta = new RowGroupMetadata_v2(rowGroup.getStartingPos(), length,
        rowGroup.getRowCount(), getHostAffinity(file, rowGroup.getStartingPos(), length),
        columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
  }
  String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();

  return new ParquetFileMetadata_v2(path, file.getLen(), rowGroupMetadataList);
}
From source file:org.apache.drill.exec.store.parquet.Metadata.java
License:Apache License
/**
 * Read the parquet metadata from a file
 *
 * @param path
 * @return
 * @throws IOException
 */
private void readBlockMeta(String path, boolean dirsOnly, MetadataContext metaContext) throws IOException {
  Stopwatch timer = Stopwatch.createStarted();
  Path p = new Path(path);
  Path parentDir = p.getParent(); // parent directory of the metadata file
  ObjectMapper mapper = new ObjectMapper();

  final SimpleModule serialModule = new SimpleModule();
  serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
  serialModule.addKeyDeserializer(ColumnTypeMetadata_v2.Key.class,
      new ColumnTypeMetadata_v2.Key.DeSerializer());

  AfterburnerModule module = new AfterburnerModule();
  module.setUseOptimizedBeanDeserializer(true);

  mapper.registerModule(serialModule);
  mapper.registerModule(module);
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
  FSDataInputStream is = fs.open(p);

  boolean alreadyCheckedModification = false;
  boolean newMetadata = false;

  if (metaContext != null) {
    alreadyCheckedModification = metaContext.getStatus(parentDir.toString());
  }

  if (dirsOnly) {
    parquetTableMetadataDirs = mapper.readValue(is, ParquetTableMetadataDirs.class);
    logger.info("Took {} ms to read directories from directory cache file",
        timer.elapsed(TimeUnit.MILLISECONDS));
    timer.stop();
    if (!alreadyCheckedModification
        && tableModified(parquetTableMetadataDirs.getDirectories(), p, parentDir, metaContext)) {
      parquetTableMetadataDirs = (createMetaFilesRecursively(
          Path.getPathWithoutSchemeAndAuthority(p.getParent()).toString())).getRight();
      newMetadata = true;
    }
  } else {
    parquetTableMetadata = mapper.readValue(is, ParquetTableMetadataBase.class);
    logger.info("Took {} ms to read metadata from cache file", timer.elapsed(TimeUnit.MILLISECONDS));
    timer.stop();
    if (!alreadyCheckedModification
        && tableModified(parquetTableMetadata.getDirectories(), p, parentDir, metaContext)) {
      parquetTableMetadata = (createMetaFilesRecursively(
          Path.getPathWithoutSchemeAndAuthority(p.getParent()).toString())).getLeft();
      newMetadata = true;
    }
  }

  if (newMetadata && metaContext != null) {
    // if new metadata files were created, invalidate the existing metadata context
    metaContext.clear();
  }
}
From source file:org.apache.drill.exec.store.parquet.ParquetGroupScan.java
License:Apache License
public void populatePruningVector(ValueVector v, int index, SchemaPath column, String file) {
  String f = Path.getPathWithoutSchemeAndAuthority(new Path(file)).toString();
  MinorType type = getTypeForColumn(column).getMinorType();
  switch (type) {
  case INT: {
    NullableIntVector intVector = (NullableIntVector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    intVector.getMutator().setSafe(index, value);
    return;
  }
  case SMALLINT: {
    NullableSmallIntVector smallIntVector = (NullableSmallIntVector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    smallIntVector.getMutator().setSafe(index, value.shortValue());
    return;
  }
  case TINYINT: {
    NullableTinyIntVector tinyIntVector = (NullableTinyIntVector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    tinyIntVector.getMutator().setSafe(index, value.byteValue());
    return;
  }
  case UINT1: {
    NullableUInt1Vector intVector = (NullableUInt1Vector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    intVector.getMutator().setSafe(index, value.byteValue());
    return;
  }
  case UINT2: {
    NullableUInt2Vector intVector = (NullableUInt2Vector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    intVector.getMutator().setSafe(index, (char) value.shortValue());
    return;
  }
  case UINT4: {
    NullableUInt4Vector intVector = (NullableUInt4Vector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    intVector.getMutator().setSafe(index, value);
    return;
  }
  case BIGINT: {
    NullableBigIntVector bigIntVector = (NullableBigIntVector) v;
    Long value = (Long) partitionValueMap.get(f).get(column);
    bigIntVector.getMutator().setSafe(index, value);
    return;
  }
  case FLOAT4: {
    NullableFloat4Vector float4Vector = (NullableFloat4Vector) v;
    Float value = (Float) partitionValueMap.get(f).get(column);
    float4Vector.getMutator().setSafe(index, value);
    return;
  }
  case FLOAT8: {
    NullableFloat8Vector float8Vector = (NullableFloat8Vector) v;
    Double value = (Double) partitionValueMap.get(f).get(column);
    float8Vector.getMutator().setSafe(index, value);
    return;
  }
  case VARBINARY: {
    NullableVarBinaryVector varBinaryVector = (NullableVarBinaryVector) v;
    Object s = partitionValueMap.get(f).get(column);
    byte[] bytes;
    if (s instanceof Binary) {
      bytes = ((Binary) s).getBytes();
    } else if (s instanceof String) {
      bytes = ((String) s).getBytes();
    } else if (s instanceof byte[]) {
      bytes = (byte[]) s;
    } else {
      throw new UnsupportedOperationException("Unable to create column data for type: " + type);
    }
    varBinaryVector.getMutator().setSafe(index, bytes, 0, bytes.length);
    return;
  }
  case DECIMAL18: {
    NullableDecimal18Vector decimalVector = (NullableDecimal18Vector) v;
    Long value = (Long) partitionValueMap.get(f).get(column);
    decimalVector.getMutator().setSafe(index, value);
    return;
  }
  case DATE: {
    NullableDateVector dateVector = (NullableDateVector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    dateVector.getMutator().setSafe(index,
        DateTimeUtils.fromJulianDay(value - ParquetOutputRecordWriter.JULIAN_DAY_EPOC - 0.5));
    return;
  }
  case TIME: {
    NullableTimeVector timeVector = (NullableTimeVector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    timeVector.getMutator().setSafe(index, value);
    return;
  }
  case TIMESTAMP: {
    NullableTimeStampVector timeStampVector = (NullableTimeStampVector) v;
    Long value = (Long) partitionValueMap.get(f).get(column);
    timeStampVector.getMutator().setSafe(index, value);
    return;
  }
  case VARCHAR: {
    NullableVarCharVector varCharVector = (NullableVarCharVector) v;
    Object s = partitionValueMap.get(f).get(column);
    byte[] bytes;
    if (s instanceof String) { // if the metadata was read from a JSON cache file it may be a string type
      bytes = ((String) s).getBytes();
    } else if (s instanceof Binary) {
      bytes = ((Binary) s).getBytes();
    } else if (s instanceof byte[]) {
      bytes = (byte[]) s;
    } else {
      throw new UnsupportedOperationException("Unable to create column data for type: " + type);
    }
    varCharVector.getMutator().setSafe(index, bytes, 0, bytes.length);
    return;
  }
  default:
    throw new UnsupportedOperationException("Unsupported type: " + type);
  }
}
From source file:org.apache.drill.exec.store.parquet.ParquetGroupScan.java
License:Apache License
/**
 * Create and return a new file selection based on reading the metadata cache file.
 *
 * This function also initializes a few of ParquetGroupScan's fields as appropriate.
 *
 * @param selection initial file selection
 * @param metaFilePath metadata cache file path
 * @return file selection read from cache
 *
 * @throws IOException
 * @throws UserException when the updated selection is empty, this happens if the user selects an empty folder.
 */
private FileSelection initFromMetadataCache(FileSelection selection, Path metaFilePath) throws IOException {
  // get the metadata for the root directory by reading the metadata file
  // parquetTableMetadata contains the metadata for all files in the selection root folder, but we need to make sure
  // we only select the files that are part of selection (by setting fileSet appropriately)

  // get (and set internal field) the metadata for the directory by reading the metadata file
  this.parquetTableMetadata = Metadata.readBlockMeta(fs, metaFilePath.toString(), selection.getMetaContext());

  List<FileStatus> fileStatuses = selection.getStatuses(fs);

  if (fileSet == null) {
    fileSet = Sets.newHashSet();
  }

  final Path first = fileStatuses.get(0).getPath();
  if (fileStatuses.size() == 1 && selection.getSelectionRoot().equals(first.toString())) {
    // we are selecting all files from selection root. Expand the file list from the cache
    for (Metadata.ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
      fileSet.add(file.getPath());
    }
  } else if (selection.isExpandedPartial() && !selection.hadWildcard() && cacheFileRoot != null) {
    if (selection.wasAllPartitionsPruned()) {
      // if all partitions were previously pruned, we only need to read 1 file (for the schema)
      fileSet.add(this.parquetTableMetadata.getFiles().get(0).getPath());
    } else {
      // we are here if the selection is in the expanded_partial state (i.e it has directories). We get the
      // list of files from the metadata cache file that is present in the cacheFileRoot directory and populate
      // the fileSet. However, this is *not* the final list of files that will be scanned in execution since the
      // second phase of partition pruning will apply on the files and modify the file selection appropriately.
      for (Metadata.ParquetFileMetadata file : this.parquetTableMetadata.getFiles()) {
        fileSet.add(file.getPath());
      }
    }
  } else {
    // we need to expand the files from fileStatuses
    for (FileStatus status : fileStatuses) {
      if (status.isDirectory()) {
        //TODO [DRILL-4496] read the metadata cache files in parallel
        final Path metaPath = new Path(status.getPath(), Metadata.METADATA_FILENAME);
        final Metadata.ParquetTableMetadataBase metadata = Metadata.readBlockMeta(fs, metaPath.toString(),
            selection.getMetaContext());
        for (Metadata.ParquetFileMetadata file : metadata.getFiles()) {
          fileSet.add(file.getPath());
        }
      } else {
        final Path path = Path.getPathWithoutSchemeAndAuthority(status.getPath());
        fileSet.add(path.toString());
      }
    }
  }

  if (fileSet.isEmpty()) {
    // no files were found, most likely we tried to query some empty sub folders
    throw UserException.validationError().message("The table you tried to query is empty").build(logger);
  }

  List<String> fileNames = Lists.newArrayList(fileSet);

  // when creating the file selection, set the selection root without the URI prefix
  // The reason is that the file names above have been created in the form
  // /a/b/c.parquet and the format of the selection root must match that of the file names
  // otherwise downstream operations such as partition pruning can break.
  final Path metaRootPath = Path.getPathWithoutSchemeAndAuthority(new Path(selection.getSelectionRoot()));
  this.selectionRoot = metaRootPath.toString();

  // Use the FileSelection constructor directly here instead of the FileSelection.create() method
  // because create() changes the root to include the scheme and authority; In future, if create()
  // is the preferred way to instantiate a file selection, we may need to do something different...
  // WARNING: file statuses and file names are inconsistent
  FileSelection newSelection = new FileSelection(selection.getStatuses(fs), fileNames, metaRootPath.toString(),
      cacheFileRoot, selection.wasAllPartitionsPruned());

  newSelection.setExpandedFully();
  newSelection.setMetaContext(selection.getMetaContext());
  return newSelection;
}
From source file:org.apache.drill.exec.store.parquet.ParquetGroupScan.java
License:Apache License
private void init(MetadataContext metaContext) throws IOException {
  if (entries.size() == 1 && parquetTableMetadata == null) {
    Path p = Path.getPathWithoutSchemeAndAuthority(new Path(entries.get(0).getPath()));
    Path metaPath = null;
    if (fs.isDirectory(p)) {
      // Using the metadata file makes sense when querying a directory; otherwise
      // if querying a single file we can look up the metadata directly from the file
      metaPath = new Path(p, Metadata.METADATA_FILENAME);
    }
    if (metaPath != null && fs.exists(metaPath)) {
      usedMetadataCache = true;
      parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext);
    } else {
      parquetTableMetadata = Metadata.getParquetTableMetadata(fs, p.toString());
    }
  } else {
    Path p = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot));
    Path metaPath = new Path(p, Metadata.METADATA_FILENAME);
    if (fs.isDirectory(new Path(selectionRoot)) && fs.exists(metaPath)) {
      usedMetadataCache = true;
      if (parquetTableMetadata == null) {
        parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext);
      }
      if (fileSet != null) {
        parquetTableMetadata = removeUnneededRowGroups(parquetTableMetadata);
      }
    } else {
      final List<FileStatus> fileStatuses = Lists.newArrayList();
      for (ReadEntryWithPath entry : entries) {
        getFiles(entry.getPath(), fileStatuses);
      }
      parquetTableMetadata = Metadata.getParquetTableMetadata(fs, fileStatuses);
    }
  }

  if (fileSet == null) {
    fileSet = Sets.newHashSet();
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
      fileSet.add(file.getPath());
    }
  }

  Map<String, DrillbitEndpoint> hostEndpointMap = Maps.newHashMap();

  for (DrillbitEndpoint endpoint : formatPlugin.getContext().getBits()) {
    hostEndpointMap.put(endpoint.getAddress(), endpoint);
  }

  rowGroupInfos = Lists.newArrayList();
  for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
    int rgIndex = 0;
    for (RowGroupMetadata rg : file.getRowGroups()) {
      RowGroupInfo rowGroupInfo = new RowGroupInfo(file.getPath(), rg.getStart(), rg.getLength(), rgIndex,
          rg.getRowCount());
      EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
      for (String host : rg.getHostAffinity().keySet()) {
        if (hostEndpointMap.containsKey(host)) {
          endpointByteMap.add(hostEndpointMap.get(host),
              (long) (rg.getHostAffinity().get(host) * rg.getLength()));
        }
      }
      rowGroupInfo.setEndpointByteMap(endpointByteMap);
      rgIndex++;
      rowGroupInfos.add(rowGroupInfo);
    }
  }

  this.endpointAffinities = AffinityCreator.getAffinityMap(rowGroupInfos);

  columnValueCounts = Maps.newHashMap();
  this.rowCount = 0;
  boolean first = true;
  for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
    for (RowGroupMetadata rowGroup : file.getRowGroups()) {
      long rowCount = rowGroup.getRowCount();
      for (ColumnMetadata column : rowGroup.getColumns()) {
        SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getName());
        Long previousCount = columnValueCounts.get(schemaPath);
        if (previousCount != null) {
          if (previousCount != GroupScan.NO_COLUMN_STATS) {
            if (column.getNulls() != null) {
              Long newCount = rowCount - column.getNulls();
              columnValueCounts.put(schemaPath, columnValueCounts.get(schemaPath) + newCount);
            }
          }
        } else {
          if (column.getNulls() != null) {
            Long newCount = rowCount - column.getNulls();
            columnValueCounts.put(schemaPath, newCount);
          } else {
            columnValueCounts.put(schemaPath, GroupScan.NO_COLUMN_STATS);
          }
        }
        boolean partitionColumn = checkForPartitionColumn(column, first);
        if (partitionColumn) {
          Map<SchemaPath, Object> map = partitionValueMap.get(file.getPath());
          if (map == null) {
            map = Maps.newHashMap();
            partitionValueMap.put(file.getPath(), map);
          }
          Object value = map.get(schemaPath);
          Object currentValue = column.getMaxValue();
          if (value != null) {
            if (value != currentValue) {
              columnTypeMap.remove(schemaPath);
            }
          } else {
            map.put(schemaPath, currentValue);
          }
        } else {
          columnTypeMap.remove(schemaPath);
        }
      }
      this.rowCount += rowGroup.getRowCount();
      first = false;
    }
  }
}