List of usage examples for org.apache.hadoop.fs Path getPathWithoutSchemeAndAuthority
public static Path getPathWithoutSchemeAndAuthority(Path path)
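For orientation, here is a minimal, self-contained sketch of what the method does before the project-specific examples below. The HDFS URI used here is a hypothetical illustration, not taken from any of the listed projects: the call returns a new Path containing only the path component, with the scheme (e.g. hdfs) and authority (host and port) removed.

import org.apache.hadoop.fs.Path;

public class GetPathWithoutSchemeAndAuthorityExample {
  public static void main(String[] args) {
    // Hypothetical fully qualified path with scheme ("hdfs") and authority ("namenode:8020")
    Path qualified = new Path("hdfs://namenode:8020/user/drill/data/file.parquet");

    // Strips the scheme and authority, keeping only the path component
    Path bare = Path.getPathWithoutSchemeAndAuthority(qualified);

    System.out.println(bare); // expected output: /user/drill/data/file.parquet
  }
}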
From source file:org.apache.drill.exec.store.ImplicitColumnExplorer.java
License:Apache License
/**
 * Compares selection root and actual file path to determine partition columns values.
 * Adds implicit file columns according to columns list.
 *
 * @return map with columns names as keys and their values
 */
public Map<String, String> populateImplicitColumns(FileWork work, String selectionRoot) {
  Map<String, String> implicitValues = Maps.newLinkedHashMap();
  if (selectionRoot != null) {
    String[] r = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot)).toString().split("/");
    Path path = Path.getPathWithoutSchemeAndAuthority(new Path(work.getPath()));
    String[] p = path.toString().split("/");
    if (p.length > r.length) {
      String[] q = ArrayUtils.subarray(p, r.length, p.length - 1);
      for (int a = 0; a < q.length; a++) {
        if (isStarQuery || selectedPartitionColumns.contains(a)) {
          implicitValues.put(partitionDesignator + a, q[a]);
        }
      }
    }
    // add implicit file columns
    for (Map.Entry<String, ImplicitFileColumns> entry : selectedImplicitColumns.entrySet()) {
      implicitValues.put(entry.getKey(), entry.getValue().getValue(path));
    }
  }
  return implicitValues;
}
From source file:org.apache.drill.exec.store.LocalSyncableFileSystem.java
License:Apache License
@Override
public FileStatus getFileStatus(Path path) throws IOException {
  File file = new File(Path.getPathWithoutSchemeAndAuthority(path).toString());
  return new FileStatus(file.length(), file.isDirectory(), 1, 0, file.lastModified(), path);
}
From source file:org.apache.drill.exec.store.parquet.metadata.Metadata.java
License:Apache License
/**
 * Get the metadata for a single file
 */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata,
    final FileStatus file, final FileSystem fs, boolean allColumns, Set<String> columnSet)
    throws IOException, InterruptedException {
  final ParquetMetadata metadata;
  final UserGroupInformation processUserUgi = ImpersonationUtil.getProcessUserUGI();
  final Configuration conf = new Configuration(fs.getConf());
  try {
    metadata = processUserUgi.doAs((PrivilegedExceptionAction<ParquetMetadata>) () -> {
      try (ParquetFileReader parquetFileReader = ParquetFileReader
          .open(HadoopInputFile.fromStatus(file, conf), readerConfig.toReadOptions())) {
        return parquetFileReader.getFooter();
      }
    });
  } catch (Exception e) {
    logger.error(
        "Exception while reading footer of parquet file [Details - path: {}, owner: {}] as process user {}",
        file.getPath(), file.getOwner(), processUserUgi.getShortUserName(), e);
    throw e;
  }

  MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, ColTypeInfo> colTypeInfoMap = new HashMap<>();
  schema.getPaths();
  for (String[] path : schema.getPaths()) {
    colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
  }

  List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(SchemaPath.STAR_COLUMN);
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility
      .detectCorruptDates(metadata, ALL_COLS, readerConfig.autoCorrectCorruptedDates());
  logger.debug("Contains corrupt dates: {}.", containsCorruptDates);

  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata_v3> columnMetadataList = new ArrayList<>();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);

      ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName,
          col.getPrimitiveType().getPrimitiveTypeName(), colTypeInfo.originalType, colTypeInfo.precision,
          colTypeInfo.scale, colTypeInfo.repetitionLevel, colTypeInfo.definitionLevel);

      if (parquetTableMetadata.columnTypeInfo == null) {
        parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
      }
      parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name),
          columnTypeMetadata);

      // Store column metadata only if allColumns is set to true or if the column belongs
      // to the subset of columns specified in the refresh command
      if (allColumns || columnSet == null || !allColumns && columnSet != null && columnSet.size() > 0
          && columnSet.contains(columnSchemaName.getRootSegmentPath())) {
        Statistics<?> stats = col.getStatistics();
        // Save the column schema info. We'll merge it into one list
        Object minValue = null;
        Object maxValue = null;
        long numNulls = -1;
        boolean statsAvailable = stats != null && !stats.isEmpty();
        if (statsAvailable) {
          if (stats.hasNonNullValue()) {
            minValue = stats.genericGetMin();
            maxValue = stats.genericGetMax();
            if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
                && columnTypeMetadata.originalType == OriginalType.DATE) {
              minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
              maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
            }
          }
          numNulls = stats.getNumNulls();
        }
        ColumnMetadata_v3 columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name,
            col.getPrimitiveType().getPrimitiveTypeName(), minValue, maxValue, numNulls);
        columnMetadataList.add(columnMetadata);
      }
      length += col.getTotalSize();
    }

    // DRILL-5009: Skip the RowGroup if it is empty
    // Note we still read the schema even if there are no values in the RowGroup
    if (rowGroup.getRowCount() == 0) {
      continue;
    }
    RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length,
        rowGroup.getRowCount(), getHostAffinity(file, fs, rowGroup.getStartingPos(), length),
        columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
  }
  Path path = Path.getPathWithoutSchemeAndAuthority(file.getPath());

  return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}
From source file:org.apache.drill.exec.store.parquet.metadata.Metadata.java
License:Apache License
/**
 * Read the parquet metadata from a file
 *
 * @param path to metadata file
 * @param dirsOnly true for {@link Metadata#METADATA_DIRECTORIES_FILENAME}
 *                 or false for {@link Metadata#METADATA_FILENAME} files reading
 * @param metaContext current metadata context
 */
private void readBlockMeta(Path path, boolean dirsOnly, MetadataContext metaContext, FileSystem fs) {
  Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
  Path metadataParentDir = Path.getPathWithoutSchemeAndAuthority(path.getParent());
  String metadataParentDirPath = metadataParentDir.toUri().getPath();
  ObjectMapper mapper = new ObjectMapper();

  final SimpleModule serialModule = new SimpleModule();
  serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
  serialModule.addKeyDeserializer(Metadata_V2.ColumnTypeMetadata_v2.Key.class,
      new Metadata_V2.ColumnTypeMetadata_v2.Key.DeSerializer());
  serialModule.addKeyDeserializer(ColumnTypeMetadata_v3.Key.class,
      new ColumnTypeMetadata_v3.Key.DeSerializer());

  AfterburnerModule module = new AfterburnerModule();
  module.setUseOptimizedBeanDeserializer(true);

  mapper.registerModule(serialModule);
  mapper.registerModule(module);
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
  try (InputStream is = fs.open(path)) {
    boolean alreadyCheckedModification;
    boolean newMetadata = false;
    alreadyCheckedModification = metaContext.getStatus(metadataParentDirPath);

    if (dirsOnly) {
      parquetTableMetadataDirs = mapper.readValue(is, ParquetTableMetadataDirs.class);
      if (timer != null) {
        logger.debug("Took {} ms to read directories from directory cache file",
            timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
      }
      parquetTableMetadataDirs.updateRelativePaths(metadataParentDirPath);
      if (!alreadyCheckedModification && tableModified(parquetTableMetadataDirs.getDirectories(), path,
          metadataParentDir, metaContext, fs)) {
        parquetTableMetadataDirs = (createMetaFilesRecursively(
            Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null)).getRight();
        newMetadata = true;
      }
    } else {
      parquetTableMetadata = mapper.readValue(is, ParquetTableMetadataBase.class);
      if (timer != null) {
        logger.debug("Took {} ms to read metadata from cache file", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
      }
      if (new MetadataVersion(parquetTableMetadata.getMetadataVersion())
          .compareTo(new MetadataVersion(3, 0)) >= 0) {
        ((ParquetTableMetadata_v3) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
      }
      if (!alreadyCheckedModification && tableModified(parquetTableMetadata.getDirectories(), path,
          metadataParentDir, metaContext, fs)) {
        // TODO change with current columns in existing metadata (auto refresh feature)
        parquetTableMetadata = (createMetaFilesRecursively(
            Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null)).getLeft();
        newMetadata = true;
      }

      // DRILL-5009: Remove the RowGroup if it is empty
      List<? extends ParquetFileMetadata> files = parquetTableMetadata.getFiles();
      for (ParquetFileMetadata file : files) {
        List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
        rowGroups.removeIf(r -> r.getRowCount() == 0);
      }
    }
    if (newMetadata) {
      // if new metadata files were created, invalidate the existing metadata context
      metaContext.clear();
    }
  } catch (IOException e) {
    logger.error("Failed to read '{}' metadata file", path, e);
    metaContext.setMetadataCacheCorrupted(true);
  }
}
From source file:org.apache.drill.exec.store.parquet.metadata.MetadataPathUtils.java
License:Apache License
/**
 * Constructs a relative path from the child full path and the base path,
 * or returns the child path if it is already relative.
 *
 * @param childPath full absolute path
 * @param baseDir base path (the part of the Path, which should be cut off from child path)
 * @return relative path
 */
public static Path relativize(Path baseDir, Path childPath) {
  Path fullPathWithoutSchemeAndAuthority = Path.getPathWithoutSchemeAndAuthority(childPath);
  Path basePathWithoutSchemeAndAuthority = Path.getPathWithoutSchemeAndAuthority(baseDir);

  // Since hadoop Path hasn't relativize() we use uri.relativize() to get relative path
  Path relativeFilePath = new Path(
      basePathWithoutSchemeAndAuthority.toUri().relativize(fullPathWithoutSchemeAndAuthority.toUri()));
  if (relativeFilePath.isAbsolute()) {
    throw new IllegalStateException(String.format("Path %s is not a subpath of %s.",
        basePathWithoutSchemeAndAuthority.toUri().getPath(),
        fullPathWithoutSchemeAndAuthority.toUri().getPath()));
  }
  return relativeFilePath;
}
From source file:org.apache.drill.exec.store.parquet.Metadata.java
License:Apache License
/**
 * Get the metadata for a single file
 *
 * @param file
 * @return
 * @throws IOException
 */
private ParquetFileMetadata_v2 getParquetFileMetadata_v2(ParquetTableMetadata_v2 parquetTableMetadata,
    FileStatus file) throws IOException {
  ParquetMetadata metadata = ParquetFileReader.readFooter(fs.getConf(), file);
  MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  schema.getPaths();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata_v2> rowGroupMetadataList = Lists.newArrayList();

  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata_v2> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata_v2 columnMetadata;

      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());

      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata_v2 columnTypeMetadata = new ColumnTypeMetadata_v2(columnName, col.getType(),
          originalTypeMap.get(columnSchemaName));
      if (parquetTableMetadata.columnTypeInfo == null) {
        parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
      }
      // Save the column schema info. We'll merge it into one list
      parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v2.Key(columnTypeMetadata.name),
          columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null
            && stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
        }
        columnMetadata = new ColumnMetadata_v2(columnTypeMetadata.name, col.getType(), mxValue,
            stats.getNumNulls());
      } else {
        columnMetadata = new ColumnMetadata_v2(columnTypeMetadata.name, col.getType(), null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata_v2 rowGroupMeta = new RowGroupMetadata_v2(rowGroup.getStartingPos(), length,
        rowGroup.getRowCount(), getHostAffinity(file, rowGroup.getStartingPos(), length),
        columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
  }
  String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();

  return new ParquetFileMetadata_v2(path, file.getLen(), rowGroupMetadataList);
}
From source file:org.apache.drill.exec.store.parquet.Metadata.java
License:Apache License
/**
 * Read the parquet metadata from a file
 *
 * @param path
 * @return
 * @throws IOException
 */
private void readBlockMeta(String path, boolean dirsOnly, MetadataContext metaContext) throws IOException {
  Stopwatch timer = Stopwatch.createStarted();
  Path p = new Path(path);
  Path parentDir = p.getParent(); // parent directory of the metadata file
  ObjectMapper mapper = new ObjectMapper();

  final SimpleModule serialModule = new SimpleModule();
  serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
  serialModule.addKeyDeserializer(ColumnTypeMetadata_v2.Key.class,
      new ColumnTypeMetadata_v2.Key.DeSerializer());

  AfterburnerModule module = new AfterburnerModule();
  module.setUseOptimizedBeanDeserializer(true);

  mapper.registerModule(serialModule);
  mapper.registerModule(module);
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
  FSDataInputStream is = fs.open(p);

  boolean alreadyCheckedModification = false;
  boolean newMetadata = false;

  if (metaContext != null) {
    alreadyCheckedModification = metaContext.getStatus(parentDir.toString());
  }

  if (dirsOnly) {
    parquetTableMetadataDirs = mapper.readValue(is, ParquetTableMetadataDirs.class);
    logger.info("Took {} ms to read directories from directory cache file",
        timer.elapsed(TimeUnit.MILLISECONDS));
    timer.stop();
    if (!alreadyCheckedModification
        && tableModified(parquetTableMetadataDirs.getDirectories(), p, parentDir, metaContext)) {
      parquetTableMetadataDirs = (createMetaFilesRecursively(
          Path.getPathWithoutSchemeAndAuthority(p.getParent()).toString())).getRight();
      newMetadata = true;
    }
  } else {
    parquetTableMetadata = mapper.readValue(is, ParquetTableMetadataBase.class);
    logger.info("Took {} ms to read metadata from cache file", timer.elapsed(TimeUnit.MILLISECONDS));
    timer.stop();
    if (!alreadyCheckedModification
        && tableModified(parquetTableMetadata.getDirectories(), p, parentDir, metaContext)) {
      parquetTableMetadata = (createMetaFilesRecursively(
          Path.getPathWithoutSchemeAndAuthority(p.getParent()).toString())).getLeft();
      newMetadata = true;
    }
  }

  if (newMetadata && metaContext != null) {
    // if new metadata files were created, invalidate the existing metadata context
    metaContext.clear();
  }
}
From source file:org.apache.drill.exec.store.parquet.ParquetGroupScan.java
License:Apache License
public void populatePruningVector(ValueVector v, int index, SchemaPath column, String file) {
  String f = Path.getPathWithoutSchemeAndAuthority(new Path(file)).toString();
  MinorType type = getTypeForColumn(column).getMinorType();
  switch (type) {
  case INT: {
    NullableIntVector intVector = (NullableIntVector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    intVector.getMutator().setSafe(index, value);
    return;
  }
  case SMALLINT: {
    NullableSmallIntVector smallIntVector = (NullableSmallIntVector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    smallIntVector.getMutator().setSafe(index, value.shortValue());
    return;
  }
  case TINYINT: {
    NullableTinyIntVector tinyIntVector = (NullableTinyIntVector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    tinyIntVector.getMutator().setSafe(index, value.byteValue());
    return;
  }
  case UINT1: {
    NullableUInt1Vector intVector = (NullableUInt1Vector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    intVector.getMutator().setSafe(index, value.byteValue());
    return;
  }
  case UINT2: {
    NullableUInt2Vector intVector = (NullableUInt2Vector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    intVector.getMutator().setSafe(index, (char) value.shortValue());
    return;
  }
  case UINT4: {
    NullableUInt4Vector intVector = (NullableUInt4Vector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    intVector.getMutator().setSafe(index, value);
    return;
  }
  case BIGINT: {
    NullableBigIntVector bigIntVector = (NullableBigIntVector) v;
    Long value = (Long) partitionValueMap.get(f).get(column);
    bigIntVector.getMutator().setSafe(index, value);
    return;
  }
  case FLOAT4: {
    NullableFloat4Vector float4Vector = (NullableFloat4Vector) v;
    Float value = (Float) partitionValueMap.get(f).get(column);
    float4Vector.getMutator().setSafe(index, value);
    return;
  }
  case FLOAT8: {
    NullableFloat8Vector float8Vector = (NullableFloat8Vector) v;
    Double value = (Double) partitionValueMap.get(f).get(column);
    float8Vector.getMutator().setSafe(index, value);
    return;
  }
  case VARBINARY: {
    NullableVarBinaryVector varBinaryVector = (NullableVarBinaryVector) v;
    Object s = partitionValueMap.get(f).get(column);
    byte[] bytes;
    if (s instanceof Binary) {
      bytes = ((Binary) s).getBytes();
    } else if (s instanceof String) {
      bytes = ((String) s).getBytes();
    } else if (s instanceof byte[]) {
      bytes = (byte[]) s;
    } else {
      throw new UnsupportedOperationException("Unable to create column data for type: " + type);
    }
    varBinaryVector.getMutator().setSafe(index, bytes, 0, bytes.length);
    return;
  }
  case DECIMAL18: {
    NullableDecimal18Vector decimalVector = (NullableDecimal18Vector) v;
    Long value = (Long) partitionValueMap.get(f).get(column);
    decimalVector.getMutator().setSafe(index, value);
    return;
  }
  case DATE: {
    NullableDateVector dateVector = (NullableDateVector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    dateVector.getMutator().setSafe(index,
        DateTimeUtils.fromJulianDay(value - ParquetOutputRecordWriter.JULIAN_DAY_EPOC - 0.5));
    return;
  }
  case TIME: {
    NullableTimeVector timeVector = (NullableTimeVector) v;
    Integer value = (Integer) partitionValueMap.get(f).get(column);
    timeVector.getMutator().setSafe(index, value);
    return;
  }
  case TIMESTAMP: {
    NullableTimeStampVector timeStampVector = (NullableTimeStampVector) v;
    Long value = (Long) partitionValueMap.get(f).get(column);
    timeStampVector.getMutator().setSafe(index, value);
    return;
  }
  case VARCHAR: {
    NullableVarCharVector varCharVector = (NullableVarCharVector) v;
    Object s = partitionValueMap.get(f).get(column);
    byte[] bytes;
    if (s instanceof String) { // if the metadata was read from a JSON cache file it may be a string type
      bytes = ((String) s).getBytes();
    } else if (s instanceof Binary) {
      bytes = ((Binary) s).getBytes();
    } else if (s instanceof byte[]) {
      bytes = (byte[]) s;
    } else {
      throw new UnsupportedOperationException("Unable to create column data for type: " + type);
    }
    varCharVector.getMutator().setSafe(index, bytes, 0, bytes.length);
    return;
  }
  default:
    throw new UnsupportedOperationException("Unsupported type: " + type);
  }
}
From source file:org.apache.drill.exec.store.parquet.ParquetGroupScan.java
License:Apache License
/**
 * Create and return a new file selection based on reading the metadata cache file.
 *
 * This function also initializes a few of ParquetGroupScan's fields as appropriate.
 *
 * @param selection initial file selection
 * @param metaFilePath metadata cache file path
 * @return file selection read from cache
 *
 * @throws IOException
 * @throws UserException when the updated selection is empty, this happens if the user selects an empty folder.
 */
private FileSelection initFromMetadataCache(FileSelection selection, Path metaFilePath) throws IOException {
  // get the metadata for the root directory by reading the metadata file
  // parquetTableMetadata contains the metadata for all files in the selection root folder, but we need to make sure
  // we only select the files that are part of selection (by setting fileSet appropriately)

  // get (and set internal field) the metadata for the directory by reading the metadata file
  this.parquetTableMetadata = Metadata.readBlockMeta(fs, metaFilePath.toString(), selection.getMetaContext());

  List<FileStatus> fileStatuses = selection.getStatuses(fs);

  if (fileSet == null) {
    fileSet = Sets.newHashSet();
  }

  final Path first = fileStatuses.get(0).getPath();
  if (fileStatuses.size() == 1 && selection.getSelectionRoot().equals(first.toString())) {
    // we are selecting all files from selection root. Expand the file list from the cache
    for (Metadata.ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
      fileSet.add(file.getPath());
    }
  } else if (selection.isExpandedPartial() && !selection.hadWildcard() && cacheFileRoot != null) {
    if (selection.wasAllPartitionsPruned()) {
      // if all partitions were previously pruned, we only need to read 1 file (for the schema)
      fileSet.add(this.parquetTableMetadata.getFiles().get(0).getPath());
    } else {
      // we are here if the selection is in the expanded_partial state (i.e it has directories). We get the
      // list of files from the metadata cache file that is present in the cacheFileRoot directory and populate
      // the fileSet. However, this is *not* the final list of files that will be scanned in execution since the
      // second phase of partition pruning will apply on the files and modify the file selection appropriately.
      for (Metadata.ParquetFileMetadata file : this.parquetTableMetadata.getFiles()) {
        fileSet.add(file.getPath());
      }
    }
  } else {
    // we need to expand the files from fileStatuses
    for (FileStatus status : fileStatuses) {
      if (status.isDirectory()) {
        //TODO [DRILL-4496] read the metadata cache files in parallel
        final Path metaPath = new Path(status.getPath(), Metadata.METADATA_FILENAME);
        final Metadata.ParquetTableMetadataBase metadata = Metadata.readBlockMeta(fs, metaPath.toString(),
            selection.getMetaContext());
        for (Metadata.ParquetFileMetadata file : metadata.getFiles()) {
          fileSet.add(file.getPath());
        }
      } else {
        final Path path = Path.getPathWithoutSchemeAndAuthority(status.getPath());
        fileSet.add(path.toString());
      }
    }
  }

  if (fileSet.isEmpty()) {
    // no files were found, most likely we tried to query some empty sub folders
    throw UserException.validationError().message("The table you tried to query is empty").build(logger);
  }

  List<String> fileNames = Lists.newArrayList(fileSet);

  // when creating the file selection, set the selection root without the URI prefix
  // The reason is that the file names above have been created in the form
  // /a/b/c.parquet and the format of the selection root must match that of the file names
  // otherwise downstream operations such as partition pruning can break.
  final Path metaRootPath = Path.getPathWithoutSchemeAndAuthority(new Path(selection.getSelectionRoot()));
  this.selectionRoot = metaRootPath.toString();

  // Use the FileSelection constructor directly here instead of the FileSelection.create() method
  // because create() changes the root to include the scheme and authority; In future, if create()
  // is the preferred way to instantiate a file selection, we may need to do something different...
  // WARNING: file statuses and file names are inconsistent
  FileSelection newSelection = new FileSelection(selection.getStatuses(fs), fileNames, metaRootPath.toString(),
      cacheFileRoot, selection.wasAllPartitionsPruned());

  newSelection.setExpandedFully();
  newSelection.setMetaContext(selection.getMetaContext());
  return newSelection;
}
From source file:org.apache.drill.exec.store.parquet.ParquetGroupScan.java
License:Apache License
private void init(MetadataContext metaContext) throws IOException {
  if (entries.size() == 1 && parquetTableMetadata == null) {
    Path p = Path.getPathWithoutSchemeAndAuthority(new Path(entries.get(0).getPath()));
    Path metaPath = null;
    if (fs.isDirectory(p)) {
      // Using the metadata file makes sense when querying a directory; otherwise
      // if querying a single file we can look up the metadata directly from the file
      metaPath = new Path(p, Metadata.METADATA_FILENAME);
    }
    if (metaPath != null && fs.exists(metaPath)) {
      usedMetadataCache = true;
      parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext);
    } else {
      parquetTableMetadata = Metadata.getParquetTableMetadata(fs, p.toString());
    }
  } else {
    Path p = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot));
    Path metaPath = new Path(p, Metadata.METADATA_FILENAME);
    if (fs.isDirectory(new Path(selectionRoot)) && fs.exists(metaPath)) {
      usedMetadataCache = true;
      if (parquetTableMetadata == null) {
        parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext);
      }
      if (fileSet != null) {
        parquetTableMetadata = removeUnneededRowGroups(parquetTableMetadata);
      }
    } else {
      final List<FileStatus> fileStatuses = Lists.newArrayList();
      for (ReadEntryWithPath entry : entries) {
        getFiles(entry.getPath(), fileStatuses);
      }
      parquetTableMetadata = Metadata.getParquetTableMetadata(fs, fileStatuses);
    }
  }

  if (fileSet == null) {
    fileSet = Sets.newHashSet();
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
      fileSet.add(file.getPath());
    }
  }

  Map<String, DrillbitEndpoint> hostEndpointMap = Maps.newHashMap();

  for (DrillbitEndpoint endpoint : formatPlugin.getContext().getBits()) {
    hostEndpointMap.put(endpoint.getAddress(), endpoint);
  }

  rowGroupInfos = Lists.newArrayList();
  for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
    int rgIndex = 0;
    for (RowGroupMetadata rg : file.getRowGroups()) {
      RowGroupInfo rowGroupInfo = new RowGroupInfo(file.getPath(), rg.getStart(), rg.getLength(), rgIndex,
          rg.getRowCount());
      EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
      for (String host : rg.getHostAffinity().keySet()) {
        if (hostEndpointMap.containsKey(host)) {
          endpointByteMap.add(hostEndpointMap.get(host),
              (long) (rg.getHostAffinity().get(host) * rg.getLength()));
        }
      }
      rowGroupInfo.setEndpointByteMap(endpointByteMap);
      rgIndex++;
      rowGroupInfos.add(rowGroupInfo);
    }
  }

  this.endpointAffinities = AffinityCreator.getAffinityMap(rowGroupInfos);

  columnValueCounts = Maps.newHashMap();
  this.rowCount = 0;
  boolean first = true;
  for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
    for (RowGroupMetadata rowGroup : file.getRowGroups()) {
      long rowCount = rowGroup.getRowCount();
      for (ColumnMetadata column : rowGroup.getColumns()) {
        SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getName());
        Long previousCount = columnValueCounts.get(schemaPath);
        if (previousCount != null) {
          if (previousCount != GroupScan.NO_COLUMN_STATS) {
            if (column.getNulls() != null) {
              Long newCount = rowCount - column.getNulls();
              columnValueCounts.put(schemaPath, columnValueCounts.get(schemaPath) + newCount);
            }
          }
        } else {
          if (column.getNulls() != null) {
            Long newCount = rowCount - column.getNulls();
            columnValueCounts.put(schemaPath, newCount);
          } else {
            columnValueCounts.put(schemaPath, GroupScan.NO_COLUMN_STATS);
          }
        }
        boolean partitionColumn = checkForPartitionColumn(column, first);
        if (partitionColumn) {
          Map<SchemaPath, Object> map = partitionValueMap.get(file.getPath());
          if (map == null) {
            map = Maps.newHashMap();
            partitionValueMap.put(file.getPath(), map);
          }
          Object value = map.get(schemaPath);
          Object currentValue = column.getMaxValue();
          if (value != null) {
            if (value != currentValue) {
              columnTypeMap.remove(schemaPath);
            }
          } else {
            map.put(schemaPath, currentValue);
          }
        } else {
          columnTypeMap.remove(schemaPath);
        }
      }
      this.rowCount += rowGroup.getRowCount();
      first = false;
    }
  }
}