Example usage for org.apache.hadoop.fs.Path getPathWithoutSchemeAndAuthority

Introduction

This page lists example usages of org.apache.hadoop.fs.Path.getPathWithoutSchemeAndAuthority, collected from the Apache Drill source code.

Prototype

public static Path getPathWithoutSchemeAndAuthority(Path path) 

Document

Returns a version of the given Path with the scheme and authority removed, leaving only the path component.
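
As a quick illustration, here is a minimal, self-contained sketch (the host and file names are hypothetical) of what the call returns:

import org.apache.hadoop.fs.Path;

public class PathWithoutSchemeExample {
    public static void main(String[] args) {
        Path full = new Path("hdfs://namenode:8020/user/drill/data/0_0_0.parquet");
        // The scheme ("hdfs") and authority ("namenode:8020") are dropped
        Path bare = Path.getPathWithoutSchemeAndAuthority(full);
        System.out.println(bare); // prints /user/drill/data/0_0_0.parquet
    }
}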

Usage

From source file: org.apache.drill.exec.store.ImplicitColumnExplorer.java

License: Apache License

/**
 * Compares selection root and actual file path to determine partition columns values.
 * Adds implicit file columns according to columns list.
 *
 * @return map with columns names as keys and their values
 */
public Map<String, String> populateImplicitColumns(FileWork work, String selectionRoot) {
    Map<String, String> implicitValues = Maps.newLinkedHashMap();
    if (selectionRoot != null) {
        String[] r = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot)).toString().split("/");
        Path path = Path.getPathWithoutSchemeAndAuthority(new Path(work.getPath()));
        String[] p = path.toString().split("/");
        if (p.length > r.length) {
            String[] q = ArrayUtils.subarray(p, r.length, p.length - 1);
            for (int a = 0; a < q.length; a++) {
                if (isStarQuery || selectedPartitionColumns.contains(a)) {
                    implicitValues.put(partitionDesignator + a, q[a]);
                }
            }
        }
        //add implicit file columns
        for (Map.Entry<String, ImplicitFileColumns> entry : selectedImplicitColumns.entrySet()) {
            implicitValues.put(entry.getKey(), entry.getValue().getValue(path));
        }
    }
    return implicitValues;
}
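
A short sketch of the directory arithmetic above (the paths and the default "dir" partition designator are hypothetical): a file two directory levels below the selection root yields two partition values.

import org.apache.commons.lang3.ArrayUtils;
import org.apache.hadoop.fs.Path;

public class PartitionValueSketch {
    public static void main(String[] args) {
        String selectionRoot = "hdfs://nn:8020/data/table";
        String filePath = "hdfs://nn:8020/data/table/2019/01/0_0_0.parquet";

        String[] r = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot)).toString().split("/");
        String[] p = Path.getPathWithoutSchemeAndAuthority(new Path(filePath)).toString().split("/");
        // Directories between the selection root and the file name become partition values
        String[] q = ArrayUtils.subarray(p, r.length, p.length - 1);
        for (int i = 0; i < q.length; i++) {
            System.out.println("dir" + i + " -> " + q[i]); // dir0 -> 2019, dir1 -> 01
        }
    }
}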

From source file: org.apache.drill.exec.store.LocalSyncableFileSystem.java

License: Apache License

@Override
public FileStatus getFileStatus(Path path) throws IOException {
    File file = new File(Path.getPathWithoutSchemeAndAuthority(path).toString());
    return new FileStatus(file.length(), file.isDirectory(), 1, 0, file.lastModified(), path);
}
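
A brief usage sketch (hypothetical file:// URI) of the bridge to java.io.File used above:

import java.io.File;
import org.apache.hadoop.fs.Path;

public class LocalFileSketch {
    public static void main(String[] args) {
        Path p = new Path("file:///tmp/drill/test.json");
        // Dropping the scheme leaves a plain filesystem path that java.io.File understands
        File f = new File(Path.getPathWithoutSchemeAndAuthority(p).toString());
        System.out.println(f); // prints /tmp/drill/test.json
    }
}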

From source file: org.apache.drill.exec.store.parquet.metadata.Metadata.java

License: Apache License

/**
 * Get the metadata for a single file
 */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata,
        final FileStatus file, final FileSystem fs, boolean allColumns, Set<String> columnSet)
        throws IOException, InterruptedException {
    final ParquetMetadata metadata;
    final UserGroupInformation processUserUgi = ImpersonationUtil.getProcessUserUGI();
    final Configuration conf = new Configuration(fs.getConf());
    try {
        metadata = processUserUgi.doAs((PrivilegedExceptionAction<ParquetMetadata>) () -> {
            try (ParquetFileReader parquetFileReader = ParquetFileReader
                    .open(HadoopInputFile.fromStatus(file, conf), readerConfig.toReadOptions())) {
                return parquetFileReader.getFooter();
            }
        });
    } catch (Exception e) {
        logger.error(
                "Exception while reading footer of parquet file [Details - path: {}, owner: {}] as process user {}",
                file.getPath(), file.getOwner(), processUserUgi.getShortUserName(), e);
        throw e;
    }

    MessageType schema = metadata.getFileMetaData().getSchema();

    Map<SchemaPath, ColTypeInfo> colTypeInfoMap = new HashMap<>();
    schema.getPaths();
    for (String[] path : schema.getPaths()) {
        colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
    }

    List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();

    ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
    ALL_COLS.add(SchemaPath.STAR_COLUMN);
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility
            .detectCorruptDates(metadata, ALL_COLS, readerConfig.autoCorrectCorruptedDates());
    logger.debug("Contains corrupt dates: {}.", containsCorruptDates);

    for (BlockMetaData rowGroup : metadata.getBlocks()) {
        List<ColumnMetadata_v3> columnMetadataList = new ArrayList<>();
        long length = 0;
        for (ColumnChunkMetaData col : rowGroup.getColumns()) {
            String[] columnName = col.getPath().toArray();
            SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
            ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);

            ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName,
                    col.getPrimitiveType().getPrimitiveTypeName(), colTypeInfo.originalType,
                    colTypeInfo.precision, colTypeInfo.scale, colTypeInfo.repetitionLevel,
                    colTypeInfo.definitionLevel);

            if (parquetTableMetadata.columnTypeInfo == null) {
                parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
            }
            parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name),
                    columnTypeMetadata);
            // Store column metadata only if allColumns is set to true or if the column belongs to the subset of columns specified in the refresh command
            if (allColumns || columnSet == null || !allColumns && columnSet != null && columnSet.size() > 0
                    && columnSet.contains(columnSchemaName.getRootSegmentPath())) {
                Statistics<?> stats = col.getStatistics();
                // Save the column schema info. We'll merge it into one list
                Object minValue = null;
                Object maxValue = null;
                long numNulls = -1;
                boolean statsAvailable = stats != null && !stats.isEmpty();
                if (statsAvailable) {
                    if (stats.hasNonNullValue()) {
                        minValue = stats.genericGetMin();
                        maxValue = stats.genericGetMax();
                        if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
                                && columnTypeMetadata.originalType == OriginalType.DATE) {
                            minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
                            maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
                        }
                    }
                    numNulls = stats.getNumNulls();
                }
                ColumnMetadata_v3 columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name,
                        col.getPrimitiveType().getPrimitiveTypeName(), minValue, maxValue, numNulls);
                columnMetadataList.add(columnMetadata);
            }
            length += col.getTotalSize();
        }

        // DRILL-5009: Skip the RowGroup if it is empty
        // Note we still read the schema even if there are no values in the RowGroup
        if (rowGroup.getRowCount() == 0) {
            continue;
        }
        RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length,
                rowGroup.getRowCount(), getHostAffinity(file, fs, rowGroup.getStartingPos(), length),
                columnMetadataList);

        rowGroupMetadataList.add(rowGroupMeta);
    }
    Path path = Path.getPathWithoutSchemeAndAuthority(file.getPath());

    return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}

From source file: org.apache.drill.exec.store.parquet.metadata.Metadata.java

License: Apache License

/**
 * Read the parquet metadata from a file
 *
 * @param path to metadata file
 * @param dirsOnly true for {@link Metadata#METADATA_DIRECTORIES_FILENAME}
 *                 or false for {@link Metadata#METADATA_FILENAME} files reading
 * @param metaContext current metadata context
 */
private void readBlockMeta(Path path, boolean dirsOnly, MetadataContext metaContext, FileSystem fs) {
    Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
    Path metadataParentDir = Path.getPathWithoutSchemeAndAuthority(path.getParent());
    String metadataParentDirPath = metadataParentDir.toUri().getPath();
    ObjectMapper mapper = new ObjectMapper();

    final SimpleModule serialModule = new SimpleModule();
    serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
    serialModule.addKeyDeserializer(Metadata_V2.ColumnTypeMetadata_v2.Key.class,
            new Metadata_V2.ColumnTypeMetadata_v2.Key.DeSerializer());
    serialModule.addKeyDeserializer(ColumnTypeMetadata_v3.Key.class,
            new ColumnTypeMetadata_v3.Key.DeSerializer());

    AfterburnerModule module = new AfterburnerModule();
    module.setUseOptimizedBeanDeserializer(true);

    mapper.registerModule(serialModule);
    mapper.registerModule(module);
    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
    try (InputStream is = fs.open(path)) {
        boolean alreadyCheckedModification;
        boolean newMetadata = false;
        alreadyCheckedModification = metaContext.getStatus(metadataParentDirPath);

        if (dirsOnly) {
            parquetTableMetadataDirs = mapper.readValue(is, ParquetTableMetadataDirs.class);
            if (timer != null) {
                logger.debug("Took {} ms to read directories from directory cache file",
                        timer.elapsed(TimeUnit.MILLISECONDS));
                timer.stop();
            }
            parquetTableMetadataDirs.updateRelativePaths(metadataParentDirPath);
            if (!alreadyCheckedModification && tableModified(parquetTableMetadataDirs.getDirectories(), path,
                    metadataParentDir, metaContext, fs)) {
                parquetTableMetadataDirs = (createMetaFilesRecursively(
                        Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null)).getRight();
                newMetadata = true;
            }
        } else {
            parquetTableMetadata = mapper.readValue(is, ParquetTableMetadataBase.class);
            if (timer != null) {
                logger.debug("Took {} ms to read metadata from cache file",
                        timer.elapsed(TimeUnit.MILLISECONDS));
                timer.stop();
            }
            if (new MetadataVersion(parquetTableMetadata.getMetadataVersion())
                    .compareTo(new MetadataVersion(3, 0)) >= 0) {
                ((ParquetTableMetadata_v3) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
            }
            if (!alreadyCheckedModification && tableModified(parquetTableMetadata.getDirectories(), path,
                    metadataParentDir, metaContext, fs)) {
                // TODO change with current columns in existing metadata (auto refresh feature)
                parquetTableMetadata = (createMetaFilesRecursively(
                        Path.getPathWithoutSchemeAndAuthority(path.getParent()), fs, true, null)).getLeft();
                newMetadata = true;
            }

            // DRILL-5009: Remove the RowGroup if it is empty
            List<? extends ParquetFileMetadata> files = parquetTableMetadata.getFiles();
            for (ParquetFileMetadata file : files) {
                List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
                rowGroups.removeIf(r -> r.getRowCount() == 0);
            }

        }
        if (newMetadata) {
            // if new metadata files were created, invalidate the existing metadata context
            metaContext.clear();
        }
    } catch (IOException e) {
        logger.error("Failed to read '{}' metadata file", path, e);
        metaContext.setMetadataCacheCorrupted(true);
    }
}
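
The base-directory string passed to updateRelativePaths() above is derived from the scheme-less parent of the cache file; a small sketch (the cache-file location is hypothetical):

import org.apache.hadoop.fs.Path;

public class MetadataParentDirSketch {
    public static void main(String[] args) {
        Path cacheFile = new Path("hdfs://nn:8020/data/table/.drill.parquet_metadata");
        Path parent = Path.getPathWithoutSchemeAndAuthority(cacheFile.getParent());
        System.out.println(parent.toUri().getPath()); // prints /data/table
    }
}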

From source file: org.apache.drill.exec.store.parquet.metadata.MetadataPathUtils.java

License: Apache License

/**
 * Constructs a relative path from the full child path and the base path, or returns the child path
 * if it is already relative.
 *
 * @param childPath full absolute path
 * @param baseDir base path (the part of the Path that should be cut off from the child path)
 * @return relative path
 */
public static Path relativize(Path baseDir, Path childPath) {
    Path fullPathWithoutSchemeAndAuthority = Path.getPathWithoutSchemeAndAuthority(childPath);
    Path basePathWithoutSchemeAndAuthority = Path.getPathWithoutSchemeAndAuthority(baseDir);

    // Since Hadoop's Path has no relativize() method, use URI.relativize() to compute the relative path
    Path relativeFilePath = new Path(
            basePathWithoutSchemeAndAuthority.toUri().relativize(fullPathWithoutSchemeAndAuthority.toUri()));
    if (relativeFilePath.isAbsolute()) {
        throw new IllegalStateException(String.format("Path %s is not a subpath of %s.",
                basePathWithoutSchemeAndAuthority.toUri().getPath(),
                fullPathWithoutSchemeAndAuthority.toUri().getPath()));
    }
    return relativeFilePath;
}
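
A short usage sketch (hypothetical HDFS paths) of the relativization performed above:

import org.apache.drill.exec.store.parquet.metadata.MetadataPathUtils;
import org.apache.hadoop.fs.Path;

public class RelativizeSketch {
    public static void main(String[] args) {
        Path base = new Path("hdfs://nn:8020/data/table");
        Path child = new Path("hdfs://nn:8020/data/table/2019/0_0_0.parquet");
        System.out.println(MetadataPathUtils.relativize(base, child)); // prints 2019/0_0_0.parquet
        // A child that is not under the base would make relativize() throw IllegalStateException
    }
}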

From source file: org.apache.drill.exec.store.parquet.Metadata.java

License: Apache License

/**
 * Get the metadata for a single file
 *
 * @param file
 * @return
 * @throws IOException
 */
private ParquetFileMetadata_v2 getParquetFileMetadata_v2(ParquetTableMetadata_v2 parquetTableMetadata,
        FileStatus file) throws IOException {
    ParquetMetadata metadata = ParquetFileReader.readFooter(fs.getConf(), file);
    MessageType schema = metadata.getFileMetaData().getSchema();

    Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
    schema.getPaths();
    for (String[] path : schema.getPaths()) {
        originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
    }

    List<RowGroupMetadata_v2> rowGroupMetadataList = Lists.newArrayList();

    for (BlockMetaData rowGroup : metadata.getBlocks()) {
        List<ColumnMetadata_v2> columnMetadataList = Lists.newArrayList();
        long length = 0;
        for (ColumnChunkMetaData col : rowGroup.getColumns()) {
            ColumnMetadata_v2 columnMetadata;

            boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());

            Statistics<?> stats = col.getStatistics();
            String[] columnName = col.getPath().toArray();
            SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
            ColumnTypeMetadata_v2 columnTypeMetadata = new ColumnTypeMetadata_v2(columnName, col.getType(),
                    originalTypeMap.get(columnSchemaName));
            if (parquetTableMetadata.columnTypeInfo == null) {
                parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
            }
            // Save the column schema info. We'll merge it into one list
            parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v2.Key(columnTypeMetadata.name),
                    columnTypeMetadata);
            if (statsAvailable) {
                // Write stats only if minVal==maxVal. Also, we then store only maxVal
                Object mxValue = null;
                if (stats.genericGetMax() != null && stats.genericGetMin() != null
                        && stats.genericGetMax().equals(stats.genericGetMin())) {
                    mxValue = stats.genericGetMax();
                }
                columnMetadata = new ColumnMetadata_v2(columnTypeMetadata.name, col.getType(), mxValue,
                        stats.getNumNulls());
            } else {
                columnMetadata = new ColumnMetadata_v2(columnTypeMetadata.name, col.getType(), null, null);
            }
            columnMetadataList.add(columnMetadata);
            length += col.getTotalSize();
        }

        RowGroupMetadata_v2 rowGroupMeta = new RowGroupMetadata_v2(rowGroup.getStartingPos(), length,
                rowGroup.getRowCount(), getHostAffinity(file, rowGroup.getStartingPos(), length),
                columnMetadataList);

        rowGroupMetadataList.add(rowGroupMeta);
    }
    String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();

    return new ParquetFileMetadata_v2(path, file.getLen(), rowGroupMetadataList);
}

From source file: org.apache.drill.exec.store.parquet.Metadata.java

License: Apache License

/**
 * Read the parquet metadata from a file
 *
 * @param path to metadata file
 * @throws IOException
 */
private void readBlockMeta(String path, boolean dirsOnly, MetadataContext metaContext) throws IOException {
    Stopwatch timer = Stopwatch.createStarted();
    Path p = new Path(path);
    Path parentDir = p.getParent(); // parent directory of the metadata file
    ObjectMapper mapper = new ObjectMapper();

    final SimpleModule serialModule = new SimpleModule();
    serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
    serialModule.addKeyDeserializer(ColumnTypeMetadata_v2.Key.class,
            new ColumnTypeMetadata_v2.Key.DeSerializer());

    AfterburnerModule module = new AfterburnerModule();
    module.setUseOptimizedBeanDeserializer(true);

    mapper.registerModule(serialModule);
    mapper.registerModule(module);
    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
    FSDataInputStream is = fs.open(p);

    boolean alreadyCheckedModification = false;
    boolean newMetadata = false;

    if (metaContext != null) {
        alreadyCheckedModification = metaContext.getStatus(parentDir.toString());
    }

    if (dirsOnly) {
        parquetTableMetadataDirs = mapper.readValue(is, ParquetTableMetadataDirs.class);
        logger.info("Took {} ms to read directories from directory cache file",
                timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
        if (!alreadyCheckedModification
                && tableModified(parquetTableMetadataDirs.getDirectories(), p, parentDir, metaContext)) {
            parquetTableMetadataDirs = (createMetaFilesRecursively(
                    Path.getPathWithoutSchemeAndAuthority(p.getParent()).toString())).getRight();
            newMetadata = true;
        }
    } else {
        parquetTableMetadata = mapper.readValue(is, ParquetTableMetadataBase.class);
        logger.info("Took {} ms to read metadata from cache file", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
        if (!alreadyCheckedModification
                && tableModified(parquetTableMetadata.getDirectories(), p, parentDir, metaContext)) {
            parquetTableMetadata = (createMetaFilesRecursively(
                    Path.getPathWithoutSchemeAndAuthority(p.getParent()).toString())).getLeft();
            newMetadata = true;
        }
    }

    if (newMetadata && metaContext != null) {
        // if new metadata files were created, invalidate the existing metadata context
        metaContext.clear();
    }

}

From source file: org.apache.drill.exec.store.parquet.ParquetGroupScan.java

License: Apache License

public void populatePruningVector(ValueVector v, int index, SchemaPath column, String file) {
    String f = Path.getPathWithoutSchemeAndAuthority(new Path(file)).toString();
    MinorType type = getTypeForColumn(column).getMinorType();
    switch (type) {
    case INT: {
        NullableIntVector intVector = (NullableIntVector) v;
        Integer value = (Integer) partitionValueMap.get(f).get(column);
        intVector.getMutator().setSafe(index, value);
        return;
    }
    case SMALLINT: {
        NullableSmallIntVector smallIntVector = (NullableSmallIntVector) v;
        Integer value = (Integer) partitionValueMap.get(f).get(column);
        smallIntVector.getMutator().setSafe(index, value.shortValue());
        return;
    }
    case TINYINT: {
        NullableTinyIntVector tinyIntVector = (NullableTinyIntVector) v;
        Integer value = (Integer) partitionValueMap.get(f).get(column);
        tinyIntVector.getMutator().setSafe(index, value.byteValue());
        return;
    }
    case UINT1: {
        NullableUInt1Vector intVector = (NullableUInt1Vector) v;
        Integer value = (Integer) partitionValueMap.get(f).get(column);
        intVector.getMutator().setSafe(index, value.byteValue());
        return;
    }
    case UINT2: {
        NullableUInt2Vector intVector = (NullableUInt2Vector) v;
        Integer value = (Integer) partitionValueMap.get(f).get(column);
        intVector.getMutator().setSafe(index, (char) value.shortValue());
        return;
    }
    case UINT4: {
        NullableUInt4Vector intVector = (NullableUInt4Vector) v;
        Integer value = (Integer) partitionValueMap.get(f).get(column);
        intVector.getMutator().setSafe(index, value);
        return;
    }
    case BIGINT: {
        NullableBigIntVector bigIntVector = (NullableBigIntVector) v;
        Long value = (Long) partitionValueMap.get(f).get(column);
        bigIntVector.getMutator().setSafe(index, value);
        return;
    }
    case FLOAT4: {
        NullableFloat4Vector float4Vector = (NullableFloat4Vector) v;
        Float value = (Float) partitionValueMap.get(f).get(column);
        float4Vector.getMutator().setSafe(index, value);
        return;
    }
    case FLOAT8: {
        NullableFloat8Vector float8Vector = (NullableFloat8Vector) v;
        Double value = (Double) partitionValueMap.get(f).get(column);
        float8Vector.getMutator().setSafe(index, value);
        return;
    }
    case VARBINARY: {
        NullableVarBinaryVector varBinaryVector = (NullableVarBinaryVector) v;
        Object s = partitionValueMap.get(f).get(column);
        byte[] bytes;
        if (s instanceof Binary) {
            bytes = ((Binary) s).getBytes();
        } else if (s instanceof String) {
            bytes = ((String) s).getBytes();
        } else if (s instanceof byte[]) {
            bytes = (byte[]) s;
        } else {
            throw new UnsupportedOperationException("Unable to create column data for type: " + type);
        }
        varBinaryVector.getMutator().setSafe(index, bytes, 0, bytes.length);
        return;
    }
    case DECIMAL18: {
        NullableDecimal18Vector decimalVector = (NullableDecimal18Vector) v;
        Long value = (Long) partitionValueMap.get(f).get(column);
        decimalVector.getMutator().setSafe(index, value);
        return;
    }
    case DATE: {
        NullableDateVector dateVector = (NullableDateVector) v;
        Integer value = (Integer) partitionValueMap.get(f).get(column);
        dateVector.getMutator().setSafe(index,
                DateTimeUtils.fromJulianDay(value - ParquetOutputRecordWriter.JULIAN_DAY_EPOC - 0.5));
        return;
    }
    case TIME: {
        NullableTimeVector timeVector = (NullableTimeVector) v;
        Integer value = (Integer) partitionValueMap.get(f).get(column);
        timeVector.getMutator().setSafe(index, value);
        return;
    }
    case TIMESTAMP: {
        NullableTimeStampVector timeStampVector = (NullableTimeStampVector) v;
        Long value = (Long) partitionValueMap.get(f).get(column);
        timeStampVector.getMutator().setSafe(index, value);
        return;
    }
    case VARCHAR: {
        NullableVarCharVector varCharVector = (NullableVarCharVector) v;
        Object s = partitionValueMap.get(f).get(column);
        byte[] bytes;
        if (s instanceof String) { // if the metadata was read from a JSON cache file it may be a string type
            bytes = ((String) s).getBytes();
        } else if (s instanceof Binary) {
            bytes = ((Binary) s).getBytes();
        } else if (s instanceof byte[]) {
            bytes = (byte[]) s;
        } else {
            throw new UnsupportedOperationException("Unable to create column data for type: " + type);
        }
        varCharVector.getMutator().setSafe(index, bytes, 0, bytes.length);
        return;
    }
    default:
        throw new UnsupportedOperationException("Unsupported type: " + type);
    }
}

From source file: org.apache.drill.exec.store.parquet.ParquetGroupScan.java

License: Apache License

/**
 * Create and return a new file selection based on reading the metadata cache file.
 *
 * This function also initializes a few of ParquetGroupScan's fields as appropriate.
 *
 * @param selection initial file selection
 * @param metaFilePath metadata cache file path
 * @return file selection read from cache
 *
 * @throws IOException
 * @throws UserException when the updated selection is empty; this happens if the user selects an empty folder.
 */
private FileSelection initFromMetadataCache(FileSelection selection, Path metaFilePath) throws IOException {
    // get the metadata for the root directory by reading the metadata file
    // parquetTableMetadata contains the metadata for all files in the selection root folder, but we need to make sure
    // we only select the files that are part of selection (by setting fileSet appropriately)

    // get (and set internal field) the metadata for the directory by reading the metadata file
    this.parquetTableMetadata = Metadata.readBlockMeta(fs, metaFilePath.toString(), selection.getMetaContext());
    List<FileStatus> fileStatuses = selection.getStatuses(fs);

    if (fileSet == null) {
        fileSet = Sets.newHashSet();
    }

    final Path first = fileStatuses.get(0).getPath();
    if (fileStatuses.size() == 1 && selection.getSelectionRoot().equals(first.toString())) {
        // we are selecting all files from selection root. Expand the file list from the cache
        for (Metadata.ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
            fileSet.add(file.getPath());
        }

    } else if (selection.isExpandedPartial() && !selection.hadWildcard() && cacheFileRoot != null) {
        if (selection.wasAllPartitionsPruned()) {
            // if all partitions were previously pruned, we only need to read 1 file (for the schema)
            fileSet.add(this.parquetTableMetadata.getFiles().get(0).getPath());
        } else {
            // we are here if the selection is in the expanded_partial state (i.e. it has directories).  We get the
            // list of files from the metadata cache file that is present in the cacheFileRoot directory and populate
            // the fileSet. However, this is *not* the final list of files that will be scanned in execution since the
            // second phase of partition pruning will apply on the files and modify the file selection appropriately.
            for (Metadata.ParquetFileMetadata file : this.parquetTableMetadata.getFiles()) {
                fileSet.add(file.getPath());
            }
        }
    } else {
        // we need to expand the files from fileStatuses
        for (FileStatus status : fileStatuses) {
            if (status.isDirectory()) {
                //TODO [DRILL-4496] read the metadata cache files in parallel
                final Path metaPath = new Path(status.getPath(), Metadata.METADATA_FILENAME);
                final Metadata.ParquetTableMetadataBase metadata = Metadata.readBlockMeta(fs,
                        metaPath.toString(), selection.getMetaContext());
                for (Metadata.ParquetFileMetadata file : metadata.getFiles()) {
                    fileSet.add(file.getPath());
                }
            } else {
                final Path path = Path.getPathWithoutSchemeAndAuthority(status.getPath());
                fileSet.add(path.toString());
            }
        }
    }

    if (fileSet.isEmpty()) {
        // no files were found, most likely we tried to query some empty sub folders
        throw UserException.validationError().message("The table you tried to query is empty").build(logger);
    }

    List<String> fileNames = Lists.newArrayList(fileSet);

    // when creating the file selection, set the selection root without the URI prefix
    // The reason is that the file names above have been created in the form
    // /a/b/c.parquet and the format of the selection root must match that of the file names
    // otherwise downstream operations such as partition pruning can break.
    final Path metaRootPath = Path.getPathWithoutSchemeAndAuthority(new Path(selection.getSelectionRoot()));
    this.selectionRoot = metaRootPath.toString();

    // Use the FileSelection constructor directly here instead of the FileSelection.create() method
    // because create() changes the root to include the scheme and authority; In future, if create()
    // is the preferred way to instantiate a file selection, we may need to do something different...
    // WARNING: file statuses and file names are inconsistent
    FileSelection newSelection = new FileSelection(selection.getStatuses(fs), fileNames,
            metaRootPath.toString(), cacheFileRoot, selection.wasAllPartitionsPruned());

    newSelection.setExpandedFully();
    newSelection.setMetaContext(selection.getMetaContext());
    return newSelection;
}
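
A brief sketch (hypothetical selection root) of the root normalization described above: the cached file names have the form /a/b/c.parquet, so the selection root is reduced to the same scheme-less form before the new FileSelection is built.

import org.apache.hadoop.fs.Path;

public class SelectionRootSketch {
    public static void main(String[] args) {
        Path metaRootPath = Path.getPathWithoutSchemeAndAuthority(new Path("hdfs://nn:8020/data/table"));
        System.out.println(metaRootPath); // prints /data/table, matching the cached file-name format
    }
}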

From source file: org.apache.drill.exec.store.parquet.ParquetGroupScan.java

License: Apache License

private void init(MetadataContext metaContext) throws IOException {
    if (entries.size() == 1 && parquetTableMetadata == null) {
        Path p = Path.getPathWithoutSchemeAndAuthority(new Path(entries.get(0).getPath()));
        Path metaPath = null;
        if (fs.isDirectory(p)) {
            // Using the metadata file makes sense when querying a directory; otherwise
            // if querying a single file we can look up the metadata directly from the file
            metaPath = new Path(p, Metadata.METADATA_FILENAME);
        }
        if (metaPath != null && fs.exists(metaPath)) {
            usedMetadataCache = true;
            parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext);
        } else {
            parquetTableMetadata = Metadata.getParquetTableMetadata(fs, p.toString());
        }
    } else {
        Path p = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot));
        Path metaPath = new Path(p, Metadata.METADATA_FILENAME);
        if (fs.isDirectory(new Path(selectionRoot)) && fs.exists(metaPath)) {
            usedMetadataCache = true;
            if (parquetTableMetadata == null) {
                parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext);
            }
            if (fileSet != null) {
                parquetTableMetadata = removeUnneededRowGroups(parquetTableMetadata);
            }
        } else {
            final List<FileStatus> fileStatuses = Lists.newArrayList();
            for (ReadEntryWithPath entry : entries) {
                getFiles(entry.getPath(), fileStatuses);
            }
            parquetTableMetadata = Metadata.getParquetTableMetadata(fs, fileStatuses);
        }
    }

    if (fileSet == null) {
        fileSet = Sets.newHashSet();
        for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
            fileSet.add(file.getPath());
        }
    }

    Map<String, DrillbitEndpoint> hostEndpointMap = Maps.newHashMap();

    for (DrillbitEndpoint endpoint : formatPlugin.getContext().getBits()) {
        hostEndpointMap.put(endpoint.getAddress(), endpoint);
    }

    rowGroupInfos = Lists.newArrayList();
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        int rgIndex = 0;
        for (RowGroupMetadata rg : file.getRowGroups()) {
            RowGroupInfo rowGroupInfo = new RowGroupInfo(file.getPath(), rg.getStart(), rg.getLength(), rgIndex,
                    rg.getRowCount());
            EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
            for (String host : rg.getHostAffinity().keySet()) {
                if (hostEndpointMap.containsKey(host)) {
                    endpointByteMap.add(hostEndpointMap.get(host),
                            (long) (rg.getHostAffinity().get(host) * rg.getLength()));
                }
            }
            rowGroupInfo.setEndpointByteMap(endpointByteMap);
            rgIndex++;
            rowGroupInfos.add(rowGroupInfo);
        }
    }

    this.endpointAffinities = AffinityCreator.getAffinityMap(rowGroupInfos);

    columnValueCounts = Maps.newHashMap();
    this.rowCount = 0;
    boolean first = true;
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        for (RowGroupMetadata rowGroup : file.getRowGroups()) {
            long rowCount = rowGroup.getRowCount();
            for (ColumnMetadata column : rowGroup.getColumns()) {
                SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getName());
                Long previousCount = columnValueCounts.get(schemaPath);
                if (previousCount != null) {
                    if (previousCount != GroupScan.NO_COLUMN_STATS) {
                        if (column.getNulls() != null) {
                            Long newCount = rowCount - column.getNulls();
                            columnValueCounts.put(schemaPath, columnValueCounts.get(schemaPath) + newCount);
                        }
                    }
                } else {
                    if (column.getNulls() != null) {
                        Long newCount = rowCount - column.getNulls();
                        columnValueCounts.put(schemaPath, newCount);
                    } else {
                        columnValueCounts.put(schemaPath, GroupScan.NO_COLUMN_STATS);
                    }
                }
                boolean partitionColumn = checkForPartitionColumn(column, first);
                if (partitionColumn) {
                    Map<SchemaPath, Object> map = partitionValueMap.get(file.getPath());
                    if (map == null) {
                        map = Maps.newHashMap();
                        partitionValueMap.put(file.getPath(), map);
                    }
                    Object value = map.get(schemaPath);
                    Object currentValue = column.getMaxValue();
                    if (value != null) {
                        if (value != currentValue) {
                            columnTypeMap.remove(schemaPath);
                        }
                    } else {
                        map.put(schemaPath, currentValue);
                    }
                } else {
                    columnTypeMap.remove(schemaPath);
                }
            }
            this.rowCount += rowGroup.getRowCount();
            first = false;
        }
    }
}