Usage examples for org.apache.hadoop.fs.FileSystem#open
public FSDataInputStream open(Path f) throws IOException
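Before the per-project examples below, here is a minimal, self-contained sketch of the typical call pattern: resolve the FileSystem for a Path, look up the file length, open the stream, and read. The class name, configuration, and file location are hypothetical, chosen only for illustration.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenExample {
    // Open the file behind `location` and read its entire contents into memory.
    public static byte[] readAllBytes(Configuration conf, String location) throws IOException {
        Path path = new Path(location); // e.g. "hdfs:///tmp/example.txt" (hypothetical)
        FileSystem fileSystem = path.getFileSystem(conf);
        int length = (int) fileSystem.getFileStatus(path).getLen();
        try (FSDataInputStream inputStream = fileSystem.open(path)) {
            byte[] buffer = new byte[length];
            inputStream.readFully(buffer); // DataInputStream#readFully fills the whole buffer
            return buffer;
        }
    }
}

Most of the examples that follow use the same shape: fetch the length with getFileStatus, call open, and wrap the returned FSDataInputStream in a reader or data source.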
From source file:com.facebook.presto.hive.parquet.HdfsParquetDataSource.java
License:Apache License
public static HdfsParquetDataSource buildHdfsParquetDataSource(FileSystem fileSystem, Path path, long start, long length)
{
    try {
        long size = fileSystem.getFileStatus(path).getLen();
        FSDataInputStream inputStream = fileSystem.open(path);
        return new HdfsParquetDataSource(path, size, inputStream);
    }
    catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT,
                format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage()), e);
    }
}
From source file:com.facebook.presto.hive.parquet.ParquetTester.java
License:Apache License
private static void assertFileContents(JobConf jobConf, ObjectInspector objectInspector, TempFile tempFile, Iterable<?> expectedValues, Type type)
        throws IOException, InterruptedException
{
    Path path = new Path(tempFile.getFile().toURI());
    FileSystem fileSystem = path.getFileSystem(jobConf);
    ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
    FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    MessageType fileSchema = fileMetaData.getSchema();

    long size = fileSystem.getFileStatus(path).getLen();
    FSDataInputStream inputStream = fileSystem.open(path);
    ParquetDataSource dataSource = new HdfsParquetDataSource(path, size, inputStream);

    TypeManager typeManager = new TypeRegistry();
    ParquetReader parquetReader = new ParquetReader(fileSchema, fileSchema, parquetMetadata.getBlocks(), dataSource, typeManager);
    assertEquals(parquetReader.getPosition(), 0);

    int rowsProcessed = 0;
    Iterator<?> iterator = expectedValues.iterator();
    for (int batchSize = parquetReader.nextBatch(); batchSize >= 0; batchSize = parquetReader.nextBatch()) {
        ColumnDescriptor columnDescriptor = fileSchema.getColumns().get(0);
        Block block = parquetReader.readPrimitive(columnDescriptor, type);
        for (int i = 0; i < batchSize; i++) {
            assertTrue(iterator.hasNext());
            Object expected = iterator.next();
            Object actual = decodeObject(type, block, i);
            assertEquals(actual, expected);
        }
        rowsProcessed += batchSize;
        assertEquals(parquetReader.getPosition(), rowsProcessed);
    }
    assertFalse(iterator.hasNext());
    assertEquals(parquetReader.getPosition(), rowsProcessed);
    parquetReader.close();
}
From source file:com.facebook.presto.hive.parquet.reader.ParquetMetadataReader.java
License:Apache License
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file) throws IOException
{
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC
        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;

        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);

        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s",
                file, Arrays.toString(MAGIC), Arrays.toString(magic));

        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
                "Corrupted Parquet file: %s metadata index: %s out of range", file, metadataIndex);
        inputStream.seek(metadataIndex);

        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);

        MessageType messageType = readParquetSchema(schema);
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet(
                            (filePath == null && columnChunk.getFile_path() == null)
                                    || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }

        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}
From source file:com.facebook.presto.hive.rcfile.RcFilePageSourceFactory.java
License:Apache License
@Override
public Optional<? extends ConnectorPageSource> createPageSource(Configuration configuration, ConnectorSession session, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone hiveStorageTimeZone)
{
    if (!isRcfileOptimizedReaderEnabled(session)) {
        return Optional.empty();
    }

    RcFileEncoding rcFileEncoding;
    String deserializerClassName = getDeserializerClassName(schema);
    if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) {
        rcFileEncoding = new BinaryRcFileEncoding();
    }
    else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) {
        rcFileEncoding = createTextVectorEncoding(schema, hiveStorageTimeZone);
    }
    else {
        return Optional.empty();
    }

    long size;
    FSDataInputStream inputStream;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        size = fileSystem.getFileStatus(path).getLen();
        inputStream = fileSystem.open(path);
    }
    catch (Exception e) {
        throw Throwables.propagate(e);
    }

    try {
        ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
        for (HiveColumnHandle column : columns) {
            readColumns.put(column.getHiveColumnIndex(), column.getHiveType().getType(typeManager));
        }

        RcFileReader rcFileReader = new RcFileReader(
                new HdfsRcFileDataSource(path.toString(), inputStream, size),
                rcFileEncoding,
                readColumns.build(),
                new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())),
                start,
                length,
                new DataSize(1, Unit.MEGABYTE));

        return Optional.of(new RcFilePageSource(rcFileReader, columns, hiveStorageTimeZone, typeManager));
    }
    catch (Throwable e) {
        try {
            inputStream.close();
        }
        catch (IOException ignored) {
        }
        throw Throwables.propagate(e);
    }
}
From source file:com.facebook.presto.hive.RcFileFileWriterFactory.java
License:Apache License
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session)
{
    if (!HiveSessionProperties.isRcfileOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }

    if (!RCFileOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }

    RcFileEncoding rcFileEncoding;
    if (LazyBinaryColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) {
        rcFileEncoding = new BinaryRcFileEncoding();
    }
    else if (ColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) {
        rcFileEncoding = createTextVectorEncoding(schema, hiveStorageTimeZone);
    }
    else {
        return Optional.empty();
    }

    Optional<String> codecName = Optional.ofNullable(configuration.get(FileOutputFormat.COMPRESS_CODEC));

    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = Splitter.on(',').trimResults().omitEmptyStrings()
            .splitToList(schema.getProperty(META_TABLE_COLUMNS, ""));
    List<Type> fileColumnTypes = toHiveTypes(schema.getProperty(META_TABLE_COLUMN_TYPES, "")).stream()
            .map(hiveType -> hiveType.getType(typeManager))
            .collect(toList());

    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();

    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        OutputStream outputStream = fileSystem.create(path);

        Optional<Supplier<RcFileDataSource>> validationInputFactory = Optional.empty();
        if (HiveSessionProperties.isRcfileOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsRcFileDataSource(path.toString(), fileSystem.open(path), fileSystem.getFileStatus(path).getLen(), stats);
                }
                catch (IOException e) {
                    throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }

        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };

        return Optional.of(new RcFileFileWriter(
                outputStream,
                rollbackAction,
                rcFileEncoding,
                fileColumnTypes,
                codecName,
                fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(HiveMetadata.PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .build(),
                validationInputFactory));
    }
    catch (Exception e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating RCFile file", e);
    }
}
From source file:com.facebook.presto.parquet.reader.MetadataReader.java
License:Apache License
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file, long fileSize) throws IOException
{
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        return readFooter(inputStream, file, fileSize);
    }
}
From source file:com.finderbots.miner.RegexUrlFilter.java
License:Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile) throws IOException, InterruptedException
{
    // this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line) && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    }
    return filterList;
}
From source file:com.fullcontact.cassandra.io.compress.CompressionMetadata.java
License:Apache License
@VisibleForTesting
CompressionMetadata(String indexFilePath, long compressedLength, FileSystem fs)
{
    this.indexFilePath = indexFilePath;

    DataInputStream stream;
    try {
        stream = fs.open(new Path(indexFilePath));
    } catch (FileNotFoundException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    try {
        String compressorName = stream.readUTF();
        int optionCount = stream.readInt();
        Map<String, String> options = new HashMap<String, String>();
        for (int i = 0; i < optionCount; ++i) {
            String key = stream.readUTF();
            String value = stream.readUTF();
            options.put(key, value);
        }
        int chunkLength = stream.readInt();
        try {
            parameters = new CompressionParameters(compressorName, chunkLength, options);
        } catch (ConfigurationException e) {
            throw new RuntimeException("Cannot create CompressionParameters for stored parameters", e);
        }

        dataLength = stream.readLong();
        compressedFileLength = compressedLength;
        chunkOffsets = readChunkOffsets(stream);
    } catch (IOException e) {
        throw new CorruptSSTableException(e, indexFilePath);
    } finally {
        FileUtils.closeQuietly(stream);
    }
}
From source file:com.fullcontact.cassandra.io.util.RandomAccessReader.java
License:Apache License
protected RandomAccessReader(Path file, int bufferSize, boolean skipIOCache, PoolingSegmentedFile owner, FileSystem fs) throws FileNotFoundException
{
    inputPath = file;
    try {
        inputFileStatus = fs.getFileStatus(inputPath);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    this.fs = fs;
    try {
        this.input = fs.open(file);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    this.owner = owner;

    filePath = file.toString();

    // allocating required size of the buffer
    if (bufferSize <= 0)
        throw new IllegalArgumentException("bufferSize must be positive");
    buffer = new byte[bufferSize];

    this.skipIOCache = skipIOCache;

    // we can cache file length in read-only mode
    try {
        fileLength = fs.getFileStatus(file).getLen();
    } catch (IOException e) {
        throw new FSReadError(e, filePath);
    }
    validBufferBytes = -1; // that will trigger reBuffer() on demand by read/seek operations
}
From source file:com.fullcontact.sstable.hadoop.IndexOffsetScanner.java
License:Apache License
/**
 * Hadoop fs based version.
 *
 * @param filename   File name.
 * @param fileSystem File system.
 */
public IndexOffsetScanner(final String filename, final FileSystem fileSystem)
{
    closer = Closer.create();
    try {
        final FSDataInputStream inputStream = fileSystem.open(new Path(filename));
        this.input = closer.register(new DataInputStream(new FastBufferedInputStream(inputStream)));
    } catch (IOException e) {
        throw new IOError(e);
    }
}