Usage examples for org.apache.hadoop.fs.FileSystem#open
public FSDataInputStream open(Path f) throws IOException
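Before the per-project examples below, here is a minimal, self-contained sketch of the typical call pattern: resolve the FileSystem for a Path, look up the file length, open the stream, and read. The class name, configuration, and file location are hypothetical, chosen only for illustration.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenExample {
    // Open the file behind `location` and read its entire contents into memory.
    public static byte[] readAllBytes(Configuration conf, String location) throws IOException {
        Path path = new Path(location); // e.g. "hdfs:///tmp/example.txt" (hypothetical)
        FileSystem fileSystem = path.getFileSystem(conf);
        int length = (int) fileSystem.getFileStatus(path).getLen();
        try (FSDataInputStream inputStream = fileSystem.open(path)) {
            byte[] buffer = new byte[length];
            inputStream.readFully(buffer); // DataInputStream#readFully fills the whole buffer
            return buffer;
        }
    }
}

Most of the examples that follow use the same shape: fetch the length with getFileStatus, call open, and wrap the returned FSDataInputStream in a reader or data source.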
From source file:com.facebook.presto.hive.parquet.HdfsParquetDataSource.java
License:Apache License
public static HdfsParquetDataSource buildHdfsParquetDataSource(FileSystem fileSystem, Path path, long start, long length)
{
    try {
        long size = fileSystem.getFileStatus(path).getLen();
        FSDataInputStream inputStream = fileSystem.open(path);
        return new HdfsParquetDataSource(path, size, inputStream);
    }
    catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT,
                format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage()), e);
    }
}
From source file:com.facebook.presto.hive.parquet.ParquetTester.java
License:Apache License
private static void assertFileContents(JobConf jobConf, ObjectInspector objectInspector, TempFile tempFile, Iterable<?> expectedValues, Type type)
        throws IOException, InterruptedException
{
    Path path = new Path(tempFile.getFile().toURI());
    FileSystem fileSystem = path.getFileSystem(jobConf);
    ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
    FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    MessageType fileSchema = fileMetaData.getSchema();

    long size = fileSystem.getFileStatus(path).getLen();
    FSDataInputStream inputStream = fileSystem.open(path);
    ParquetDataSource dataSource = new HdfsParquetDataSource(path, size, inputStream);

    TypeManager typeManager = new TypeRegistry();
    ParquetReader parquetReader = new ParquetReader(fileSchema, fileSchema, parquetMetadata.getBlocks(), dataSource, typeManager);
    assertEquals(parquetReader.getPosition(), 0);

    int rowsProcessed = 0;
    Iterator<?> iterator = expectedValues.iterator();
    for (int batchSize = parquetReader.nextBatch(); batchSize >= 0; batchSize = parquetReader.nextBatch()) {
        ColumnDescriptor columnDescriptor = fileSchema.getColumns().get(0);
        Block block = parquetReader.readPrimitive(columnDescriptor, type);
        for (int i = 0; i < batchSize; i++) {
            assertTrue(iterator.hasNext());
            Object expected = iterator.next();
            Object actual = decodeObject(type, block, i);
            assertEquals(actual, expected);
        }
        rowsProcessed += batchSize;
        assertEquals(parquetReader.getPosition(), rowsProcessed);
    }
    assertFalse(iterator.hasNext());
    assertEquals(parquetReader.getPosition(), rowsProcessed);
    parquetReader.close();
}
From source file:com.facebook.presto.hive.parquet.reader.ParquetMetadataReader.java
License:Apache License
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file) throws IOException
{
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC
        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;

        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);

        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s",
                file, Arrays.toString(MAGIC), Arrays.toString(magic));

        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
                "Corrupted Parquet file: %s metadata index: %s out of range", file, metadataIndex);
        inputStream.seek(metadataIndex);

        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);

        MessageType messageType = readParquetSchema(schema);
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet(
                            (filePath == null && columnChunk.getFile_path() == null)
                                    || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }

        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}
From source file:com.facebook.presto.hive.rcfile.RcFilePageSourceFactory.java
License:Apache License
@Override
public Optional<? extends ConnectorPageSource> createPageSource(Configuration configuration, ConnectorSession session, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone hiveStorageTimeZone)
{
    if (!isRcfileOptimizedReaderEnabled(session)) {
        return Optional.empty();
    }

    RcFileEncoding rcFileEncoding;
    String deserializerClassName = getDeserializerClassName(schema);
    if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) {
        rcFileEncoding = new BinaryRcFileEncoding();
    }
    else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) {
        rcFileEncoding = createTextVectorEncoding(schema, hiveStorageTimeZone);
    }
    else {
        return Optional.empty();
    }

    long size;
    FSDataInputStream inputStream;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        size = fileSystem.getFileStatus(path).getLen();
        inputStream = fileSystem.open(path);
    }
    catch (Exception e) {
        throw Throwables.propagate(e);
    }

    try {
        ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
        for (HiveColumnHandle column : columns) {
            readColumns.put(column.getHiveColumnIndex(), column.getHiveType().getType(typeManager));
        }

        RcFileReader rcFileReader = new RcFileReader(
                new HdfsRcFileDataSource(path.toString(), inputStream, size),
                rcFileEncoding,
                readColumns.build(),
                new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())),
                start,
                length,
                new DataSize(1, Unit.MEGABYTE));

        return Optional.of(new RcFilePageSource(rcFileReader, columns, hiveStorageTimeZone, typeManager));
    }
    catch (Throwable e) {
        try {
            inputStream.close();
        }
        catch (IOException ignored) {
        }
        throw Throwables.propagate(e);
    }
}
From source file:com.facebook.presto.hive.RcFileFileWriterFactory.java
License:Apache License
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session)
{
    if (!HiveSessionProperties.isRcfileOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }

    if (!RCFileOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }

    RcFileEncoding rcFileEncoding;
    if (LazyBinaryColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) {
        rcFileEncoding = new BinaryRcFileEncoding();
    }
    else if (ColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) {
        rcFileEncoding = createTextVectorEncoding(schema, hiveStorageTimeZone);
    }
    else {
        return Optional.empty();
    }

    Optional<String> codecName = Optional.ofNullable(configuration.get(FileOutputFormat.COMPRESS_CODEC));

    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = Splitter.on(',').trimResults().omitEmptyStrings()
            .splitToList(schema.getProperty(META_TABLE_COLUMNS, ""));
    List<Type> fileColumnTypes = toHiveTypes(schema.getProperty(META_TABLE_COLUMN_TYPES, "")).stream()
            .map(hiveType -> hiveType.getType(typeManager))
            .collect(toList());

    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();

    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        OutputStream outputStream = fileSystem.create(path);

        Optional<Supplier<RcFileDataSource>> validationInputFactory = Optional.empty();
        if (HiveSessionProperties.isRcfileOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsRcFileDataSource(path.toString(), fileSystem.open(path), fileSystem.getFileStatus(path).getLen(), stats);
                }
                catch (IOException e) {
                    throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }

        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };

        return Optional.of(new RcFileFileWriter(
                outputStream,
                rollbackAction,
                rcFileEncoding,
                fileColumnTypes,
                codecName,
                fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(HiveMetadata.PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .build(),
                validationInputFactory));
    }
    catch (Exception e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating RCFile file", e);
    }
}
From source file:com.facebook.presto.parquet.reader.MetadataReader.java
License:Apache License
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file, long fileSize) throws IOException
{
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        return readFooter(inputStream, file, fileSize);
    }
}
From source file:com.finderbots.miner.RegexUrlFilter.java
License:Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile) throws IOException, InterruptedException
{
    // this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line) && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    }
    return filterList;
}
From source file:com.fullcontact.cassandra.io.compress.CompressionMetadata.java
License:Apache License
@VisibleForTesting
CompressionMetadata(String indexFilePath, long compressedLength, FileSystem fs)
{
    this.indexFilePath = indexFilePath;

    DataInputStream stream;
    try {
        stream = fs.open(new Path(indexFilePath));
    } catch (FileNotFoundException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    try {
        String compressorName = stream.readUTF();
        int optionCount = stream.readInt();
        Map<String, String> options = new HashMap<String, String>();
        for (int i = 0; i < optionCount; ++i) {
            String key = stream.readUTF();
            String value = stream.readUTF();
            options.put(key, value);
        }
        int chunkLength = stream.readInt();
        try {
            parameters = new CompressionParameters(compressorName, chunkLength, options);
        } catch (ConfigurationException e) {
            throw new RuntimeException("Cannot create CompressionParameters for stored parameters", e);
        }

        dataLength = stream.readLong();
        compressedFileLength = compressedLength;
        chunkOffsets = readChunkOffsets(stream);
    } catch (IOException e) {
        throw new CorruptSSTableException(e, indexFilePath);
    } finally {
        FileUtils.closeQuietly(stream);
    }
}
From source file:com.fullcontact.cassandra.io.util.RandomAccessReader.java
License:Apache License
protected RandomAccessReader(Path file, int bufferSize, boolean skipIOCache, PoolingSegmentedFile owner, FileSystem fs) throws FileNotFoundException
{
    inputPath = file;
    try {
        inputFileStatus = fs.getFileStatus(inputPath);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    this.fs = fs;
    try {
        this.input = fs.open(file);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    this.owner = owner;

    filePath = file.toString();

    // allocating required size of the buffer
    if (bufferSize <= 0)
        throw new IllegalArgumentException("bufferSize must be positive");
    buffer = new byte[bufferSize];

    this.skipIOCache = skipIOCache;

    // we can cache file length in read-only mode
    try {
        fileLength = fs.getFileStatus(file).getLen();
    } catch (IOException e) {
        throw new FSReadError(e, filePath);
    }
    validBufferBytes = -1; // that will trigger reBuffer() on demand by read/seek operations
}
From source file:com.fullcontact.sstable.hadoop.IndexOffsetScanner.java
License:Apache License
/**
 * Hadoop fs based version.
 *
 * @param filename   File name.
 * @param fileSystem File system.
 */
public IndexOffsetScanner(final String filename, final FileSystem fileSystem)
{
    closer = Closer.create();
    try {
        final FSDataInputStream inputStream = fileSystem.open(new Path(filename));
        this.input = closer.register(new DataInputStream(new FastBufferedInputStream(inputStream)));
    } catch (IOException e) {
        throw new IOError(e);
    }
}