List of usage examples for org.apache.hadoop.fs.FSDataInputStream.seek
@Override public void seek(long desired) throws IOException
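seek positions the stream at an absolute byte offset from the start of the file, so the next read starts there; seeking past the end of the file is an error. Before the project-specific examples below, a minimal self-contained sketch of the call (the path, offset, and class name are illustrative placeholders, not from any of the examples):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/example.txt"); // hypothetical path, assumed to exist
        try (FSDataInputStream in = fs.open(path)) {
            in.seek(128); // position the stream at absolute byte offset 128
            int firstByte = in.read(); // returns the byte at offset 128
            System.out.println(firstByte);
        }
    }
}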
From source file: org.apache.druid.storage.hdfs.tasklog.HdfsTaskLogs.java
License: Apache License
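In this Druid task-log reader, a negative offset is interpreted as relative to the end of the file (a tail read): the seek position is computed as max(0, fileLength + offset) before seeking.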
private Optional<ByteSource> streamTaskFile(final Path path, final long offset) throws IOException {
    final FileSystem fs = path.getFileSystem(hadoopConfig);
    if (fs.exists(path)) {
        return Optional.of(new ByteSource() {
            @Override
            public InputStream openStream() throws IOException {
                log.info("Reading task log from: %s", path);
                final long seekPos;
                if (offset < 0) {
                    final FileStatus stat = fs.getFileStatus(path);
                    seekPos = Math.max(0, stat.getLen() + offset);
                } else {
                    seekPos = offset;
                }
                final FSDataInputStream inputStream = fs.open(path);
                inputStream.seek(seekPos);
                log.info("Read task log from: %s (seek = %,d)", path, seekPos);
                return inputStream;
            }
        });
    } else {
        return Optional.absent();
    }
}
From source file: org.apache.gobblin.data.management.copy.extractor.FileAwareInputStreamExtractor.java
License: Apache License
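Here Gobblin seeks the opened stream to the low position of a distcp file split (when the work unit represents a split), so each work unit copies only its own byte range of the source file.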
protected FileAwareInputStream buildStream(FileSystem fsFromFile) throws DataRecordException, IOException {
    this.recordRead = true;
    FileAwareInputStream.FileAwareInputStreamBuilder builder = FileAwareInputStream.builder().file(this.file);
    if (this.file.getFileStatus().isDirectory()) {
        return builder.inputStream(EmptyInputStream.instance).build();
    }
    FSDataInputStream dataInputStream = fsFromFile.open(this.file.getFileStatus().getPath());
    if (this.state != null && DistcpFileSplitter.isSplitWorkUnit(this.state)) {
        Optional<DistcpFileSplitter.Split> split = DistcpFileSplitter.getSplit(this.state);
        builder.split(split);
        if (split.isPresent()) {
            dataInputStream.seek(split.get().getLowPosition());
        }
    }
    builder.inputStream(MeteredInputStream.builder().in(dataInputStream).build());
    return builder.build();
}
From source file: org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriterTest.java
License: Apache License
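This test constructs a 4-byte split over the string "testContents", seeks the stream to the split's low position, writes it, and asserts that only the first split's bytes ("test") land in the output file.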
@Test
public void testBlockWrite() throws Exception {
    String streamString = "testContents";
    FileStatus status = fs.getFileStatus(testTempPath);
    OwnerAndPermission ownerAndPermission = new OwnerAndPermission(status.getOwner(), status.getGroup(),
            new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    CopyableFile cf = CopyableFileUtils.getTestCopyableFile(ownerAndPermission);
    CopyableDatasetMetadata metadata = new CopyableDatasetMetadata(new TestCopyableDataset(new Path("/source")));
    WorkUnitState state = TestUtils.createTestWorkUnitState();
    state.setProp(ConfigurationKeys.WRITER_STAGING_DIR, new Path(testTempPath, "staging").toString());
    state.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, new Path(testTempPath, "output").toString());
    state.setProp(ConfigurationKeys.WRITER_FILE_PATH, RandomStringUtils.randomAlphabetic(5));
    state.setProp(DistcpFileSplitter.SPLIT_ENABLED, true);
    CopySource.serializeCopyEntity(state, cf);
    CopySource.serializeCopyableDataset(state, metadata);
    FileAwareInputStreamDataWriter dataWriter = new FileAwareInputStreamDataWriter(state, 1, 0);
    long splitLen = 4;
    int splits = (int) (streamString.length() / splitLen + 1);
    DistcpFileSplitter.Split split = new DistcpFileSplitter.Split(0, splitLen, 0, splits,
            String.format("%s.__PART%d__", cf.getDestination().getName(), 0));
    FSDataInputStream dataInputStream = StreamUtils.convertStream(IOUtils.toInputStream(streamString));
    dataInputStream.seek(split.getLowPosition());
    FileAwareInputStream fileAwareInputStream = FileAwareInputStream.builder().file(cf)
            .inputStream(dataInputStream).split(Optional.of(split)).build();
    dataWriter.write(fileAwareInputStream);
    dataWriter.commit();
    Path writtenFilePath = new Path(new Path(state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR),
            cf.getDatasetAndPartition(metadata).identifier()), cf.getDestination());
    Assert.assertEquals(IOUtils.toString(new FileInputStream(writtenFilePath.toString())),
            streamString.substring(0, (int) splitLen));
}
From source file: org.apache.hama.bsp.LineRecordReader.java
License: Apache License
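The canonical split-reading pattern: for uncompressed input that starts mid-file, seek back one byte and discard the first (partial) line, since that line belongs to the previous split's reader; "start" is then advanced past the skipped bytes.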
public LineRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("bsp.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file: org.apache.hawq.pxf.plugins.json.JsonRecordReader.java
License: Apache License
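The same pattern for multi-line JSON records: for uncompressed input the reader seeks straight to the split start; for compressed input it must read the whole stream from offset 0, so start is reset and end is set to Long.MAX_VALUE.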
/**
 * Create new multi-line json object reader.
 *
 * @param conf Hadoop context
 * @param split HDFS split to start the reading from
 * @throws IOException IOException when reading the file
 */
public JsonRecordReader(JobConf conf, FileSplit split) throws IOException {
    this.jsonMemberName = conf.get(RECORD_MEMBER_IDENTIFIER);
    this.maxObjectLength = conf.getInt(RECORD_MAX_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (codec != null) {
        is = codec.createInputStream(fileIn);
        start = 0;
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            fileIn.seek(start);
        }
        is = fileIn;
    }
    parser = new PartitionedJsonParser(is);
    this.pos = start;
}
From source file: org.apache.ignite.igfs.HadoopIgfs20FileSystemAbstractSelfTest.java
License: Apache License
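This Ignite test writes 512 KB in 256-byte chunks, then has THREAD_CNT threads each open the same file, seek to a random 256-byte-aligned offset, and read sequentially to the end, verifying that every chunk-sized read returns a full 256 bytes and that concurrent positioned reads are independent.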
/**
 * Test concurrent reads within the file.
 *
 * @throws Exception If failed.
 */
public void testMultithreadedOpen() throws Exception {
    final byte[] dataChunk = new byte[256];

    for (int i = 0; i < dataChunk.length; i++)
        dataChunk[i] = (byte) i;

    Path dir = new Path(new Path(primaryFsUri), "/dir");

    fs.mkdir(dir, FsPermission.getDefault(), true);

    final Path file = new Path(dir, "file");

    FSDataOutputStream os = fs.create(file, EnumSet.noneOf(CreateFlag.class),
        Options.CreateOpts.perms(FsPermission.getDefault()));

    // Write 256 * 2048 = 512Kb of data.
    for (int i = 0; i < 2048; i++)
        os.write(dataChunk);

    os.close();

    final AtomicBoolean err = new AtomicBoolean();

    multithreaded(new Runnable() {
        @Override public void run() {
            FSDataInputStream is = null;

            try {
                int pos = ThreadLocalRandom8.current().nextInt(2048);

                try {
                    is = fs.open(file);
                }
                finally {
                    U.awaitQuiet(barrier);
                }

                is.seek(256 * pos);

                byte[] buf = new byte[256];

                for (int i = pos; i < 2048; i++) {
                    // First perform normal read.
                    int read = is.read(buf);

                    assert read == 256;
                    assert Arrays.equals(dataChunk, buf);
                }

                int res = is.read(buf);

                assert res == -1;
            }
            catch (IOException ignore) {
                err.set(true);
            }
            finally {
                U.closeQuiet(is);
            }
        }
    }, THREAD_CNT);

    assert !err.get();
}
From source file: org.apache.ignite.igfs.IgfsHadoopFileSystemAbstractSelfTest.java
License: Apache License
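An almost identical concurrent-read test for the other IGFS FileSystem implementation; the seek-and-read logic is the same, only the directory and file creation calls differ.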
/**
 * Test concurrent reads within the file.
 *
 * @throws Exception If failed.
 */
public void testMultithreadedOpen() throws Exception {
    final byte[] dataChunk = new byte[256];

    for (int i = 0; i < dataChunk.length; i++)
        dataChunk[i] = (byte) i;

    Path dir = new Path(new Path(PRIMARY_URI), "/dir");

    assert fs.mkdirs(dir);

    final Path file = new Path(dir, "file");

    FSDataOutputStream os = fs.create(file);

    // Write 256 * 2048 = 512Kb of data.
    for (int i = 0; i < 2048; i++)
        os.write(dataChunk);

    os.close();

    final AtomicBoolean err = new AtomicBoolean();

    multithreaded(new Runnable() {
        @Override public void run() {
            FSDataInputStream is = null;

            try {
                int pos = ThreadLocalRandom8.current().nextInt(2048);

                try {
                    is = fs.open(file);
                }
                finally {
                    U.awaitQuiet(barrier);
                }

                is.seek(256 * pos);

                byte[] buf = new byte[256];

                for (int i = pos; i < 2048; i++) {
                    // First perform normal read.
                    int read = is.read(buf);

                    assert read == 256;
                    assert Arrays.equals(dataChunk, buf);
                }

                int res = is.read(buf);

                assert res == -1;
            }
            catch (IOException ignore) {
                err.set(true);
            }
            finally {
                U.closeQuiet(is);
            }
        }
    }, THREAD_CNT);

    assert !err.get();
}
From source file: org.apache.jena.grande.mapreduce.io.QuadRecordReader.java
License: Apache License
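Jena's quad record reader uses the same seek-and-skip-first-line idiom as the Hama reader above, after configuring a RIOT parser profile for the split.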
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    log.debug("initialize({}, {})", genericSplit, context);
    FileSplit split = (FileSplit) genericSplit;
    profile = Utils.createParserProfile(context, split.getPath()); // RIOT configuration
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file: org.apache.jena.hadoop.rdf.io.input.readers.AbstractBlockBasedNodeTupleReader.java
License: Apache License
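This block-based reader seeks to the split start and then wraps the stream: compressed input must cover the whole file (a compressed stream cannot be resumed mid-file), while an uncompressed partial split is bounded with a BlockInputStream; parsing runs on a background daemon thread fed by the tracked stream.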
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;

    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn(
                "Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown. Consider setting {} to false to disable this behaviour",
                RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);

    // Figure out what portion of the file to read
    start = split.getStart();
    long end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    boolean readToEnd = end == totalLength;
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);

    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d",
            new Object[] { start, split.getLength(), totalLength }));

    // Open the file and prepare the input stream
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    this.length = split.getLength();
    if (start > 0)
        fileIn.seek(start);

    if (this.compressionCodecs != null) {
        // Compressed input
        // For compressed input NLineInputFormat will have failed to find
        // any line breaks and will give us a split from 0 -> (length - 1).
        // Add 1 and re-verify readToEnd so we can abort correctly if ever
        // given a partial split of a compressed file
        end++;
        readToEnd = end == totalLength;
        if (start > 0 || !readToEnd)
            throw new IOException(
                    "This record reader can only be used with compressed input where the split is a whole file");
        input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
    } else {
        // Uncompressed input
        if (readToEnd) {
            input = new TrackedInputStream(fileIn);
        } else {
            // Need to limit the portion of the file we are reading
            input = new BlockInputStream(fileIn, split.getLength());
        }
    }

    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);
    this.parserThread = new Thread(parserRunnable);
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}
From source file: org.apache.jena.hadoop.rdf.io.input.readers.AbstractLineBasedNodeTupleReader.java
License: Apache License
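The line-based variant rejects compressed input unless the split covers the whole file; for uncompressed input it seeks back one byte and skips the first partial line before handing the stream to a LineReader, as in the other line-oriented readers above.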
@Override
public final void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;

    // Intermediate: RDFParser but need to make an Iterator<Quad/Triple>
    LabelToNode labelToNode = RdfIOUtils.createLabelToNode(context, split.getPath());
    maker = new ParserProfileStd(RiotLib.factoryRDF(labelToNode), ErrorHandlerFactory.errorHandlerStd,
            IRIResolver.create(), PrefixMapFactory.createForInput(), null, true, false);

    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn(
                "Configured to ignore bad tuples, parsing errors will be logged and the bad line skipped but no errors will be thrown. Consider setting {} to false to disable this behaviour",
                RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);

    // Figure out what portion of the file to read
    this.maxLineLength = config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    compressionCodecs = new CompressionCodecFactory(config);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d",
            new Object[] { start, split.getLength(), totalLength }));

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        // Compressed input
        // For compressed input NLineInputFormat will have failed to find
        // any line breaks and will give us a split from 0 -> (length - 1).
        // Add 1 and verify we got complete split
        if (totalLength > split.getLength() + 1)
            throw new IOException(
                    "This record reader can only be used with compressed input where the split covers the whole file");
        in = new LineReader(codec.createInputStream(fileIn), config);
        estLength = end;
        end = Long.MAX_VALUE;
    } else {
        // Uncompressed input
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, config);
    }

    // Skip first line and re-establish "start".
    // This is to do with how line reader reads lines and how
    // NLineInputFormat will provide the split information to use
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}