Example usage for org.apache.hadoop.fs FSDataInputStream seek

List of usage examples for org.apache.hadoop.fs FSDataInputStream seek

Introduction

On this page you can find example usages of org.apache.hadoop.fs FSDataInputStream seek.

Prototype

@Override
public void seek(long desired) throws IOException 

Document

Seek to the given offset.
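
The snippet below is a minimal, self-contained sketch of a typical call: open a file, seek to a byte offset, and read from that position. The path ("/tmp/example.txt") and the offset are hypothetical placeholders, not taken from the usage examples that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical path; point this at a file that exists on your file system.
        Path path = new Path("/tmp/example.txt");
        FileSystem fs = path.getFileSystem(conf);
        try (FSDataInputStream in = fs.open(path)) {
            in.seek(128L);            // move the read position to byte offset 128
            byte[] buf = new byte[64];
            int read = in.read(buf);  // the read starts at the seeked position
            System.out.println("Read " + read + " bytes starting at offset 128");
        }
    }
}

As in most of the examples below, the common pattern is to open the stream with FileSystem.open and call seek before the first read.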

Usage

From source file:org.apache.druid.storage.hdfs.tasklog.HdfsTaskLogs.java

License:Apache License

private Optional<ByteSource> streamTaskFile(final Path path, final long offset) throws IOException {
    final FileSystem fs = path.getFileSystem(hadoopConfig);
    if (fs.exists(path)) {
        return Optional.of(new ByteSource() {
            @Override
            public InputStream openStream() throws IOException {
                log.info("Reading task log from: %s", path);
                final long seekPos;
                if (offset < 0) {
                    final FileStatus stat = fs.getFileStatus(path);
                    seekPos = Math.max(0, stat.getLen() + offset);
                } else {
                    seekPos = offset;
                }
                final FSDataInputStream inputStream = fs.open(path);
                inputStream.seek(seekPos);
                log.info("Read task log from: %s (seek = %,d)", path, seekPos);
                return inputStream;
            }
        });
    } else {
        return Optional.absent();
    }
}

From source file:org.apache.gobblin.data.management.copy.extractor.FileAwareInputStreamExtractor.java

License:Apache License

protected FileAwareInputStream buildStream(FileSystem fsFromFile) throws DataRecordException, IOException {
    this.recordRead = true;
    FileAwareInputStream.FileAwareInputStreamBuilder builder = FileAwareInputStream.builder().file(this.file);
    if (this.file.getFileStatus().isDirectory()) {
        return builder.inputStream(EmptyInputStream.instance).build();
    }

    FSDataInputStream dataInputStream = fsFromFile.open(this.file.getFileStatus().getPath());
    if (this.state != null && DistcpFileSplitter.isSplitWorkUnit(this.state)) {
        Optional<DistcpFileSplitter.Split> split = DistcpFileSplitter.getSplit(this.state);
        builder.split(split);
        if (split.isPresent()) {
            dataInputStream.seek(split.get().getLowPosition());
        }
    }
    builder.inputStream(MeteredInputStream.builder().in(dataInputStream).build());
    return builder.build();
}

From source file:org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriterTest.java

License:Apache License

@Test
public void testBlockWrite() throws Exception {
    String streamString = "testContents";

    FileStatus status = fs.getFileStatus(testTempPath);
    OwnerAndPermission ownerAndPermission = new OwnerAndPermission(status.getOwner(), status.getGroup(),
            new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    CopyableFile cf = CopyableFileUtils.getTestCopyableFile(ownerAndPermission);

    CopyableDatasetMetadata metadata = new CopyableDatasetMetadata(
            new TestCopyableDataset(new Path("/source")));

    WorkUnitState state = TestUtils.createTestWorkUnitState();
    state.setProp(ConfigurationKeys.WRITER_STAGING_DIR, new Path(testTempPath, "staging").toString());
    state.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, new Path(testTempPath, "output").toString());
    state.setProp(ConfigurationKeys.WRITER_FILE_PATH, RandomStringUtils.randomAlphabetic(5));
    state.setProp(DistcpFileSplitter.SPLIT_ENABLED, true);
    CopySource.serializeCopyEntity(state, cf);
    CopySource.serializeCopyableDataset(state, metadata);

    FileAwareInputStreamDataWriter dataWriter = new FileAwareInputStreamDataWriter(state, 1, 0);

    long splitLen = 4;
    int splits = (int) (streamString.length() / splitLen + 1);
    DistcpFileSplitter.Split split = new DistcpFileSplitter.Split(0, splitLen, 0, splits,
            String.format("%s.__PART%d__", cf.getDestination().getName(), 0));
    FSDataInputStream dataInputStream = StreamUtils.convertStream(IOUtils.toInputStream(streamString));
    dataInputStream.seek(split.getLowPosition());
    FileAwareInputStream fileAwareInputStream = FileAwareInputStream.builder().file(cf)
            .inputStream(dataInputStream).split(Optional.of(split)).build();
    dataWriter.write(fileAwareInputStream);
    dataWriter.commit();
    Path writtenFilePath = new Path(new Path(state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR),
            cf.getDatasetAndPartition(metadata).identifier()), cf.getDestination());
    Assert.assertEquals(IOUtils.toString(new FileInputStream(writtenFilePath.toString())),
            streamString.substring(0, (int) splitLen));
}

From source file:org.apache.hama.bsp.LineRecordReader.java

License:Apache License

public LineRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("bsp.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:org.apache.hawq.pxf.plugins.json.JsonRecordReader.java

License:Apache License

/**
 * Create new multi-line json object reader.
 *
 * @param conf
 *            Hadoop context
 * @param split
 *            HDFS split to start the reading from
 * @throws IOException IOException when reading the file
 */
public JsonRecordReader(JobConf conf, FileSplit split) throws IOException {

    this.jsonMemberName = conf.get(RECORD_MEMBER_IDENTIFIER);
    this.maxObjectLength = conf.getInt(RECORD_MAX_LENGTH, Integer.MAX_VALUE);

    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (codec != null) {
        is = codec.createInputStream(fileIn);
        start = 0;
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            fileIn.seek(start);
        }
        is = fileIn;
    }
    parser = new PartitionedJsonParser(is);
    this.pos = start;
}

From source file:org.apache.ignite.igfs.HadoopIgfs20FileSystemAbstractSelfTest.java

License:Apache License

/**
 * Test concurrent reads within the file.
 *
 * @throws Exception If failed.
 */
public void testMultithreadedOpen() throws Exception {
    final byte[] dataChunk = new byte[256];

    for (int i = 0; i < dataChunk.length; i++)
        dataChunk[i] = (byte) i;

    Path dir = new Path(new Path(primaryFsUri), "/dir");

    fs.mkdir(dir, FsPermission.getDefault(), true);

    final Path file = new Path(dir, "file");

    FSDataOutputStream os = fs.create(file, EnumSet.noneOf(CreateFlag.class),
            Options.CreateOpts.perms(FsPermission.getDefault()));

    // Write 256 * 2048 = 512Kb of data.
    for (int i = 0; i < 2048; i++)
        os.write(dataChunk);

    os.close();

    final AtomicBoolean err = new AtomicBoolean();

    multithreaded(new Runnable() {
        @Override
        public void run() {
            FSDataInputStream is = null;

            try {
                int pos = ThreadLocalRandom8.current().nextInt(2048);

                try {
                    is = fs.open(file);
                } finally {
                    U.awaitQuiet(barrier);
                }

                is.seek(256 * pos);

                byte[] buf = new byte[256];

                for (int i = pos; i < 2048; i++) {
                    // First perform normal read.
                    int read = is.read(buf);

                    assert read == 256;

                    Arrays.equals(dataChunk, buf);
                }

                int res = is.read(buf);

                assert res == -1;
            } catch (IOException ignore) {
                err.set(true);
            } finally {
                U.closeQuiet(is);
            }
        }
    }, THREAD_CNT);

    assert !err.get();
}

From source file:org.apache.ignite.igfs.IgfsHadoopFileSystemAbstractSelfTest.java

License:Apache License

/**
 * Test concurrent reads within the file.
 *
 * @throws Exception If failed.
 */
public void testMultithreadedOpen() throws Exception {
    final byte[] dataChunk = new byte[256];

    for (int i = 0; i < dataChunk.length; i++)
        dataChunk[i] = (byte) i;

    Path dir = new Path(new Path(PRIMARY_URI), "/dir");

    assert fs.mkdirs(dir);

    final Path file = new Path(dir, "file");

    FSDataOutputStream os = fs.create(file);

    // Write 256 * 2048 = 512Kb of data.
    for (int i = 0; i < 2048; i++)
        os.write(dataChunk);

    os.close();

    final AtomicBoolean err = new AtomicBoolean();

    multithreaded(new Runnable() {
        @Override
        public void run() {
            FSDataInputStream is = null;

            try {
                int pos = ThreadLocalRandom8.current().nextInt(2048);

                try {
                    is = fs.open(file);
                } finally {
                    U.awaitQuiet(barrier);
                }

                is.seek(256 * pos);

                byte[] buf = new byte[256];

                for (int i = pos; i < 2048; i++) {
                    // First perform normal read.
                    int read = is.read(buf);

                    assert read == 256;

                    Arrays.equals(dataChunk, buf);
                }

                int res = is.read(buf);

                assert res == -1;
            } catch (IOException ignore) {
                err.set(true);
            } finally {
                U.closeQuiet(is);
            }
        }
    }, THREAD_CNT);

    assert !err.get();
}

From source file:org.apache.jena.grande.mapreduce.io.QuadRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    log.debug("initialize({}, {})", genericSplit, context);

    FileSplit split = (FileSplit) genericSplit;
    profile = Utils.createParserProfile(context, split.getPath()); // RIOT configuration
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:org.apache.jena.hadoop.rdf.io.input.readers.AbstractBlockBasedNodeTupleReader.java

License:Apache License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;

    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn(
                "Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown.  Consider setting {} to false to disable this behaviour",
                RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);

    // Figure out what portion of the file to read
    start = split.getStart();
    long end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    boolean readToEnd = end == totalLength;
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);

    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d",
            new Object[] { start, split.getLength(), totalLength }));

    // Open the file and prepare the input stream
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    this.length = split.getLength();
    if (start > 0)
        fileIn.seek(start);

    if (this.compressionCodecs != null) {
        // Compressed input
        // For compressed input NLineInputFormat will have failed to find
        // any line breaks and will give us a split from 0 -> (length - 1)
        // Add 1 and re-verify readToEnd so we can abort correctly if ever
        // given a partial split of a compressed file
        end++;
        readToEnd = end == totalLength;
        if (start > 0 || !readToEnd)
            throw new IOException(
                    "This record reader can only be used with compressed input where the split is a whole file");
        input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
    } else {
        // Uncompressed input

        if (readToEnd) {
            input = new TrackedInputStream(fileIn);
        } else {
            // Need to limit the portion of the file we are reading
            input = new BlockInputStream(fileIn, split.getLength());
        }
    }

    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);

    this.parserThread = new Thread(parserRunnable);
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}

From source file:org.apache.jena.hadoop.rdf.io.input.readers.AbstractLineBasedNodeTupleReader.java

License:Apache License

@Override
public final void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;

    // Intermediate : RDFParser but need to make a Iterator<Quad/Triple>
    LabelToNode labelToNode = RdfIOUtils.createLabelToNode(context, split.getPath());
    maker = new ParserProfileStd(RiotLib.factoryRDF(labelToNode), ErrorHandlerFactory.errorHandlerStd,
            IRIResolver.create(), PrefixMapFactory.createForInput(), null, true, false);

    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn(
                "Configured to ignore bad tuples, parsing errors will be logged and the bad line skipped but no errors will be thrownConsider setting {} to false to disable this behaviour",
                RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);

    // Figure out what portion of the file to read
    this.maxLineLength = config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    compressionCodecs = new CompressionCodecFactory(config);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d",
            new Object[] { start, split.getLength(), totalLength }));

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        // Compressed input
        // For compressed input NLineInputFormat will have failed to find
        // any line breaks and will give us a split from 0 -> (length - 1)
        // Add 1 and verify we got complete split
        if (totalLength > split.getLength() + 1)
            throw new IOException(
                    "This record reader can only be used with compressed input where the split covers the whole file");
        in = new LineReader(codec.createInputStream(fileIn), config);
        estLength = end;
        end = Long.MAX_VALUE;
    } else {
        // Uncompressed input
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, config);
    }
    // Skip first line and re-establish "start".
    // This is to do with how line reader reads lines and how
    // NLineInputFormat will provide the split information to use
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}