List of usage examples for org.apache.hadoop.mapreduce.lib.input.FileSplit.getStart()
public long getStart()
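Returns the byte offset in the file at which this split begins; together with getLength() it defines the half-open byte range [start, start + length) that a RecordReader is responsible for. Before the real-world examples below, here is a minimal sketch of the canonical pattern (a hypothetical reader, not taken from any of the source files):

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    long start = split.getStart();           // first byte of this split
    long end = start + split.getLength();    // exclusive end of this split
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream in = fs.open(file);
    in.seek(start);                          // position the stream at the split boundary
    // ... read records until the stream position reaches 'end'
}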
From source file:hadoop.TweetRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    this.pos = start;
}
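Because compressed block boundaries rarely coincide with the requested split offsets, the SplittableCompressionCodec branch replaces the raw getStart()/getLength() range with the codec's adjusted values (getAdjustedStart()/getAdjustedEnd()) before any lines are read.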
From source file:InvertedIndex.NLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.job = job;
    this.context = context;
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    this.path = file;
    this.length = split.getLength();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        if (0 == split.getLength() && job.getBoolean("mapred.ignore.badcompress", false)) {
            if (null != context && context instanceof TaskInputOutputContext) {
                ((TaskInputOutputContext) context).getCounter("Input Counter", "Gzip File length is zero")
                        .increment(1);
            }
            if (null != this.path) {
                LOG.warn("Skip 0-length Zip file: " + this.path.toString());
            }
            in = new NLineReader(fileIn, job);
        } else {
            try {
                in = new NLineReader(codec.createInputStream(fileIn), job);
                end = Long.MAX_VALUE;
            } catch (IOException e) {
                if (isIgnoreBadCompress(job, e)) {
                    in = new NLineReader(fileIn, job);
                    end = start;
                    LOG.warn("Skip Bad Compress File: " + this.path.toString());
                    LOG.warn("initialize line read error", e);
                    ((TaskInputOutputContext) context).getCounter("Input Counter", "Skip Bad Zip File")
                            .increment(1);
                    ((TaskInputOutputContext) context).getCounter("Input Counter", "Total Skip Bad Zip Length")
                            .increment(this.length);
                } else {
                    throw e;
                }
            }
        }
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new NLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
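Note the uncompressed branch: when getStart() is non-zero, the reader backs up one byte, seeks there, and discards the first (partial) line, since that line belongs to the reader of the previous split. This is the standard trick for aligning byte-based splits to line boundaries.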
From source file:io.ssc.trackthetrackers.extraction.hadoop.io.ArcRecordReader.java
License:Open Source License
public void initialize(InputSplit insplit, TaskAttemptContext context) throws IOException {
    conf = context.getConfiguration();
    FileSplit split = (FileSplit) insplit;
    if (split.getStart() != 0) {
        String errorMessage = "Invalid ARC file split start " + split.getStart()
                + ": ARC files are not splittable";
        log.error(errorMessage);
        throw new IOException(errorMessage);
    }
    // open the file and seek to the start of the split
    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    fsin = fs.open(file);
    // create a GZIP stream that *does not* automatically read through members
    gzip = new GzipCompressorInputStream(fsin, false);
    fileLength = fs.getFileStatus(file).getLen();
    // First record should be an ARC file header record. Skip it.
    skipRecord();
}
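Here getStart() serves as a sanity check rather than a seek target: a gzipped ARC stream can only be decoded from the beginning of the file, so any split with a non-zero start indicates the input was wrongly treated as splittable, and the reader fails fast.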
From source file:it.crs4.features.BioImgRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    globalPlaneIdx = (int) split.getStart();
    nPlanes = (int) split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    String absPathName = fs.getFileStatus(file).getPath().toString();
    reader = new ImageReader();
    try {
        reader.setId(absPathName);
    } catch (FormatException e) {
        throw new RuntimeException("FormatException: " + e.getMessage());
    }
    planesPerSeries = reader.getImageCount();
    factory = new BioImgFactory(reader, absPathName);
    name = PathTools.stripext(PathTools.basename(absPathName));
    planeCounter = 0;
}
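An atypical use: this reader does not treat getStart() as a byte offset at all. It interprets getStart() as a global image-plane index and getLength() as a plane count, so the matching InputFormat (not shown) must construct its splits in plane units rather than bytes.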
From source file:it.crs4.pydoop.mapreduce.pipes.PydoopAvroRecordReaderBase.java
License:Apache License
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    if (!(inputSplit instanceof FileSplit)) {
        throw new IllegalArgumentException("Only compatible with FileSplits.");
    }
    FileSplit fileSplit = (FileSplit) inputSplit;
    SeekableInput seekableFileInput = createSeekableInput(context.getConfiguration(), fileSplit.getPath());
    mAvroFileReader = new DataFileReader<GenericRecord>(seekableFileInput,
            new GenericDatumReader<GenericRecord>(mReaderSchema));

    // We will read the first block that begins after the input split
    // start; we will read up to but not including the first block
    // that begins after the input split end.
    mAvroFileReader.sync(fileSplit.getStart());
    mStartPosition = mAvroFileReader.previousSync();
    mEndPosition = fileSplit.getStart() + fileSplit.getLength();
}
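Avro container files can only be consumed from sync markers, so instead of seeking to getStart() directly, the reader calls sync(getStart()) to skip to the first block boundary at or after the split start; previousSync() then yields the effective start position, and getStart() + getLength() bounds the last block this reader will begin.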
From source file:it.crs4.seal.common.SamInputFormat.java
License:Open Source License
@Override
public RecordReader<LongWritable, ReadPair> createRecordReader(InputSplit split, TaskAttemptContext context) {
    FileSplit fsplit = (FileSplit) split;
    if (fsplit.getStart() > 0 && !isSplitable(context, fsplit.getPath()))
        throw new RuntimeException("Trying to split non-splittable file " + fsplit.getPath());
    return new SamRecordReader();
}
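The same non-splittable guard as the ARC example, but placed in createRecordReader() rather than in the reader itself: a split with getStart() > 0 over a file the format considers non-splittable can only come from a misconfigured job.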
From source file:it.prz.jmatrw4spark.JMATFileRecordReader.java
License:Open Source License
public void initialize(InputSplit baseSplit, TaskAttemptContext ctx) throws IOException, InterruptedException {
    Configuration cfg = ctx.getConfiguration();
    FileSplit fileSplit = (FileSplit) baseSplit;
    Path filePath = fileSplit.getPath();
    FileSystem fs = filePath.getFileSystem(cfg);
    FSDataInputStream dis = fs.open(fileSplit.getPath());

    // Initialise the block boundaries.
    lBlockStart = fileSplit.getStart();
    lBlockLength = fileSplit.getLength();
    lBlockEnd = lBlockStart + lBlockLength;
    lBlockCurPos = lBlockStart;

    // Initialise the object to read the *.mat file.
    _matReader = new JMATReader(dis);

    // move the file pointer to the start location.
    _matReader.seek(lBlockStart, new Seeker() {
        @Override
        public boolean seekTo(long lBytePos, InputStream is) throws IOException {
            if (is instanceof FSDataInputStream == false)
                throw new UnsupportedSeekOperation("Unknown input stream " + is.getClass().getName());
            ((FSDataInputStream) is).seek(lBytePos);
            return true;
        }
    });
}
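The canonical byte-range pattern again: getStart() and getLength() define the block boundaries, and the custom Seeker simply forwards the seek to the underlying FSDataInputStream before handing it to the *.mat parser.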
From source file:ivory.core.preprocess.PositionalSequenceFileRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    conf = context.getConfiguration();
    Path path = fileSplit.getPath();
    FileSystem fs = path.getFileSystem(conf);
    this.in = new SequenceFile.Reader(fs, path, conf);
    this.end = fileSplit.getStart() + fileSplit.getLength();
    if (fileSplit.getStart() > in.getPosition()) {
        in.sync(fileSplit.getStart()); // sync to start
    }
    this.start = in.getPosition();
    more = start < end;
}
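The SequenceFile analogue of the Avro example above: in.sync(getStart()) advances the reader to the first sync point at or after the split start, the adjusted position becomes the effective start, and reading continues while the position stays below getStart() + getLength().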
From source file:kogiri.common.hadoop.io.reader.fasta.FastaRawReadReader.java
License:Open Source License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);
    this.filename = file.getName();
    this.firstRead = true;

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);

    // get uncompressed length
    if (codec instanceof GzipCodec) {
        this.isCompressed = true;
        FSDataInputStream fileInCheckSize = fs.open(file);
        byte[] len = new byte[4];
        try {
            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            fileInCheckSize.skip(this.end - 4);
            IOUtils.readFully(fileInCheckSize, len, 0, len.length);
            this.uncompressedSize = (len[3] << 24) | (len[2] << 16) | (len[1] << 8) | len[0];
            if (this.uncompressedSize < 0) {
                this.uncompressedSize = this.end;
            }
            LOG.info("uncompressed file size : " + this.uncompressedSize);
        } finally {
            fileInCheckSize.close();
        }
        this.end = Long.MAX_VALUE;
    } else if (codec != null) {
        this.isCompressed = true;
        this.end = Long.MAX_VALUE;
        this.uncompressedSize = Long.MAX_VALUE;
    } else {
        this.isCompressed = false;
    }

    // get inputstream
    FSDataInputStream fileIn = fs.open(file);
    if (codec != null) {
        this.in = new LineReader(codec.createInputStream(fileIn), job);
    } else {
        if (this.start != 0) {
            fileIn.seek(this.start);
        }
        this.in = new LineReader(fileIn, job);
    }

    // skip lines until we meet new read start
    while (this.start < this.end) {
        Text skipText = new Text();
        long newSize = this.in.readLine(skipText, this.maxLineLength,
                Math.max((int) Math.min(Integer.MAX_VALUE, this.end - this.start), this.maxLineLength));
        if (newSize == 0) {
            // EOF
            this.hasNextRead = false;
            this.pos = this.end;
            break;
        }
        if (skipText.getLength() > 0 && skipText.charAt(0) == READ_DELIMITER) {
            this.prevLine = skipText;
            this.prevSize = newSize;
            this.hasNextRead = true;
            this.pos = this.start;
            break;
        }
        this.start += newSize;
        if (this.start >= this.end) {
            // EOF
            this.hasNextRead = false;
            this.pos = this.end;
            break;
        }
    }
    this.key = null;
    this.value = null;
}
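For FASTA input, the byte offset from getStart() usually lands mid-record, so after seeking, the reader discards lines until one begins with READ_DELIMITER (presumably FASTA's '>' header character); only then does it anchor pos and begin emitting reads.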
From source file:kogiri.common.hadoop.io.reader.fasta.FastaReadDescriptionReader.java
License:Open Source License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);
    this.filename = file.getName();
    this.firstRead = true;

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);

    // get uncompressed length
    if (codec instanceof GzipCodec) {
        this.isCompressed = true;
        FSDataInputStream fileInCheckSize = fs.open(file);
        byte[] len = new byte[4];
        try {
            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            fileInCheckSize.skip(this.end - 4);
            IOUtils.readFully(fileInCheckSize, len, 0, len.length);
            this.uncompressedSize = (len[3] << 24) | (len[2] << 16) | (len[1] << 8) | len[0];
            if (this.uncompressedSize < 0) {
                this.uncompressedSize = this.end;
            }
            LOG.info("uncompressed file size : " + this.uncompressedSize);
        } finally {
            fileInCheckSize.close();
        }
        this.end = Long.MAX_VALUE;
    } else if (codec != null) {
        this.isCompressed = true;
        this.end = Long.MAX_VALUE;
        this.uncompressedSize = Long.MAX_VALUE;
    } else {
        this.isCompressed = false;
    }

    // get inputstream
    FSDataInputStream fileIn = fs.open(file);
    if (codec != null) {
        this.in = new LineReader(codec.createInputStream(fileIn), job);
    } else {
        if (this.start != 0) {
            fileIn.seek(this.start);
        }
        this.in = new LineReader(fileIn, job);
    }

    // skip lines until we meet new record start
    while (this.start < this.end) {
        Text skipText = new Text();
        long newSize = this.in.readLine(skipText, this.maxLineLength,
                Math.max((int) Math.min(Integer.MAX_VALUE, this.end - this.start), this.maxLineLength));
        if (newSize == 0) {
            // EOF
            this.hasNextRecord = false;
            this.pos = this.end;
            break;
        }
        if (skipText.getLength() > 0 && skipText.charAt(0) == READ_DELIMITER) {
            this.prevLine = skipText;
            this.prevSize = newSize;
            this.hasNextRecord = true;
            this.pos = this.start;
            break;
        }
        this.start += newSize;
        if (this.start >= this.end) {
            // EOF
            this.hasNextRecord = false;
            this.pos = this.end;
            break;
        }
    }
    this.key = null;
    this.value = null;
}
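This reader initializes almost identically to FastaRawReadReader above: getStart() again seeds the forward scan that aligns the split to the next record delimiter before any records are emitted.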