Example usage for org.apache.hadoop.mapreduce.lib.input FileSplit getLength

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.lib.input.FileSplit.getLength().

Prototype

@Override
public long getLength() 

Document

The number of bytes in the file to process.
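For context, getLength() returns the size of the split in bytes, and it is most often combined with getStart() inside RecordReader.initialize() to compute the end offset of the byte range a task should read, as the examples below do. The following is a minimal, self-contained sketch of that pattern; the class name SplitRangeRecordReader and its fields are illustrative and not part of the Hadoop API.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SplitRangeRecordReader extends RecordReader<LongWritable, Text> {
    private long start; // first byte of this split
    private long end;   // first byte past this split
    private long pos;   // current read position
    private FSDataInputStream in;

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        FileSplit split = (FileSplit) genericSplit;
        Configuration conf = context.getConfiguration();

        // getLength() is the number of bytes this task should process,
        // so the split's end offset is start + length.
        start = split.getStart();
        end = start + split.getLength();

        // Open the file and seek to the start of the split.
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(conf);
        in = fs.open(file);
        in.seek(start);
        pos = start;
    }

    @Override
    public boolean nextKeyValue() {
        return false; // record parsing omitted in this sketch
    }

    @Override
    public LongWritable getCurrentKey() {
        return null;
    }

    @Override
    public Text getCurrentValue() {
        return null;
    }

    @Override
    public float getProgress() {
        // getLength() also drives progress reporting: fraction of the split consumed so far.
        return end == start ? 1.0f : Math.min(1.0f, (pos - start) / (float) (end - start));
    }

    @Override
    public void close() throws IOException {
        if (in != null) {
            in.close();
        }
    }
}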

Usage

From source file:gov.jgi.meta.hadoop.input.FastaRecordReader.java

License:Open Source License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new FastaLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = false; // don't do this!
            //--start;                      or this
            fileIn.seek(start);
        }
        in = new FastaLineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:gov.jgi.meta.hadoop.input.FastqBlockRecordReader.java

License:Open Source License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new FastqBlockLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = false; // don't do this!
            //--start;                      or this
            fileIn.seek(start);
        }
        in = new FastqBlockLineReader(fileIn, job);
    }
    this.pos = start;
}

From source file:gov.jgi.meta.hadoop.input.FastqRecordReader.java

License:Open Source License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new FastqLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = false; // don't do this!
            //--start;                      or this
            fileIn.seek(start);
        }
        in = new FastqLineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:gov.llnl.ontology.text.hbase.XMLRecordReader.java

License:Open Source License

/**
 * Extract the {@link Path} for the file to be processed by this {@link
 * XMLRecordReader}.
 */
public void initialize(InputSplit isplit, TaskAttemptContext context) throws IOException, InterruptedException {
    Configuration config = context.getConfiguration();

    // Get the file stream for the xml file.
    FileSplit split = (FileSplit) isplit;
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(config);
    fsin = (useGzip) ? new GZIPInputStream(fs.open(split.getPath())) : fs.open(split.getPath());
    fsin = new BufferedInputStream(fsin);

    // Setup the limits of the xml file.
    start = split.getStart();
    end = start + split.getLength();
    pos = 0;

    // Get the XML document delimiters for this XML file.
    if (!config.get(DELIMITER_TAG).equals("")) {
        startTag = ("<" + config.get(DELIMITER_TAG)).getBytes();
        endTag = ("</" + config.get(DELIMITER_TAG) + ">").getBytes();
    } else {
        String fileNameBase = file.getName().replace(".xml", "");
        startTag = ("<" + fileNameBase).getBytes();
        endTag = ("</" + fileNameBase).getBytes();
    }
    context.setStatus(file.getName() + " " + pos + " " + end);
}

From source file:hadoop.inputsplit.FastaLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {

    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();

    done = false;

    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();

    file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    currentValue = new ValueWritable();
    value = new Text();
    tmpValue = new Text();
    tmp = new Text();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    String homeHdfs = context.getConfiguration().get("HDFS_HOME_DIR");
    //maxK = HadoopUtil.getMaxkFromPatterns(fs, new Path(homeHdfs+Constant.HDFS_PATTERNS_FILE_HDFS));

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job, recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job, recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;

    setKeySeq(fs, job); //Set currentKey

    nextMyKeyValue(); // Read the first record if it exists.

}

From source file:hadoop.TweetRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();

    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }

    this.pos = start;
}

From source file:InvertedIndex.NLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.job = job;
    this.context = context;
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    this.path = file;
    this.length = split.getLength();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        if (0 == split.getLength() && job.getBoolean("mapred.ignore.badcompress", false)) {
            if (null != context && context instanceof TaskInputOutputContext) {
                ((TaskInputOutputContext) context).getCounter("Input Counter", "Gzip File length is zero")
                        .increment(1);
            }
            if (null != this.path) {
                LOG.warn("Skip 0-length Zip file: " + this.path.toString());
            }
            in = new NLineReader(fileIn, job);
        } else {
            try {
                in = new NLineReader(codec.createInputStream(fileIn), job);
                end = Long.MAX_VALUE;
            } catch (IOException e) {
                if (isIgnoreBadCompress(job, e)) {
                    in = new NLineReader(fileIn, job);
                    end = start;
                    LOG.warn("Skip Bad Compress File: " + this.path.toString());
                    LOG.warn("initialize line read error", e);
                    ((TaskInputOutputContext) context).getCounter("Input Counter", "Skip Bad Zip File")
                            .increment(1);
                    ((TaskInputOutputContext) context).getCounter("Input Counter", "Total Skip Bad Zip Length")
                            .increment(this.length);
                } else {
                    throw e;
                }
            }
        }
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new NLineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:it.crs4.features.BioImgRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    globalPlaneIdx = (int) split.getStart();
    nPlanes = (int) split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    String absPathName = fs.getFileStatus(file).getPath().toString();
    reader = new ImageReader();
    try {
        reader.setId(absPathName);
    } catch (FormatException e) {
        throw new RuntimeException("FormatException: " + e.getMessage());
    }
    planesPerSeries = reader.getImageCount();
    factory = new BioImgFactory(reader, absPathName);
    name = PathTools.stripext(PathTools.basename(absPathName));
    planeCounter = 0;
}

From source file:it.crs4.pydoop.mapreduce.pipes.PydoopAvroRecordReaderBase.java

License:Apache License

@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    if (!(inputSplit instanceof FileSplit)) {
        throw new IllegalArgumentException("Only compatible with FileSplits.");
    }
    FileSplit fileSplit = (FileSplit) inputSplit;
    SeekableInput seekableFileInput = createSeekableInput(context.getConfiguration(), fileSplit.getPath());
    mAvroFileReader = new DataFileReader<GenericRecord>(seekableFileInput,
            new GenericDatumReader<GenericRecord>(mReaderSchema));
    // We will read the first block that begins after the input split
    // start; we will read up to but not including the first block
    // that begins after the input split end.
    mAvroFileReader.sync(fileSplit.getStart());
    mStartPosition = mAvroFileReader.previousSync();
    mEndPosition = fileSplit.getStart() + fileSplit.getLength();
}

From source file:it.prz.jmatrw4spark.JMATFileRecordReader.java

License:Open Source License

public void initialize(InputSplit baseSplit, TaskAttemptContext ctx) throws IOException, InterruptedException {
    Configuration cfg = ctx.getConfiguration();

    FileSplit fileSplit = (FileSplit) baseSplit;
    Path filePath = fileSplit.getPath();

    FileSystem fs = filePath.getFileSystem(cfg);
    FSDataInputStream dis = fs.open(fileSplit.getPath());

    //Initialise the block boundaries.
    lBlockStart = fileSplit.getStart();
    lBlockLength = fileSplit.getLength();
    lBlockEnd = lBlockStart + lBlockLength;
    lBlockCurPos = lBlockStart;

    //Initialise the object to read the *.mat file.
    _matReader = new JMATReader(dis);

    //move the file pointer to the start location.
    _matReader.seek(lBlockStart, new Seeker() {
        @Override
        public boolean seekTo(long lBytePos, InputStream is) throws IOException {
            if (is instanceof FSDataInputStream == false)
                throw new UnsupportedSeekOperation("Unknown input stream " + is.getClass().getName());

            ((FSDataInputStream) is).seek(lBytePos);

            return true;
        }
    });
}