List of usage examples for org.apache.hadoop.mapreduce.lib.input.FileSplit.getStart()
public long getStart()
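Returns the byte offset in the file at which this split begins; together with getLength() it defines the half-open byte range [start, start + length) that a RecordReader is responsible for. Before the real-world examples below, here is a minimal sketch of the canonical pattern (a hypothetical reader, not taken from any of the source files):

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    long start = split.getStart();           // first byte of this split
    long end = start + split.getLength();    // exclusive end of this split
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream in = fs.open(file);
    in.seek(start);                          // position the stream at the split boundary
    // ... read records until the stream position reaches 'end'
}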
From source file:hadoop.TweetRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    this.pos = start;
}
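Because compressed block boundaries rarely coincide with the requested split offsets, the SplittableCompressionCodec branch replaces the raw getStart()/getLength() range with the codec's adjusted values (getAdjustedStart()/getAdjustedEnd()) before any lines are read.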
From source file:InvertedIndex.NLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.job = job;
    this.context = context;
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    this.path = file;
    this.length = split.getLength();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        if (0 == split.getLength() && job.getBoolean("mapred.ignore.badcompress", false)) {
            if (null != context && context instanceof TaskInputOutputContext) {
                ((TaskInputOutputContext) context).getCounter("Input Counter", "Gzip File length is zero")
                        .increment(1);
            }
            if (null != this.path) {
                LOG.warn("Skip 0-length Zip file: " + this.path.toString());
            }
            in = new NLineReader(fileIn, job);
        } else {
            try {
                in = new NLineReader(codec.createInputStream(fileIn), job);
                end = Long.MAX_VALUE;
            } catch (IOException e) {
                if (isIgnoreBadCompress(job, e)) {
                    in = new NLineReader(fileIn, job);
                    end = start;
                    LOG.warn("Skip Bad Compress File: " + this.path.toString());
                    LOG.warn("initialize line read error", e);
                    ((TaskInputOutputContext) context).getCounter("Input Counter", "Skip Bad Zip File")
                            .increment(1);
                    ((TaskInputOutputContext) context).getCounter("Input Counter", "Total Skip Bad Zip Length")
                            .increment(this.length);
                } else {
                    throw e;
                }
            }
        }
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new NLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
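Note the uncompressed branch: when getStart() is non-zero, the reader backs up one byte, seeks there, and discards the first (partial) line, since that line belongs to the reader of the previous split. This is the standard trick for aligning byte-based splits to line boundaries.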
From source file:io.ssc.trackthetrackers.extraction.hadoop.io.ArcRecordReader.java
License:Open Source License
public void initialize(InputSplit insplit, TaskAttemptContext context) throws IOException {
    conf = context.getConfiguration();
    FileSplit split = (FileSplit) insplit;
    if (split.getStart() != 0) {
        String errorMessage = "Invalid ARC file split start " + split.getStart()
                + ": ARC files are not splittable";
        log.error(errorMessage);
        throw new IOException(errorMessage);
    }
    // open the file and seek to the start of the split
    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    fsin = fs.open(file);
    // create a GZIP stream that *does not* automatically read through members
    gzip = new GzipCompressorInputStream(fsin, false);
    fileLength = fs.getFileStatus(file).getLen();
    // First record should be an ARC file header record. Skip it.
    skipRecord();
}
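Here getStart() serves as a sanity check rather than a seek target: a gzipped ARC stream can only be decoded from the beginning of the file, so any split with a non-zero start indicates the input was wrongly treated as splittable, and the reader fails fast.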
From source file:it.crs4.features.BioImgRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    globalPlaneIdx = (int) split.getStart();
    nPlanes = (int) split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    String absPathName = fs.getFileStatus(file).getPath().toString();
    reader = new ImageReader();
    try {
        reader.setId(absPathName);
    } catch (FormatException e) {
        throw new RuntimeException("FormatException: " + e.getMessage());
    }
    planesPerSeries = reader.getImageCount();
    factory = new BioImgFactory(reader, absPathName);
    name = PathTools.stripext(PathTools.basename(absPathName));
    planeCounter = 0;
}
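An atypical use: this reader does not treat getStart() as a byte offset at all. It interprets getStart() as a global image-plane index and getLength() as a plane count, so the matching InputFormat (not shown) must construct its splits in plane units rather than bytes.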
From source file:it.crs4.pydoop.mapreduce.pipes.PydoopAvroRecordReaderBase.java
License:Apache License
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    if (!(inputSplit instanceof FileSplit)) {
        throw new IllegalArgumentException("Only compatible with FileSplits.");
    }
    FileSplit fileSplit = (FileSplit) inputSplit;
    SeekableInput seekableFileInput = createSeekableInput(context.getConfiguration(), fileSplit.getPath());
    mAvroFileReader = new DataFileReader<GenericRecord>(seekableFileInput,
            new GenericDatumReader<GenericRecord>(mReaderSchema));

    // We will read the first block that begins after the input split
    // start; we will read up to but not including the first block
    // that begins after the input split end.
    mAvroFileReader.sync(fileSplit.getStart());
    mStartPosition = mAvroFileReader.previousSync();
    mEndPosition = fileSplit.getStart() + fileSplit.getLength();
}
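Avro container files can only be consumed from sync markers, so instead of seeking to getStart() directly, the reader calls sync(getStart()) to skip to the first block boundary at or after the split start; previousSync() then yields the effective start position, and getStart() + getLength() bounds the last block this reader will begin.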
From source file:it.crs4.seal.common.SamInputFormat.java
License:Open Source License
@Override
public RecordReader<LongWritable, ReadPair> createRecordReader(InputSplit split, TaskAttemptContext context) {
    FileSplit fsplit = (FileSplit) split;
    if (fsplit.getStart() > 0 && !isSplitable(context, fsplit.getPath()))
        throw new RuntimeException("Trying to split non-splittable file " + fsplit.getPath());
    return new SamRecordReader();
}
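The same non-splittable guard as the ARC example, but placed in createRecordReader() rather than in the reader itself: a split with getStart() > 0 over a file the format considers non-splittable can only come from a misconfigured job.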
From source file:it.prz.jmatrw4spark.JMATFileRecordReader.java
License:Open Source License
public void initialize(InputSplit baseSplit, TaskAttemptContext ctx) throws IOException, InterruptedException {
    Configuration cfg = ctx.getConfiguration();
    FileSplit fileSplit = (FileSplit) baseSplit;
    Path filePath = fileSplit.getPath();
    FileSystem fs = filePath.getFileSystem(cfg);
    FSDataInputStream dis = fs.open(fileSplit.getPath());

    // Initialise the block boundaries.
    lBlockStart = fileSplit.getStart();
    lBlockLength = fileSplit.getLength();
    lBlockEnd = lBlockStart + lBlockLength;
    lBlockCurPos = lBlockStart;

    // Initialise the object to read the *.mat file.
    _matReader = new JMATReader(dis);

    // move the file pointer to the start location.
    _matReader.seek(lBlockStart, new Seeker() {
        @Override
        public boolean seekTo(long lBytePos, InputStream is) throws IOException {
            if (is instanceof FSDataInputStream == false)
                throw new UnsupportedSeekOperation("Unknown input stream " + is.getClass().getName());
            ((FSDataInputStream) is).seek(lBytePos);
            return true;
        }
    });
}
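The canonical byte-range pattern again: getStart() and getLength() define the block boundaries, and the custom Seeker simply forwards the seek to the underlying FSDataInputStream before handing it to the *.mat parser.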
From source file:ivory.core.preprocess.PositionalSequenceFileRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    conf = context.getConfiguration();
    Path path = fileSplit.getPath();
    FileSystem fs = path.getFileSystem(conf);
    this.in = new SequenceFile.Reader(fs, path, conf);
    this.end = fileSplit.getStart() + fileSplit.getLength();
    if (fileSplit.getStart() > in.getPosition()) {
        in.sync(fileSplit.getStart()); // sync to start
    }
    this.start = in.getPosition();
    more = start < end;
}
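The SequenceFile analogue of the Avro example above: in.sync(getStart()) advances the reader to the first sync point at or after the split start, the adjusted position becomes the effective start, and reading continues while the position stays below getStart() + getLength().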
From source file:kogiri.common.hadoop.io.reader.fasta.FastaRawReadReader.java
License:Open Source License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);
    this.filename = file.getName();
    this.firstRead = true;

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);

    // get uncompressed length
    if (codec instanceof GzipCodec) {
        this.isCompressed = true;
        FSDataInputStream fileInCheckSize = fs.open(file);
        byte[] len = new byte[4];
        try {
            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            fileInCheckSize.skip(this.end - 4);
            IOUtils.readFully(fileInCheckSize, len, 0, len.length);
            this.uncompressedSize = (len[3] << 24) | (len[2] << 16) | (len[1] << 8) | len[0];
            if (this.uncompressedSize < 0) {
                this.uncompressedSize = this.end;
            }
            LOG.info("uncompressed file size : " + this.uncompressedSize);
        } finally {
            fileInCheckSize.close();
        }
        this.end = Long.MAX_VALUE;
    } else if (codec != null) {
        this.isCompressed = true;
        this.end = Long.MAX_VALUE;
        this.uncompressedSize = Long.MAX_VALUE;
    } else {
        this.isCompressed = false;
    }

    // get inputstream
    FSDataInputStream fileIn = fs.open(file);
    if (codec != null) {
        this.in = new LineReader(codec.createInputStream(fileIn), job);
    } else {
        if (this.start != 0) {
            fileIn.seek(this.start);
        }
        this.in = new LineReader(fileIn, job);
    }

    // skip lines until we meet new read start
    while (this.start < this.end) {
        Text skipText = new Text();
        long newSize = this.in.readLine(skipText, this.maxLineLength,
                Math.max((int) Math.min(Integer.MAX_VALUE, this.end - this.start), this.maxLineLength));
        if (newSize == 0) {
            // EOF
            this.hasNextRead = false;
            this.pos = this.end;
            break;
        }
        if (skipText.getLength() > 0 && skipText.charAt(0) == READ_DELIMITER) {
            this.prevLine = skipText;
            this.prevSize = newSize;
            this.hasNextRead = true;
            this.pos = this.start;
            break;
        }
        this.start += newSize;
        if (this.start >= this.end) {
            // EOF
            this.hasNextRead = false;
            this.pos = this.end;
            break;
        }
    }
    this.key = null;
    this.value = null;
}
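For FASTA input, the byte offset from getStart() usually lands mid-record, so after seeking, the reader discards lines until one begins with READ_DELIMITER (presumably FASTA's '>' header character); only then does it anchor pos and begin emitting reads.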
From source file:kogiri.common.hadoop.io.reader.fasta.FastaReadDescriptionReader.java
License:Open Source License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = this.compressionCodecs.getCodec(file);
    this.filename = file.getName();
    this.firstRead = true;

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);

    // get uncompressed length
    if (codec instanceof GzipCodec) {
        this.isCompressed = true;
        FSDataInputStream fileInCheckSize = fs.open(file);
        byte[] len = new byte[4];
        try {
            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            fileInCheckSize.skip(this.end - 4);
            IOUtils.readFully(fileInCheckSize, len, 0, len.length);
            this.uncompressedSize = (len[3] << 24) | (len[2] << 16) | (len[1] << 8) | len[0];
            if (this.uncompressedSize < 0) {
                this.uncompressedSize = this.end;
            }
            LOG.info("uncompressed file size : " + this.uncompressedSize);
        } finally {
            fileInCheckSize.close();
        }
        this.end = Long.MAX_VALUE;
    } else if (codec != null) {
        this.isCompressed = true;
        this.end = Long.MAX_VALUE;
        this.uncompressedSize = Long.MAX_VALUE;
    } else {
        this.isCompressed = false;
    }

    // get inputstream
    FSDataInputStream fileIn = fs.open(file);
    if (codec != null) {
        this.in = new LineReader(codec.createInputStream(fileIn), job);
    } else {
        if (this.start != 0) {
            fileIn.seek(this.start);
        }
        this.in = new LineReader(fileIn, job);
    }

    // skip lines until we meet new record start
    while (this.start < this.end) {
        Text skipText = new Text();
        long newSize = this.in.readLine(skipText, this.maxLineLength,
                Math.max((int) Math.min(Integer.MAX_VALUE, this.end - this.start), this.maxLineLength));
        if (newSize == 0) {
            // EOF
            this.hasNextRecord = false;
            this.pos = this.end;
            break;
        }
        if (skipText.getLength() > 0 && skipText.charAt(0) == READ_DELIMITER) {
            this.prevLine = skipText;
            this.prevSize = newSize;
            this.hasNextRecord = true;
            this.pos = this.start;
            break;
        }
        this.start += newSize;
        if (this.start >= this.end) {
            // EOF
            this.hasNextRecord = false;
            this.pos = this.end;
            break;
        }
    }
    this.key = null;
    this.value = null;
}
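This reader initializes almost identically to FastaRawReadReader above: getStart() again seeds the forward scan that aligns the split to the next record delimiter before any records are emitted.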