List of usage examples for the getLength() method of org.apache.hadoop.mapreduce.lib.input.FileSplit
@Override public long getLength()
From source file:gov.jgi.meta.hadoop.input.FastaRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart();/*from w w w. j a v a 2s. c om*/ end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; if (codec != null) { in = new FastaLineReader(codec.createInputStream(fileIn), job); end = Long.MAX_VALUE; } else { if (start != 0) { skipFirstLine = false; // don't do this! //--start; or this fileIn.seek(start); } in = new FastaLineReader(fileIn, job); } if (skipFirstLine) { // skip first line and re-establish "start". start += in.readLine(new Text(), new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:gov.jgi.meta.hadoop.input.FastqBlockRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart();/*from w w w. j av a 2 s.c o m*/ end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; if (codec != null) { in = new FastqBlockLineReader(codec.createInputStream(fileIn), job); end = Long.MAX_VALUE; } else { if (start != 0) { skipFirstLine = false; // don't do this! //--start; or this fileIn.seek(start); } in = new FastqBlockLineReader(fileIn, job); } this.pos = start; }
From source file:gov.jgi.meta.hadoop.input.FastqRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart();//w w w . jav a 2 s . c o m end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; if (codec != null) { in = new FastqLineReader(codec.createInputStream(fileIn), job); end = Long.MAX_VALUE; } else { if (start != 0) { skipFirstLine = false; // don't do this! //--start; or this fileIn.seek(start); } in = new FastqLineReader(fileIn, job); } if (skipFirstLine) { // skip first line and re-establish "start". start += in.readLine(new Text(), new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:gov.llnl.ontology.text.hbase.XMLRecordReader.java
License:Open Source License
/** * Extract the {@link Path} for the file to be processed by this {@link * XMLRecordReader}./* w w w . ja v a2s . co m*/ */ public void initialize(InputSplit isplit, TaskAttemptContext context) throws IOException, InterruptedException { Configuration config = context.getConfiguration(); // Get the file stream for the xml file. FileSplit split = (FileSplit) isplit; Path file = split.getPath(); FileSystem fs = file.getFileSystem(config); fsin = (useGzip) ? new GZIPInputStream(fs.open(split.getPath())) : fs.open(split.getPath()); fsin = new BufferedInputStream(fsin); // Setup the limits of the xml file. start = split.getStart(); end = start + split.getLength(); pos = 0; // Get the xml document delmiters for this xml file. if (!config.get(DELIMITER_TAG).equals("")) { startTag = ("<" + config.get(DELIMITER_TAG)).getBytes(); endTag = ("</" + config.get(DELIMITER_TAG) + ">").getBytes(); } else { String fileNameBase = file.getName().replace(".xml", ""); startTag = ("<" + fileNameBase).getBytes(); endTag = ("</" + fileNameBase).getBytes(); } context.setStatus(file.getName() + " " + pos + " " + end); }
From source file:hadoop.inputsplit.FastaLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); done = false;//w w w .j a v a 2 s .c om this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart(); end = start + split.getLength(); file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); codec = compressionCodecs.getCodec(file); currentValue = new ValueWritable(); value = new Text(); tmpValue = new Text(); tmp = new Text(); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); String homeHdfs = context.getConfiguration().get("HDFS_HOME_DIR"); //maxK = HadoopUtil.getMaxkFromPatterns(fs, new Path(homeHdfs+Constant.HDFS_PATTERNS_FILE_HDFS)); if (isCompressedInput()) { decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); in = new LineReader(cIn, job, recordDelimiterBytes); start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; } else { in = new LineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiterBytes); filePosition = fileIn; } } else { fileIn.seek(start); in = new LineReader(fileIn, job, recordDelimiterBytes); filePosition = fileIn; } // If this is not the first split, we always throw away first record // because we always (except the last split) read one extra line in // next() method. if (start != 0) { start += in.readLine(new Text(), 0, maxBytesToConsume(start)); } this.pos = start; setKeySeq(fs, job); //Set currentKey nextMyKeyValue(); //Leggo il primo record se esiste. }
From source file:hadoop.TweetRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); start = split.getStart();//w w w . j a va 2 s.c o m end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); if (isCompressedInput()) { decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); in = new LineReader(cIn, job); start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; } else { in = new LineReader(codec.createInputStream(fileIn, decompressor), job); filePosition = fileIn; } } else { fileIn.seek(start); in = new LineReader(fileIn, job); filePosition = fileIn; } this.pos = start; }
From source file:InvertedIndex.NLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.job = job; this.context = context; this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart();//from w ww.j a v a 2 s .c o m end = start + split.getLength(); final Path file = split.getPath(); this.path = file; this.length = split.getLength(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; if (codec != null) { if (0 == split.getLength() && job.getBoolean("mapred.ignore.badcompress", false)) { if (null != context && context instanceof TaskInputOutputContext) { ((TaskInputOutputContext) context).getCounter("Input Counter", "Gzip File length is zero") .increment(1); } if (null != this.path) { LOG.warn("Skip 0-length Zip file: " + this.path.toString()); } in = new NLineReader(fileIn, job); } else { try { in = new NLineReader(codec.createInputStream(fileIn), job); end = Long.MAX_VALUE; } catch (IOException e) { if (isIgnoreBadCompress(job, e)) { in = new NLineReader(fileIn, job); end = start; LOG.warn("Skip Bad Compress File: " + this.path.toString()); LOG.warn("initialize line read error", e); ((TaskInputOutputContext) context).getCounter("Input Counter", "Skip Bad Zip File") .increment(1); ((TaskInputOutputContext) context).getCounter("Input Counter", "Total Skip Bad Zip Length") .increment(this.length); } else { throw e; } } } } else { if (start != 0) { skipFirstLine = true; --start; fileIn.seek(start); } in = new NLineReader(fileIn, job); } if (skipFirstLine) { // skip first line and re-establish "start". 
start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:it.crs4.features.BioImgRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; globalPlaneIdx = (int) split.getStart(); nPlanes = (int) split.getLength(); Path file = split.getPath();//from w w w . j a v a2 s. c o m FileSystem fs = file.getFileSystem(context.getConfiguration()); String absPathName = fs.getFileStatus(file).getPath().toString(); reader = new ImageReader(); try { reader.setId(absPathName); } catch (FormatException e) { throw new RuntimeException("FormatException: " + e.getMessage()); } planesPerSeries = reader.getImageCount(); factory = new BioImgFactory(reader, absPathName); name = PathTools.stripext(PathTools.basename(absPathName)); planeCounter = 0; }
From source file:it.crs4.pydoop.mapreduce.pipes.PydoopAvroRecordReaderBase.java
License:Apache License
@Override public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException { if (!(inputSplit instanceof FileSplit)) { throw new IllegalArgumentException("Only compatible with FileSplits."); }/*from w w w .j a v a2s . c o m*/ FileSplit fileSplit = (FileSplit) inputSplit; SeekableInput seekableFileInput = createSeekableInput(context.getConfiguration(), fileSplit.getPath()); mAvroFileReader = new DataFileReader<GenericRecord>(seekableFileInput, new GenericDatumReader<GenericRecord>(mReaderSchema)); // We will read the first block that begins after the input split // start; we will read up to but not including the first block // that begins after the input split end. mAvroFileReader.sync(fileSplit.getStart()); mStartPosition = mAvroFileReader.previousSync(); mEndPosition = fileSplit.getStart() + fileSplit.getLength(); }
From source file:it.prz.jmatrw4spark.JMATFileRecordReader.java
License:Open Source License
public void initialize(InputSplit baseSplit, TaskAttemptContext ctx) throws IOException, InterruptedException { Configuration cfg = ctx.getConfiguration(); FileSplit fileSplit = (FileSplit) baseSplit; Path filePath = fileSplit.getPath(); FileSystem fs = filePath.getFileSystem(cfg); FSDataInputStream dis = fs.open(fileSplit.getPath()); //Initialise the block boundaries. lBlockStart = fileSplit.getStart();//from w w w . j ava 2s. co m lBlockLength = fileSplit.getLength(); lBlockEnd = lBlockStart + lBlockLength; lBlockCurPos = lBlockStart; //Initialise the object to read the *.mat file. _matReader = new JMATReader(dis); //move the file pointer to the start location. _matReader.seek(lBlockStart, new Seeker() { @Override public boolean seekTo(long lBytePos, InputStream is) throws IOException { if (is instanceof FSDataInputStream == false) throw new UnsupportedSeekOperation("Unknown input stream " + is.getClass().getName()); ((FSDataInputStream) is).seek(lBytePos); return true; } }); }