List of usage examples for org.apache.hadoop.util.LineReader#readLine
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException
From source file:com.knewton.mrtool.io.JsonRecordReader.java
License:Apache License
/** * Get the line reader to be used for the file. A <code>LineReader</code> can read a file line * by line. This separate method helps with testing too. * /*from w ww. j a va 2 s. c o m*/ * @param fileSplit * @param conf * @return * @throws IOException */ protected LineReader initLineReader(FileSplit fileSplit, Configuration conf) throws IOException { final Path file = fileSplit.getPath(); final CompressionCodec codec = compressionCodecs.getCodec(file); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(fileSplit.getPath()); seekableIn = fileIn; boolean skipFirstLine = false; LineReader lineReader; if (codec != null) { lineReader = new LineReader(codec.createInputStream(fileIn), conf); } else { // if the start is not the beginning of the file then skip the first line to get the // next complete json record. The previous json record will be read by the record reader // that got assigned the previous InputSplit. if (start != 0) { skipFirstLine = true; --start; fileIn.seek(start); } lineReader = new LineReader(fileIn, conf); } if (skipFirstLine) { start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } return lineReader; }
From source file:libra.common.hadoop.io.reader.fasta.FastaKmerReader.java
License:Apache License
@Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) genericSplit; Configuration conf = context.getConfiguration(); this.kmersize = FastaKmerInputFormat.getKmerSize(conf); this.maxLineLength = conf.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); this.start = split.getStart(); this.end = this.start + split.getLength(); final Path file = split.getPath(); this.compressionCodecs = new CompressionCodecFactory(conf); final CompressionCodec codec = this.compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(conf); // get uncompressed length if (codec instanceof GzipCodec) { this.isCompressed = true; FSDataInputStream fileInCheckSize = fs.open(file); byte[] len = new byte[4]; try {/*from w w w . j a v a 2 s . c o m*/ LOG.info("compressed input : " + file.getName()); LOG.info("compressed file size : " + this.end); fileInCheckSize.skip(this.end - 4); IOUtils.readFully(fileInCheckSize, len, 0, len.length); this.uncompressedSize = (len[3] << 24) | (len[2] << 16) | (len[1] << 8) | len[0]; if (this.uncompressedSize < 0) { this.uncompressedSize = this.end; } LOG.info("uncompressed file size : " + this.uncompressedSize); } finally { fileInCheckSize.close(); } this.end = Long.MAX_VALUE; } else if (codec != null) { this.isCompressed = true; this.end = Long.MAX_VALUE; this.uncompressedSize = Long.MAX_VALUE; } else { this.isCompressed = false; } // get inputstream FSDataInputStream fileIn = fs.open(file); boolean inTheMiddle = false; if (codec != null) { this.in = new LineReader(codec.createInputStream(fileIn), conf); } else { if (this.start != 0) { this.start--; fileIn.seek(this.start); inTheMiddle = true; } this.in = new LineReader(fileIn, conf); } this.buffer = new Text(); if (inTheMiddle) { // find new start line this.start += this.in.readLine(new Text(), 0, (int) Math.min((long) 
Integer.MAX_VALUE, this.end - this.start)); // back off FSDataInputStream fileIn2 = fs.open(file); fileIn2.seek(this.start - 1000); LineReader in2 = new LineReader(fileIn2, conf); Text tempLine = new Text(); long curpos = this.start - 1000; while (curpos < this.start) { curpos += in2.readLine(tempLine, 0, (int) (this.start - curpos)); } if (tempLine.charAt(0) == READ_DELIMITER) { // clean start this.buffer.clear(); } else { // leave k-1 seq in the buffer String seq = tempLine.toString().trim(); String left = seq.substring(seq.length() - this.kmersize + 1); this.buffer.set(left); } in2.close(); } this.pos = this.start; this.key = null; this.value = null; }
From source file:nl.basjes.hadoop.io.compress.TestSplittableCodecSeams.java
License:Apache License
/** * Mostly copied from LineRecordReader (MapReduce) to pull an example of * actual usage into this test./* w w w .j av a 2 s. c om*/ */ public boolean nextKeyValue(final SplitCompressionInputStream in, final LineReader lr, final long end, Text value) throws IOException { final int maxLineLength = Integer.MAX_VALUE; if (value == null) { value = new Text(); } int newSize = 0; // We always read one extra line, which lies outside the upper // split limit i.e. (end - 1) while (in.getPos() <= end) { newSize = lr.readLine(value, maxLineLength, maxLineLength); if (newSize == 0) { break; } if (newSize < maxLineLength) { break; } // line too long. try again LOG.info("Skipped line of size " + newSize + " at pos " + (in.getPos() - newSize)); } if (newSize == 0) { value = null; return false; } else { return true; } }