List of usage examples for the org.apache.hadoop.util.LineReader constructor
public LineReader(InputStream in, Configuration conf, byte[] recordDelimiterBytes) throws IOException

Creates a line reader that reads from the given stream, using the io.file.buffer.size specified in the given Configuration and a custom record delimiter given as an array of bytes.

From source file: hadoop.inputsplit.FastaLineRecordReader.java
License: Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    done = false;
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    currentValue = new ValueWritable();
    value = new Text();
    tmpValue = new Text();
    tmp = new Text();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    String homeHdfs = context.getConfiguration().get("HDFS_HOME_DIR");
    //maxK = HadoopUtil.getMaxkFromPatterns(fs, new Path(homeHdfs+Constant.HDFS_PATTERNS_FILE_HDFS));

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job, recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job, recordDelimiterBytes);
        filePosition = fileIn;
    }

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;

    setKeySeq(fs, job); // Set currentKey
    nextMyKeyValue(); // Read the first record, if one exists.
}
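For contrast with the full record-reader wiring above, here is a minimal, self-contained sketch of the same constructor used on its own. The input path reads.fa and the ">" delimiter are assumptions chosen for illustration, not values taken from the source file.

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class CustomDelimiterExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();                            // io.file.buffer.size is taken from here
        byte[] recordDelimiterBytes = ">".getBytes(StandardCharsets.UTF_8);  // hypothetical delimiter splitting FASTA-style records
        FileSystem fs = FileSystem.get(conf);
        FSDataInputStream fileIn = fs.open(new Path("reads.fa"));            // hypothetical input path

        LineReader reader = new LineReader(fileIn, conf, recordDelimiterBytes);
        try {
            Text record = new Text();
            // readLine returns the number of bytes consumed; 0 means end of stream
            while (reader.readLine(record) > 0) {
                System.out.println(record);
            }
        } finally {
            reader.close();
        }
    }
}

With a multi-byte or non-newline delimiter like this, each readLine call fills the Text with everything up to the next occurrence of the delimiter, which is the same mechanism the record reader above relies on.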
From source file: hadoop.inputsplit.FastaLineRecordReader.java
License: Apache License
private void setKeySeq(FileSystem fs, Configuration job) { // Set currentKey
    if (Constant.SPLIT2_DEBUG_MODE)
        currentKey = new Text(file.getName() + "." + start);
    else {
        try {
            LineReader reader = new LineReader(fs.open(file), job, recordDelimiterBytes);
            currentKey = new Text();
            reader.readLine(currentKey, maxLineLength);
            reader.close();
            currentKey.set(currentKey.toString().replaceAll(">", ""));
        } catch (Exception e) {
            LOG.error(e.getMessage());
            currentKey = new Text(file.getName());
        }
    }
}
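The setKeySeq helper above builds a short-lived LineReader just to pull the first header line out of the split's file. A standalone sketch of that pattern follows; the reads.fa path and the 64 KB cap are assumptions for illustration, not values from the source.

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class FastaHeaderExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path file = new Path("reads.fa");                          // hypothetical input path
        byte[] delimiter = "\n".getBytes(StandardCharsets.UTF_8);  // plain newline-delimited records here

        LineReader reader = new LineReader(fs.open(file), conf, delimiter);
        try {
            Text header = new Text();
            // Bound the read so a malformed file cannot pull an arbitrarily long line into memory
            reader.readLine(header, 64 * 1024);
            // Mirror the example above: strip the FASTA '>' marker before using the header as a key
            System.out.println(header.toString().replaceAll(">", ""));
        } finally {
            reader.close();
        }
    }
}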