Example usage for org.apache.hadoop.util LineReader LineReader

List of usage examples for org.apache.hadoop.util LineReader LineReader

Introduction

On this page you can find example usages of the org.apache.hadoop.util.LineReader constructor shown below.

Prototype

public LineReader(InputStream in, Configuration conf, byte[] recordDelimiterBytes) throws IOException 

Source Link

Document

Creates a line reader that reads from the given stream, using the io.file.buffer.size specified in the given Configuration and a custom record delimiter supplied as a byte array.

Usage

From source file:hadoop.inputsplit.FastaLineRecordReader.java

License:Apache License

/**
 * Prepares this reader for the given split.
 *
 * <p>Reads the maximum line length from {@code mapred.linerecordreader.maxlength}
 * (defaulting to {@link Integer#MAX_VALUE}), derives the split's byte range,
 * looks up a compression codec for the file, and opens a {@link LineReader}
 * that uses {@code recordDelimiterBytes} as the record delimiter. When the
 * split does not start at offset 0, the first (partial) record is discarded.
 * Finally the current key is set via {@code setKeySeq} and
 * {@code nextMyKeyValue()} is invoked once.
 *
 * @param genericSplit the split to read; it is cast to {@link FileSplit}
 * @param context      task context from which the job {@link Configuration} is taken
 * @throws IOException if opening the file, creating the codec stream, or
 *                     skipping the first record fails
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {

    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();

    done = false;

    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();

    file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    currentValue = new ValueWritable();
    value = new Text();
    tmpValue = new Text();
    tmp = new Text();

    // Open the file; positioning at the split start happens per branch below.
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            // Splittable codec: the codec stream may move the split boundaries,
            // so adopt its adjusted start/end and track position through it.
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job, recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            // Non-splittable codec: read through the decompressing stream,
            // but track position on the raw file stream.
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        // Uncompressed input: seek directly to the start of the split.
        fileIn.seek(start);
        in = new LineReader(fileIn, job, recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;

    setKeySeq(fs, job); // set currentKey

    nextMyKeyValue(); // read the first record, if there is one

}

From source file:hadoop.inputsplit.FastaLineRecordReader.java

License:Apache License

/**
 * Sets {@code currentKey} for the file being read.
 *
 * <p>When {@code Constant.SPLIT2_DEBUG_MODE} is on, the key is the file name
 * followed by {@code "."} and the split start offset. Otherwise the file is
 * re-opened, its first record (up to {@code maxLineLength} bytes, delimited by
 * {@code recordDelimiterBytes}) is read, and every {@code '>'} character is
 * stripped from it. If anything goes wrong while reading, the failure is logged
 * and the plain file name is used as the key instead.
 *
 * @param fs  file system used to open {@code file}
 * @param job configuration passed to the {@link LineReader}
 */
private void setKeySeq(FileSystem fs, Configuration job) { // set currentKey

    if (Constant.SPLIT2_DEBUG_MODE) {
        currentKey = new Text(file.getName() + "." + start);
        return;
    }

    try {
        Text header = new Text();
        LineReader reader = new LineReader(fs.open(file), job, recordDelimiterBytes);
        try {
            reader.readLine(header, maxLineLength);
        } finally {
            // Always release the stream, even when readLine fails.
            reader.close();
        }
        // Literal replacement; no regex is needed to drop '>' characters.
        header.set(header.toString().replace(">", ""));
        currentKey = header;
    } catch (Exception e) {
        // Best effort: keep the stack trace in the log and fall back to the file name.
        LOG.error(e.getMessage(), e);
        currentKey = new Text(file.getName());
    }

}