Example usage for org.apache.hadoop.util LineReader LineReader

Introduction

On this page you can find example usages of the org.apache.hadoop.util.LineReader constructor LineReader(InputStream, Configuration, byte[]).

Prototype

public LineReader(InputStream in, Configuration conf, byte[] recordDelimiterBytes) throws IOException

Source Link

Document

Creates a line reader that reads from the given stream, using the buffer size specified by io.file.buffer.size in the given Configuration and a custom record delimiter supplied as a byte array.

Usage

From source file:hadoop.inputsplit.FastaLineRecordReader.java

License:Apache License

/**
 * Prepares this reader for one input split.
 *
 * <p>Records the split's byte range, opens the split's file, and wraps the stream in a
 * {@code LineReader} that uses {@code recordDelimiterBytes} as the record delimiter
 * (reading through the codec when {@code isCompressedInput()} reports compressed input).
 * When the resulting start offset is not 0 the first record is discarded. Finally the
 * current key is set and the first record is pre-read.
 *
 * @param genericSplit the split to read; it is cast to {@code FileSplit}
 * @param context      task context that supplies the job {@code Configuration}
 * @throws IOException if opening, positioning or reading the file fails
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {

    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();

    done = false;

    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();

    file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    currentValue = new ValueWritable();
    value = new Text();
    tmpValue = new Text();
    tmp = new Text();

    // Open the file; how the stream is positioned depends on the branch below.
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job, recordDelimiterBytes);
            // The codec stream reports its own boundaries; use those instead of the
            // offsets taken from the split.
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            // No seek is performed here: the decompressed stream is consumed from the
            // beginning, while the position is tracked on the raw file stream.
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job, recordDelimiterBytes);
        filePosition = fileIn;
    }

    // If this is not the first split, always throw away the first record,
    // because every split except the last reads one extra record in the
    // next() method. A max line length of 0 is passed, so the skipped bytes
    // are only counted.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;

    setKeySeq(fs, job); // sets currentKey

    nextMyKeyValue(); // read the first record, if there is one

}

From source file:hadoop.inputsplit.FastaLineRecordReader.java

License:Apache License

/**
 * Sets {@code currentKey} for the file being read.
 *
 * <p>When {@code Constant.SPLIT2_DEBUG_MODE} is on, the key is the file name followed by
 * {@code "."} and the current {@code start} offset. Otherwise the key is the first record
 * read from the beginning of the file (delimited by {@code recordDelimiterBytes}) with
 * every {@code '>'} character removed. If that read fails for any reason, the error is
 * logged and the key falls back to the plain file name.
 *
 * @param fs  file system used to open {@code file}
 * @param job configuration handed to the {@code LineReader}
 */
private void setKeySeq(FileSystem fs, Configuration job) {

    if (Constant.SPLIT2_DEBUG_MODE) {
        currentKey = new Text(file.getName() + "." + start);
        return;
    }

    try {
        LineReader reader = new LineReader(fs.open(file), job, recordDelimiterBytes);
        try {
            currentKey = new Text();
            reader.readLine(currentKey, maxLineLength);
        } finally {
            // Close on the failure path too, so the underlying stream is not leaked
            // when readLine throws.
            reader.close();
        }
        // Literal replacement; no regular expression is needed to strip '>'.
        currentKey.set(currentKey.toString().replace(">", ""));
    } catch (Exception e) {
        // Best-effort: keep the task running with a fallback key, but log the full cause.
        LOG.error("Failed to read key from first record of " + file + ": " + e.getMessage(), e);
        currentKey = new Text(file.getName());
    }

}