Example usage for org.apache.hadoop.util LineReader LineReader

Introduction

In this page you can find the example usage for org.apache.hadoop.util LineReader LineReader.

Prototype

public LineReader(InputStream in, byte[] recordDelimiterBytes)

Source Link

Document

Create a line reader that reads from the given stream using the default buffer-size, and using a custom delimiter of array of bytes.

Usage

From source file:authordetect.input.SingleBookReader.java

/**
 * @param inputSplit/*from  w  w w . j  a v a2  s  . co m*/
 * @param context    the information about the task
 * @throws java.io.IOException
 * @throws InterruptedException
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {

    FileSplit split = (FileSplit) inputSplit;
    Configuration configuration = context.getConfiguration();

    // get the option from configuration:
    // 0 for group by author, 1 for group by book
    int option = configuration.getInt("GROUP_OPTION", 0);

    Path path = split.getPath();
    filename = path.getName();
    FileSystem fileSystem = path.getFileSystem(configuration);
    FSDataInputStream inputStream = fileSystem.open(path);
    lineReader = new LineReader(inputStream, configuration);

    //initial start point and end point
    start = split.getStart();
    end = start + split.getLength();

    inputStream.seek(start);
    if (start != 0) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }

    start += lineReader.readLine(currentLine);

    prepareToScanBook(option);
}

From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java

License:Apache License

/**
 * Gets the different file splits for the data based on a given number of splits
 * //from  w w w  .  j a  v a 2  s .  c om
 * @param status
 *          file status
 * @param conf
 *          hadoop configuration object
 * @param numberOfSplits
 *          number of splits to split the data in
 * @return list of file splits
 * @throws IOException
 *           thrown if the file does not exist
 */
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numberOfSplits)
        throws IOException {
    List<FileSplit> splits = newArrayList();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    long totalNumberOfLines = getTotalNumberOfLines(conf, fileName);
    int numLinesPerSplit = (int) Math.ceil(1.0 * totalNumberOfLines / numberOfSplits);
    LineReader lr = null;
    FSDataInputStream in = null;
    try {
        in = fileName.getFileSystem(conf).open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                splits.add(createFileSplit(fileName, begin, length));
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(createFileSplit(fileName, begin, length));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
        if (in != null) {
            in.close();
        }
    }
    return splits;
}

From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java

License:Apache License

/**
 * Gets the total number of lines from the file. If Config.NUMBER_OF_LINES_KEY is set, this value is returned.
 * //from   ww  w.j av  a  2  s . co m
 * @param conf
 *          hadoop configuration object
 * @param fileName
 *          name of file to count
 * @return the number of lines in the file
 * @throws IOException
 */
public static long getTotalNumberOfLines(Configuration conf, Path fileName) throws IOException {
    long nrLines = conf.getLong(NUMBER_OF_LINES_KEY, -1);
    if (nrLines != -1) {
        return nrLines;
    }

    try {
        FSDataInputStream in = fileName.getFileSystem(conf).open(fileName);
        LineReader lr = new LineReader(in, conf);
        Text text = new Text();
        nrLines = 0;
        while (lr.readLine(text) > 0) {
            nrLines++;
        }
        in.close();
        return nrLines;
    } catch (IOException e) {
        e.printStackTrace();
    }
    return 0;
}

From source file:bucket_sort.NLineInputFormat.java

License:Apache License

public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit)
        throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit>();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }/* ww  w .  ja  v  a 2  s . c o  m*/
    FileSystem fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
        FSDataInputStream in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                // NLineInputFormat uses LineRecordReader, which always reads
                // (and consumes) at least one character out of its upper split
                // boundary. So to make sure that each mapper gets N lines, we
                // move back the upper split limits of each split 
                // by one character here.
                if (begin == 0) {
                    splits.add(new FileSplit(fileName, begin, length - 1, new String[] {}));
                } else {
                    splits.add(new FileSplit(fileName, begin - 1, length, new String[] {}));
                }
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(new FileSplit(fileName, begin, length, new String[] {}));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
    }
    return splits;
}

From source file:cn.uc.hadoop.mapreduce.lib.input.FileNameLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();/* w w  w  .  ja  v a 2  s.co  m*/
    end = start + split.getLength();
    final Path file = split.getPath();
    //ADD by qiujw key??
    key = new Text(file.getName());

    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }

            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:cn.uc.hadoop.mapreduce.lib.input.FilePathLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();/*from   w w w . j a  v  a 2s .c  o m*/
    end = start + split.getLength();
    final Path file = split.getPath();
    //ADD by qiujw key?
    key = new Text(file.toString());

    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }

            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:co.nubetech.hiho.dedup.DelimitedLineRecordReader.java

License:Apache License

/**
 * /*from w w w .  ja v a 2s  .c  om*/
 * @param delimiter
 * @param column
 * 
 * 
 */

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.delimiter = job.get(DelimitedTextInputFormat.DELIMITER_CONF);
    this.column = job.getInt(DelimitedTextInputFormat.COLUMN_CONF, 0);
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:com.cloudera.bigdata.analysis.dataload.mapreduce.SplitableRecordReader.java

License:Apache License

/**
 * Decide the start of the reader.//  ww w.j av  a  2 s.c  om
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // if (codec instanceof CryptoCodec && job instanceof JobConf)
    // CryptoContextHelper.resetInputCryptoContext((CryptoCodec) codec,
    // (JobConf) job, file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    LOG.info("Read from " + split.getPath().toString());
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));

        // Read another line as previous.

        Text current = new Text();

        int newSize = in.readLine(previous, maxLineLength, maxBytesToConsume(start));

        LOG.info("Skip line " + previous + " for last split.");

        start += newSize;

        // Keep reading until a splitable point is found.
        while (start <= end) {
            newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
            if (canSplit(previous.getBytes(), current.getBytes())) {
                break;
            }
            start += newSize;
            previous.set(current.getBytes());
            LOG.info("Skip line " + previous + " for last split.");
        }

        // If exceed the end, still read one extra line.
        if (start > end) {
            if (isContinue) {
                newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
                if (!canSplit(previous.getBytes(), current.getBytes())) {
                    // Still not splitable. So skip the block.
                    start += newSize;
                    isContinue = false;
                }
            }
        }
        LOG.info("Split between: \n" + previous + "\n" + current);

        // Restart at the last read line.
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }

        this.pos = start;
    } else {
        Text skip = new Text();
        start += in.readLine(skip, maxLineLength, maxBytesToConsume(start));
        // start += in.readLine(skip, 0, maxBytesToConsume(start));
        LOG.info("Skip line " + skip + ". Start at " + start);
    }

    // Restart at the start index.
}

From source file:com.dinglicom.clouder.mapreduce.input.LineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    System.out.println("-------------------length:" + split.getLength() + "\tposition:" + split.getStart());
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();/*  w w  w.  ja v a 2 s.  c  om*/
    end = start + split.getLength();
    final Path file = split.getPath();
    key = new Text(FileToCDRType.getTypeByPath(file.getName()));
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }

            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:com.ds.lzo.DeprecatedLzoLineRecordReaderForCombined.java

License:Open Source License

public DeprecatedLzoLineRecordReaderForCombined(Configuration conf, FileSplit split) throws IOException {
    LOG.warn("split start: " + split.getStart());
    LOG.warn("split length: " + split.getLength());
    String[] locs = split.getLocations();
    for (String loc : locs) {
        LOG.warn("location: " + loc);
    }/*from   w ww .ja v  a  2s . com*/
    start = split.getStart();
    end = start + split.getLength();
    LOG.warn("split end: " + end);
    final Path file = split.getPath();
    LOG.warn("file: " + file.getName());
    LOG.warn("INT split start: " + (int) split.getStart());
    LOG.warn("INT split length: " + (int) split.getLength());
    LOG.warn("INT split end: " + (int) end);

    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);
    LOG.warn("codec: " + codec.toString());
    LOG.warn("config: " + conf.toString());
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }

    // Open the file and seek to the next split.
    fileIn = fs.open(file);
    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);
        LOG.warn("fileIn position: " + fileIn.getPos());
        LOG.warn("buffer size: " + conf.get("io.file.buffer.size"));

        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    pos = start;
}