List of usage examples for the org.apache.hadoop.util.LineReader constructor
public LineReader(InputStream in, byte[] recordDelimiterBytes)
From source file:authordetect.input.SingleBookReader.java
/** * @param inputSplit/*from w w w . j a v a2 s . co m*/ * @param context the information about the task * @throws java.io.IOException * @throws InterruptedException */ @Override public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) inputSplit; Configuration configuration = context.getConfiguration(); // get the option from configuration: // 0 for group by author, 1 for group by book int option = configuration.getInt("GROUP_OPTION", 0); Path path = split.getPath(); filename = path.getName(); FileSystem fileSystem = path.getFileSystem(configuration); FSDataInputStream inputStream = fileSystem.open(path); lineReader = new LineReader(inputStream, configuration); //initial start point and end point start = split.getStart(); end = start + split.getLength(); inputStream.seek(start); if (start != 0) { start += lineReader.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start)); } start += lineReader.readLine(currentLine); prepareToScanBook(option); }
From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java
License:Apache License
/**
 * Cuts a file into line-aligned splits so that roughly {@code numberOfSplits}
 * splits are produced.
 *
 * <p>The number of lines per split is the total line count divided by
 * {@code numberOfSplits}, rounded up. The file is then read line by line and
 * a split is emitted every time that many lines have been accumulated; any
 * remaining lines form one last split.
 *
 * @param status
 *          file status
 * @param conf
 *          hadoop configuration object
 * @param numberOfSplits
 *          number of splits to split the data in
 * @return list of file splits
 * @throws IOException
 *           thrown if the status denotes a directory or the file cannot be read
 */
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numberOfSplits)
        throws IOException {
    List<FileSplit> result = newArrayList();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }

    long lineCount = getTotalNumberOfLines(conf, fileName);
    int linesPerSplit = (int) Math.ceil(1.0 * lineCount / numberOfSplits);

    LineReader reader = null;
    FSDataInputStream stream = null;
    try {
        stream = fileName.getFileSystem(conf).open(fileName);
        reader = new LineReader(stream, conf);
        Text buffer = new Text();

        int linesInSplit = 0;
        long splitStart = 0;
        long splitBytes = 0;
        // readLine reports the number of bytes consumed; a value of 0 ends the scan.
        for (int consumed = reader.readLine(buffer); consumed > 0; consumed = reader.readLine(buffer)) {
            linesInSplit++;
            splitBytes += consumed;
            if (linesInSplit != linesPerSplit) {
                continue;
            }
            result.add(createFileSplit(fileName, splitStart, splitBytes));
            splitStart += splitBytes;
            splitBytes = 0;
            linesInSplit = 0;
        }

        // Lines left over after the last full split.
        if (linesInSplit != 0) {
            result.add(createFileSplit(fileName, splitStart, splitBytes));
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
        if (stream != null) {
            stream.close();
        }
    }
    return result;
}
From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java
License:Apache License
/**
 * Gets the total number of lines from the file. If NUMBER_OF_LINES_KEY is set in the configuration, that value is
 * returned without touching the file; otherwise the file is read once and its lines are counted.
 *
 * <p>Read failures are propagated to the caller instead of being reported as a count of 0, because a silent 0
 * cannot be told apart from an empty file.
 *
 * @param conf
 *          hadoop configuration object
 * @param fileName
 *          name of file to count
 * @return the number of lines in the file
 * @throws IOException
 *           if the file cannot be opened or read
 */
public static long getTotalNumberOfLines(Configuration conf, Path fileName) throws IOException {
    long nrLines = conf.getLong(NUMBER_OF_LINES_KEY, -1);
    if (nrLines != -1) {
        return nrLines;
    }

    FSDataInputStream in = fileName.getFileSystem(conf).open(fileName);
    try {
        LineReader lr = new LineReader(in, conf);
        Text text = new Text();
        nrLines = 0;
        // readLine returns the number of bytes consumed, which is 0 only at end of input.
        while (lr.readLine(text) > 0) {
            nrLines++;
        }
        return nrLines;
    } finally {
        // Release the stream on both the normal and the exceptional path.
        in.close();
    }
}
From source file:bucket_sort.NLineInputFormat.java
License:Apache License
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit) throws IOException { List<FileSplit> splits = new ArrayList<FileSplit>(); Path fileName = status.getPath(); if (status.isDir()) { throw new IOException("Not a file: " + fileName); }/* ww w . ja v a 2 s . c o m*/ FileSystem fs = fileName.getFileSystem(conf); LineReader lr = null; try { FSDataInputStream in = fs.open(fileName); lr = new LineReader(in, conf); Text line = new Text(); int numLines = 0; long begin = 0; long length = 0; int num = -1; while ((num = lr.readLine(line)) > 0) { numLines++; length += num; if (numLines == numLinesPerSplit) { // NLineInputFormat uses LineRecordReader, which always reads // (and consumes) at least one character out of its upper split // boundary. So to make sure that each mapper gets N lines, we // move back the upper split limits of each split // by one character here. if (begin == 0) { splits.add(new FileSplit(fileName, begin, length - 1, new String[] {})); } else { splits.add(new FileSplit(fileName, begin - 1, length, new String[] {})); } begin += length; length = 0; numLines = 0; } } if (numLines != 0) { splits.add(new FileSplit(fileName, begin, length, new String[] {})); } } finally { if (lr != null) { lr.close(); } } return splits; }
From source file:cn.uc.hadoop.mapreduce.lib.input.FileNameLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE); start = split.getStart();/* w w w . ja v a 2 s.co m*/ end = start + split.getLength(); final Path file = split.getPath(); //ADD by qiujw key?? key = new Text(file.getName()); compressionCodecs = new CompressionCodecFactory(job); codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split final FileSystem fs = file.getFileSystem(job); fileIn = fs.open(file); if (isCompressedInput()) { decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); if (null == this.recordDelimiterBytes) { in = new LineReader(cIn, job); } else { in = new LineReader(cIn, job, this.recordDelimiterBytes); } start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; } else { if (null == this.recordDelimiterBytes) { in = new LineReader(codec.createInputStream(fileIn, decompressor), job); } else { in = new LineReader(codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes); } filePosition = fileIn; } } else { fileIn.seek(start); if (null == this.recordDelimiterBytes) { in = new LineReader(fileIn, job); } else { in = new LineReader(fileIn, job, this.recordDelimiterBytes); } filePosition = fileIn; } // If this is not the first split, we always throw away first record // because we always (except the last split) read one extra line in // next() method. if (start != 0) { start += in.readLine(new Text(), 0, maxBytesToConsume(start)); } this.pos = start; }
From source file:cn.uc.hadoop.mapreduce.lib.input.FilePathLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE); start = split.getStart();/*from w w w . j a v a 2s .c o m*/ end = start + split.getLength(); final Path file = split.getPath(); //ADD by qiujw key? key = new Text(file.toString()); compressionCodecs = new CompressionCodecFactory(job); codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split final FileSystem fs = file.getFileSystem(job); fileIn = fs.open(file); if (isCompressedInput()) { decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); if (null == this.recordDelimiterBytes) { in = new LineReader(cIn, job); } else { in = new LineReader(cIn, job, this.recordDelimiterBytes); } start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; } else { if (null == this.recordDelimiterBytes) { in = new LineReader(codec.createInputStream(fileIn, decompressor), job); } else { in = new LineReader(codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes); } filePosition = fileIn; } } else { fileIn.seek(start); if (null == this.recordDelimiterBytes) { in = new LineReader(fileIn, job); } else { in = new LineReader(fileIn, job, this.recordDelimiterBytes); } filePosition = fileIn; } // If this is not the first split, we always throw away first record // because we always (except the last split) read one extra line in // next() method. if (start != 0) { start += in.readLine(new Text(), 0, maxBytesToConsume(start)); } this.pos = start; }
From source file:co.nubetech.hiho.dedup.DelimitedLineRecordReader.java
License:Apache License
/** * /*from w w w . ja v a 2s .c om*/ * @param delimiter * @param column * * */ @Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.delimiter = job.get(DelimitedTextInputFormat.DELIMITER_CONF); this.column = job.getInt(DelimitedTextInputFormat.COLUMN_CONF, 0); this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart(); end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; if (codec != null) { in = new LineReader(codec.createInputStream(fileIn), job); end = Long.MAX_VALUE; } else { if (start != 0) { skipFirstLine = true; --start; fileIn.seek(start); } in = new LineReader(fileIn, job); } if (skipFirstLine) { // skip first line and re-establish "start". start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:com.cloudera.bigdata.analysis.dataload.mapreduce.SplitableRecordReader.java
License:Apache License
/** * Decide the start of the reader.// ww w.j av a 2 s.c om */ public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart(); end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); codec = compressionCodecs.getCodec(file); // if (codec instanceof CryptoCodec && job instanceof JobConf) // CryptoContextHelper.resetInputCryptoContext((CryptoCodec) codec, // (JobConf) job, file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); if (isCompressedInput()) { decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); if (null == this.recordDelimiterBytes) { in = new LineReader(cIn, job); } else { in = new LineReader(cIn, job, this.recordDelimiterBytes); } start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; } else { if (null == this.recordDelimiterBytes) { in = new LineReader(codec.createInputStream(fileIn), job); } else { in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes); } filePosition = fileIn; } } else { fileIn.seek(start); if (null == this.recordDelimiterBytes) { in = new LineReader(fileIn, job); } else { in = new LineReader(fileIn, job, this.recordDelimiterBytes); } filePosition = fileIn; } LOG.info("Read from " + split.getPath().toString()); // If this is not the first split, we always throw away first record // because we always (except the last split) read one extra line in // next() method. 
if (start != 0) { start += in.readLine(new Text(), 0, maxBytesToConsume(start)); // Read another line as previous. Text current = new Text(); int newSize = in.readLine(previous, maxLineLength, maxBytesToConsume(start)); LOG.info("Skip line " + previous + " for last split."); start += newSize; // Keep reading until a splitable point is found. while (start <= end) { newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start)); if (canSplit(previous.getBytes(), current.getBytes())) { break; } start += newSize; previous.set(current.getBytes()); LOG.info("Skip line " + previous + " for last split."); } // If exceed the end, still read one extra line. if (start > end) { if (isContinue) { newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start)); if (!canSplit(previous.getBytes(), current.getBytes())) { // Still not splitable. So skip the block. start += newSize; isContinue = false; } } } LOG.info("Split between: \n" + previous + "\n" + current); // Restart at the last read line. fileIn.seek(start); if (null == this.recordDelimiterBytes) { in = new LineReader(fileIn, job); } else { in = new LineReader(fileIn, job, this.recordDelimiterBytes); } this.pos = start; } else { Text skip = new Text(); start += in.readLine(skip, maxLineLength, maxBytesToConsume(start)); // start += in.readLine(skip, 0, maxBytesToConsume(start)); LOG.info("Skip line " + skip + ". Start at " + start); } // Restart at the start index. }
From source file:com.dinglicom.clouder.mapreduce.input.LineRecordReader.java
License:Apache License
/**
 * Prepares this reader for one file split. The split's length and start are printed to standard output, the
 * key field is set to the value {@code FileToCDRType.getTypeByPath} returns for the file name, and the file
 * is opened behind a {@link LineReader} (through a decompressing stream when a codec matches the file).
 *
 * @param genericSplit the split to read; it is cast to {@link FileSplit}
 * @param context task context supplying the job configuration
 * @throws IOException if the file cannot be opened, sought or read
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    final FileSplit fileSplit = (FileSplit) genericSplit;
    System.out.println("-------------------length:" + fileSplit.getLength() + "\tposition:" + fileSplit.getStart());

    final Configuration conf = context.getConfiguration();
    this.maxLineLength = conf.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);

    start = fileSplit.getStart();
    end = start + fileSplit.getLength();

    final Path path = fileSplit.getPath();
    key = new Text(FileToCDRType.getTypeByPath(path.getName()));

    compressionCodecs = new CompressionCodecFactory(conf);
    codec = compressionCodecs.getCodec(path);

    // Open the file; positioning depends on whether it is compressed.
    fileIn = path.getFileSystem(conf).open(path);

    // True when a custom record delimiter was configured.
    final boolean customDelimiter = (this.recordDelimiterBytes != null);

    if (!isCompressedInput()) {
        // Plain file: seek straight to the split start.
        fileIn.seek(start);
        in = customDelimiter
                ? new LineReader(fileIn, conf, this.recordDelimiterBytes)
                : new LineReader(fileIn, conf);
        filePosition = fileIn;
    } else {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            // Splittable codec: the stream reports its own adjusted split boundaries.
            final SplitCompressionInputStream splitStream =
                    ((SplittableCompressionCodec) codec).createInputStream(
                            fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = customDelimiter
                    ? new LineReader(splitStream, conf, this.recordDelimiterBytes)
                    : new LineReader(splitStream, conf);
            start = splitStream.getAdjustedStart();
            end = splitStream.getAdjustedEnd();
            filePosition = splitStream;
        } else {
            // Non-splittable codec: decompress the stream from its current position.
            in = customDelimiter
                    ? new LineReader(codec.createInputStream(fileIn, decompressor), conf,
                            this.recordDelimiterBytes)
                    : new LineReader(codec.createInputStream(fileIn, decompressor), conf);
            filePosition = fileIn;
        }
    }

    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.ds.lzo.DeprecatedLzoLineRecordReaderForCombined.java
License:Open Source License
/**
 * Creates a line reader over one split of a compressed file.
 *
 * <p>The split's coordinates and locations are logged, the codec for the file is looked up, and the file is
 * opened through that codec. When the split does not start at offset 0, the underlying file stream is moved
 * to the split start, one line is read and discarded, and {@code start} is reset to the file stream's
 * position afterwards.
 *
 * @param conf hadoop configuration object
 * @param split the split to read
 * @throws IOException if no codec is registered for the file, or the file cannot be opened or read
 */
public DeprecatedLzoLineRecordReaderForCombined(Configuration conf, FileSplit split) throws IOException {
    LOG.warn("split start: " + split.getStart());
    LOG.warn("split length: " + split.getLength());
    String[] locs = split.getLocations();
    for (String loc : locs) {
        LOG.warn("location: " + loc);
    }

    start = split.getStart();
    end = start + split.getLength();
    LOG.warn("split end: " + end);

    final Path file = split.getPath();
    LOG.warn("file: " + file.getName());
    LOG.warn("INT split start: " + (int) split.getStart());
    LOG.warn("INT split length: " + (int) split.getLength());
    LOG.warn("INT split end: " + (int) end);

    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);

    // Check for a missing codec before dereferencing it, so the caller gets the
    // descriptive IOException rather than a NullPointerException from the log line.
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }
    LOG.warn("codec: " + codec.toString());
    LOG.warn("config: " + conf.toString());

    // Open the file and seek to the next split.
    fileIn = fs.open(file);

    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);
        LOG.warn("fileIn position: " + fileIn.getPos());
        LOG.warn("buffer size: " + conf.get("io.file.buffer.size"));

        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }
    pos = start;
}