Usage examples for org.apache.hadoop.mapreduce.lib.input.FileSplit.getPath()
public Path getPath()
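getPath() returns the path of the file that the current input split covers. Record readers typically cast the generic InputSplit to a FileSplit, call getPath(), and open the returned path through its FileSystem. Before the real-world excerpts below, here is a minimal sketch of that recurring pattern; the method shape mirrors RecordReader.initialize, but the field and variable names are illustrative only and not taken from any of the sources that follow:

// Illustrative sketch of the common pattern shared by the examples below.
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;   // the mapreduce FileSplit, not the older mapred one
    Configuration conf = context.getConfiguration();
    Path file = split.getPath();                  // path of the file backing this split
    FileSystem fs = file.getFileSystem(conf);     // file system that owns the path
    FSDataInputStream in = fs.open(file);         // open the whole file...
    in.seek(split.getStart());                    // ...then seek to this split's byte range
}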
From source file:com.blm.orc.OrcNewInputFormat.java
License:Apache License
@Override
public RecordReader<NullWritable, OrcStruct> createRecordReader(InputSplit inputSplit,
        TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) inputSplit;
    Path path = fileSplit.getPath();
    Configuration conf = ShimLoader.getHadoopShims().getConfiguration(context);
    return new OrcRecordReader(OrcFile.createReader(path, OrcFile.readerOptions(conf)),
            conf, fileSplit.getStart(), fileSplit.getLength());
}
From source file:com.bonc.mr_roamRecognition_hjpt.comm.PathRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    path = split.getPath().toString();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.ci.backports.avro.mapreduce.AvroRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    // Create a DataFileReader that can read records out of the input split.
    SeekableInput fileInput = new FsInput(fileSplit.getPath(), context.getConfiguration());
    // We have to get the schema to read the records, which we assume was set in
    // the job's configuration.
    Schema schema = AvroJob.getInputSchema(context.getConfiguration());
    reader = new DataFileReader<T>(fileInput, new SpecificDatumReader<T>(schema));
    // Initialize the start and end offsets into the input file.
    // The reader syncs to the first record after the start of the input split.
    reader.sync(fileSplit.getStart());
    start = reader.previousSync();
    end = fileSplit.getStart() + fileSplit.getLength();
    currentRecord = new AvroKey<T>(null);
}
From source file:com.cloudera.bigdata.analysis.dataload.mapreduce.SplitableRecordReader.java
License:Apache License
/**
 * Decide the start of the reader.
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    // if (codec instanceof CryptoCodec && job instanceof JobConf)
    //   CryptoContextHelper.resetInputCryptoContext((CryptoCodec) codec,
    //       (JobConf) job, file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    LOG.info("Read from " + split.getPath().toString());

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));

        // Read another line as previous.
        Text current = new Text();
        int newSize = in.readLine(previous, maxLineLength, maxBytesToConsume(start));
        LOG.info("Skip line " + previous + " for last split.");
        start += newSize;

        // Keep reading until a splittable point is found.
        while (start <= end) {
            newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
            if (canSplit(previous.getBytes(), current.getBytes())) {
                break;
            }
            start += newSize;
            previous.set(current.getBytes());
            LOG.info("Skip line " + previous + " for last split.");
        }

        // If we passed the end, still read one extra line.
        if (start > end) {
            if (isContinue) {
                newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
                if (!canSplit(previous.getBytes(), current.getBytes())) {
                    // Still not splittable, so skip the block.
                    start += newSize;
                    isContinue = false;
                }
            }
        }
        LOG.info("Split between: \n" + previous + "\n" + current);

        // Restart at the last read line.
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        this.pos = start;
    } else {
        Text skip = new Text();
        start += in.readLine(skip, maxLineLength, maxBytesToConsume(start));
        // start += in.readLine(skip, 0, maxBytesToConsume(start));
        LOG.info("Skip line " + skip + ". Start at " + start);
    }
    // Restart at the start index.
}
From source file:com.cloudera.ByteBufferRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.context = context;
    final Path file = split.getPath();
    initialize(job, split.getStart(), split.getLength(), file);
}
From source file:com.cloudera.crunch.type.avro.AvroRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration conf = context.getConfiguration();
    SeekableInput in = new FsInput(split.getPath(), conf);
    DatumReader<T> datumReader = new GenericDatumReader<T>();
    this.reader = DataFileReader.openReader(in, datumReader);
    reader.sync(split.getStart()); // sync to start
    this.start = reader.tell();
    this.end = split.getStart() + split.getLength();
}
From source file:com.cloudera.fts.spark.format.RawFileRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    FileSplit split = (FileSplit) inputSplit;
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    fileIn = fs.open(path);
    key = new Text(path.toString());
    finished = false;
}
From source file:com.cloudera.sa.ExcelRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    Configuration conf = context.getConfiguration();
    Path file = fileSplit.getPath();
    FileSystem fs = file.getFileSystem(conf);
    this.in = fs.open(file);

    XSSFWorkbook workbook = new XSSFWorkbook(this.in);
    XSSFSheet sheet = workbook.getSheetAt(0);
    this.totalRows = sheet.getPhysicalNumberOfRows();
    this.processedRows = 0;
    this.rowIterator = sheet.rowIterator();
}
From source file:com.cloudera.sqoop.mapreduce.MergeMapperBase.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    keyColName = conf.get(MergeJob.MERGE_KEY_COL_KEY);

    InputSplit is = context.getInputSplit();
    FileSplit fs = (FileSplit) is;
    Path splitPath = fs.getPath();

    if (splitPath.toString().startsWith(conf.get(MergeJob.MERGE_NEW_PATH_KEY))) {
        this.isNew = true;
    } else if (splitPath.toString().startsWith(conf.get(MergeJob.MERGE_OLD_PATH_KEY))) {
        this.isNew = false;
    } else {
        throw new IOException("File " + splitPath + " is not under new path "
                + conf.get(MergeJob.MERGE_NEW_PATH_KEY) + " or old path "
                + conf.get(MergeJob.MERGE_OLD_PATH_KEY));
    }
}
From source file:com.cotdp.hadoop.BrotliFileRecordReader.java
License:Apache License
/**
 * Initialize and open the Brotli file from the FileSystem.
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) inputSplit;
    Configuration conf = taskAttemptContext.getConfiguration();
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);

    // Set the file name as the key
    currentKey.set(path.getName());

    // Open the stream
    fsin = fs.open(path);

    // Launch an external process that the stream will be piped through
    String cmd = "/bin/cat";
    ProcessBuilder pb = new ProcessBuilder();
    pb.redirectOutput();
    pb.command(cmd);
    try {
        decompressor = pb.start();
    } catch (IOException e) {
        e.printStackTrace();
    }
}