List of usage examples for org.apache.hadoop.mapreduce.lib.input.FileSplit.getPath()
public Path getPath()
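In each example below, getPath() returns the Path of the file that the current input split belongs to; a record reader typically uses it to open the underlying file and seek to the split boundaries. A minimal sketch of that common pattern (the local names genericSplit, context, and fileIn are hypothetical placeholders, not taken from any source file below):

    // Minimal sketch: open the split's file and position the stream at the split start.
    FileSplit split = (FileSplit) genericSplit;
    Path file = split.getPath();                              // file this split belongs to
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);                 // open the underlying file
    long start = split.getStart();
    long end = start + split.getLength();
    fileIn.seek(start);                                       // position at the start of this split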
From source file:cn.uc.hadoop.mapreduce.lib.input.FilePathLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    // Use the file path as the record key
    key = new Text(file.toString());
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:co.cask.hydrator.plugin.batch.CopybookRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    // Get configuration
    Configuration conf = context.getConfiguration();
    int fileStructure = net.sf.JRecord.Common.Constants.IO_FIXED_LENGTH;
    Path path = new Path(conf.get(CopybookInputFormat.COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH));
    FileSystem fs = FileSystem.get(path.toUri(), conf);
    // Create input stream for the COBOL copybook contents
    InputStream inputStream = IOUtils
            .toInputStream(conf.get(CopybookInputFormat.COPYBOOK_INPUTFORMAT_CBL_CONTENTS), "UTF-8");
    BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream);
    try {
        externalRecord = CopybookIOUtils.getExternalRecord(bufferedInputStream);
        recordByteLength = CopybookIOUtils.getRecordLength(externalRecord, fileStructure);
        LineProvider lineProvider = LineIOProvider.getInstance().getLineProvider(fileStructure,
                CopybookIOUtils.FONT);
        reader = LineIOProvider.getInstance().getLineReader(fileStructure, lineProvider);
        LayoutDetail copybook = CopybookIOUtils.getLayoutDetail(externalRecord);
        org.apache.hadoop.mapreduce.lib.input.FileSplit fileSplit =
                (org.apache.hadoop.mapreduce.lib.input.FileSplit) split;
        start = fileSplit.getStart();
        end = start + fileSplit.getLength();
        BufferedInputStream fileIn = new BufferedInputStream(fs.open(fileSplit.getPath()));
        // Jump to the point in the split at which the first complete record of the split starts,
        // if not the first InputSplit
        if (start != 0) {
            position = start - (start % recordByteLength) + recordByteLength;
            fileIn.skip(position);
        }
        reader.open(fileIn, copybook);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:co.cask.hydrator.plugin.batch.source.XMLRecordReader.java
License:Apache License
public XMLRecordReader(FileSplit split, Configuration conf) throws IOException {
    file = split.getPath();
    fileName = file.toUri().toString();
    fs = file.getFileSystem(conf);
    XMLInputFactory factory = XMLInputFactory.newInstance();
    FSDataInputStream fdDataInputStream = fs.open(file);
    inputStream = new TrackingInputStream(fdDataInputStream);
    availableBytes = inputStream.available();
    try {
        reader = factory.createXMLStreamReader(inputStream);
    } catch (XMLStreamException exception) {
        throw new RuntimeException("XMLStreamException exception : ", exception);
    }
    // Set required node path details.
    String nodePath = conf.get(XMLInputFormat.XML_INPUTFORMAT_NODE_PATH);
    // Remove preceding '/' in node path to avoid an unwanted first element after split('/')
    if (nodePath.indexOf("/") == 0) {
        nodePath = nodePath.substring(1, nodePath.length());
    }
    nodes = nodePath.split("/");
    currentNodeLevelMap = new HashMap<Integer, String>();
    tempFilePath = conf.get(XMLInputFormat.XML_INPUTFORMAT_PROCESSED_DATA_TEMP_FOLDER);
    fileAction = conf.get(XMLInputFormat.XML_INPUTFORMAT_FILE_ACTION);
    targetFolder = conf.get(XMLInputFormat.XML_INPUTFORMAT_TARGET_FOLDER);
}
From source file:co.nubetech.hiho.dedup.DelimitedLineRecordReader.java
License:Apache License
/**
 * @param delimiter
 * @param column
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.delimiter = job.get(DelimitedTextInputFormat.DELIMITER_CONF);
    this.column = job.getInt(DelimitedTextInputFormat.COLUMN_CONF, 0);
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:co.nubetech.hiho.merge.MergeKeyMapper.java
License:Apache License
@Override
protected void setup(Mapper.Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    InputSplit is = context.getInputSplit();
    FileSplit fs = (FileSplit) is;
    Path splitPath = fs.getPath();
    if (splitPath.toString().contains(conf.get(HIHOConf.MERGE_OLD_PATH))) {
        isOld = true;
    } else if (splitPath.toString().contains(conf.get(HIHOConf.MERGE_NEW_PATH))) {
        isOld = false;
    } else {
        throw new IOException("File " + splitPath + " is not under new path "
                + conf.get(HIHOConf.MERGE_NEW_PATH) + " or old path " + conf.get(HIHOConf.MERGE_OLD_PATH));
    }
}
From source file:com.ashishpaliwal.hadoop.utils.inputformat.CsvRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    this.codec = this.compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    this.fileIn = fs.open(file);
    if (isCompressedInput()) {
        this.decompressor = CodecPool.getDecompressor(this.codec);
        if ((this.codec instanceof SplittableCompressionCodec)) {
            SplitCompressionInputStream cIn = ((SplittableCompressionCodec) this.codec).createInputStream(
                    this.fileIn, this.decompressor, this.start, this.end,
                    SplittableCompressionCodec.READ_MODE.BYBLOCK);
            this.in = new CsvLineReader(cIn, job);
            this.start = cIn.getAdjustedStart();
            this.end = cIn.getAdjustedEnd();
            this.filePosition = cIn;
        } else {
            this.in = new CsvLineReader(this.codec.createInputStream(this.fileIn, this.decompressor), job);
            this.filePosition = this.fileIn;
        }
    } else {
        this.fileIn.seek(this.start);
        this.in = new CsvLineReader(this.fileIn, job);
        this.filePosition = this.fileIn;
    }
    // If this is not the first split, skip the first (partial) line
    if (this.start != 0L) {
        this.start += this.in.readLine(new Text(), 0, maxBytesToConsume(this.start));
    }
    this.pos = this.start;
}
From source file:com.awcoleman.ExampleJobSummaryLogWithOutput.BinRecRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit insplit, TaskAttemptContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    FileSplit split = (FileSplit) insplit;
    start = split.getStart();
    end = start + split.getLength();
    pos = start;
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    fsin = fs.open(path);
}
From source file:com.bigdata.mapreduce.seqtotext.beta.ZipFileRecordReader.java
License:Apache License
/**
 * Initialise and open the ZIP file from the FileSystem
 * (note: this reader actually wraps the stream in a TarInputStream; the ZIP variant is commented out).
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
        throws IOException, InterruptedException {
    // Configuration conf = new Configuration();
    // conf.set("fs.defaultFS", "hdfs://training.musigma.com:8020/user/musigma/");
    FileSplit split = (FileSplit) inputSplit;
    System.out.println("the job name is : " + taskAttemptContext.getJobName());
    System.out.println("the working directory is : " + taskAttemptContext.getWorkingDirectory().toString());
    Configuration conf = taskAttemptContext.getConfiguration();
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    System.out.println("file system replication : " + fs.getDefaultReplication());
    // Open the stream
    fsin = fs.open(path);
    // zip = new ZipInputStream(fsin);
    tar = new TarInputStream(fsin);
    System.out.println("tar input stream is : " + tar.toString());
}
From source file:com.bigdata.mapreduce.seqtotext.beta1.ZipFileRecordReader.java
License:Apache License
public ZipFileRecordReader(Configuration conf, org.apache.hadoop.mapred.FileSplit split)
        throws IOException, InterruptedException {
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    System.out.println("file system replication : " + fs.getDefaultReplication());
    // Open the stream
    fsin = fs.open(path);
    // zip = new ZipInputStream(fsin);
    tar = new TarInputStream(fsin);
    System.out.println("tar input stream is : " + tar.toString());
}
From source file:com.blackberry.logdriver.mapreduce.boom.BoomIndividualRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    LOG.info("Initializing {}:{}+{}",
            new Object[] { fileSplit.getPath(), fileSplit.getStart(), fileSplit.getLength() });

    // Check for zero length files
    if (fileSplit.getPath().getFileSystem(context.getConfiguration()).getFileStatus(fileSplit.getPath())
            .getLen() == 0) {
        reader = null;
        return;
    }

    GenericDatumReader<Record> datumReader = new GenericDatumReader<Record>(Schemas.getSchema("logBlock"));
    reader = new DataFileReader<Record>(new FsInput(fileSplit.getPath(), context.getConfiguration()),
            datumReader);
    datumReader.setExpected(Schemas.getSchema("logBlock"));
    datumReader.setSchema(reader.getSchema());

    long size = fileSplit.getLength();
    start = fileSplit.getStart();
    end = start + size;
    reader.sync(start);
}