Example usage for org.apache.hadoop.mapreduce.lib.input FileSplit getPath

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.lib.input.FileSplit.getPath().

Prototype

public Path getPath() 

Document

The file containing this split's data.
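
Before the examples below, here is a minimal sketch of how a custom RecordReader typically uses getPath() in initialize() to locate and open the file backing its split. This is an illustration only, not taken from any of the source files listed on this page; imports are omitted to match the examples, and the local variable names are hypothetical.

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration conf = context.getConfiguration();
    // getPath() returns the file containing this split's data
    Path file = split.getPath();
    // resolve the file system that owns the path and open the file
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream in = fs.open(file);
    // position the stream at the start of this split; records are then read
    // until getStart() + getLength() is reached
    in.seek(split.getStart());
}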

Usage

From source file:cn.uc.hadoop.mapreduce.lib.input.FilePathLineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    // Added by qiujw: use the file path as the record key
    key = new Text(file.toString());

    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }

            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:co.cask.hydrator.plugin.batch.CopybookRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    // Get configuration
    Configuration conf = context.getConfiguration();
    int fileStructure = net.sf.JRecord.Common.Constants.IO_FIXED_LENGTH;
    Path path = new Path(conf.get(CopybookInputFormat.COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH));
    FileSystem fs = FileSystem.get(path.toUri(), conf);
    // Create input stream for the COBOL copybook contents
    InputStream inputStream = IOUtils
            .toInputStream(conf.get(CopybookInputFormat.COPYBOOK_INPUTFORMAT_CBL_CONTENTS), "UTF-8");
    BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream);
    try {
        externalRecord = CopybookIOUtils.getExternalRecord(bufferedInputStream);
        recordByteLength = CopybookIOUtils.getRecordLength(externalRecord, fileStructure);

        LineProvider lineProvider = LineIOProvider.getInstance().getLineProvider(fileStructure,
                CopybookIOUtils.FONT);
        reader = LineIOProvider.getInstance().getLineReader(fileStructure, lineProvider);
        LayoutDetail copybook = CopybookIOUtils.getLayoutDetail(externalRecord);

        org.apache.hadoop.mapreduce.lib.input.FileSplit fileSplit = (org.apache.hadoop.mapreduce.lib.input.FileSplit) split;

        start = fileSplit.getStart();
        end = start + fileSplit.getLength();

        BufferedInputStream fileIn = new BufferedInputStream(fs.open(fileSplit.getPath()));
        // Jump to the point in the split at which the first complete record of the split starts,
        // if not the first InputSplit
        if (start != 0) {
            position = start - (start % recordByteLength) + recordByteLength;
            fileIn.skip(position);
        }
        reader.open(fileIn, copybook);

    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:co.cask.hydrator.plugin.batch.source.XMLRecordReader.java

License:Apache License

public XMLRecordReader(FileSplit split, Configuration conf) throws IOException {
    file = split.getPath();
    fileName = file.toUri().toString();
    fs = file.getFileSystem(conf);
    XMLInputFactory factory = XMLInputFactory.newInstance();
    FSDataInputStream fdDataInputStream = fs.open(file);
    inputStream = new TrackingInputStream(fdDataInputStream);
    availableBytes = inputStream.available();
    try {
        reader = factory.createXMLStreamReader(inputStream);
    } catch (XMLStreamException exception) {
        throw new RuntimeException("XMLStreamException exception : ", exception);
    }
    //Set required node path details.
    String nodePath = conf.get(XMLInputFormat.XML_INPUTFORMAT_NODE_PATH);
    //Remove preceding '/' in node path to avoid first unwanted element after split('/')
    if (nodePath.indexOf("/") == 0) {
        nodePath = nodePath.substring(1, nodePath.length());
    }
    nodes = nodePath.split("/");

    currentNodeLevelMap = new HashMap<Integer, String>();

    tempFilePath = conf.get(XMLInputFormat.XML_INPUTFORMAT_PROCESSED_DATA_TEMP_FOLDER);
    fileAction = conf.get(XMLInputFormat.XML_INPUTFORMAT_FILE_ACTION);
    targetFolder = conf.get(XMLInputFormat.XML_INPUTFORMAT_TARGET_FOLDER);
}

From source file:co.nubetech.hiho.dedup.DelimitedLineRecordReader.java

License:Apache License

/**
 * @param delimiter
 * @param column
 */

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.delimiter = job.get(DelimitedTextInputFormat.DELIMITER_CONF);
    this.column = job.getInt(DelimitedTextInputFormat.COLUMN_CONF, 0);
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:co.nubetech.hiho.merge.MergeKeyMapper.java

License:Apache License

@Override
protected void setup(Mapper.Context context) throws IOException, InterruptedException {

    Configuration conf = context.getConfiguration();
    InputSplit is = context.getInputSplit();
    FileSplit fs = (FileSplit) is;
    Path splitPath = fs.getPath();

    if (splitPath.toString().contains(conf.get(HIHOConf.MERGE_OLD_PATH))) {
        isOld = true;
    } else if (splitPath.toString().contains(conf.get(HIHOConf.MERGE_NEW_PATH))) {
        isOld = false;
    } else {
        throw new IOException("File " + splitPath + " is not under new path" + conf.get(HIHOConf.MERGE_NEW_PATH)
                + " and old path" + conf.get(HIHOConf.MERGE_OLD_PATH));
    }
}

From source file:com.ashishpaliwal.hadoop.utils.inputformat.CsvRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;

    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = (this.start + split.getLength());
    Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    this.codec = this.compressionCodecs.getCodec(file);

    FileSystem fs = file.getFileSystem(job);
    this.fileIn = fs.open(file);
    if (isCompressedInput()) {
        this.decompressor = CodecPool.getDecompressor(this.codec);
        if (this.codec instanceof SplittableCompressionCodec) {
            SplitCompressionInputStream cIn = ((SplittableCompressionCodec) this.codec).createInputStream(
                    this.fileIn, this.decompressor, this.start, this.end,
                    SplittableCompressionCodec.READ_MODE.BYBLOCK);

            this.in = new CsvLineReader(cIn, job);
            this.start = cIn.getAdjustedStart();
            this.end = cIn.getAdjustedEnd();
            this.filePosition = cIn;
        } else {
            this.in = new CsvLineReader(this.codec.createInputStream(this.fileIn, this.decompressor), job);
            this.filePosition = this.fileIn;
        }
    } else {
        this.fileIn.seek(this.start);
        this.in = new CsvLineReader(this.fileIn, job);
        this.filePosition = this.fileIn;
    }

    if (this.start != 0L) {
        this.start += this.in.readLine(new Text(), 0, maxBytesToConsume(this.start));
    }
    this.pos = this.start;
}

From source file:com.awcoleman.ExampleJobSummaryLogWithOutput.BinRecRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit insplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();

    FileSplit split = (FileSplit) insplit;

    start = split.getStart();
    end = start + split.getLength();
    pos = start;

    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    fsin = fs.open(path);
}

From source file:com.bigdata.mapreduce.seqtotext.beta.ZipFileRecordReader.java

License:Apache License

/**
 * Initialise and open the ZIP file from the FileSystem
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
        throws IOException, InterruptedException {
    //      Configuration conf = new Configuration();
    //      conf.set("fs.defaultFS", "hdfs://training.musigma.com:8020/user/musigma/");
    FileSplit split = (FileSplit) inputSplit;
    System.out.println("the task attempt instance is : " + taskAttemptContext.getJobName());
    System.out.println("the task attempt instance is : " + taskAttemptContext.getWorkingDirectory().toString());
    Configuration conf = taskAttemptContext.getConfiguration();
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    System.out.println("file system replication : " + fs.getDefaultReplication());
    // Open the stream
    fsin = fs.open(path);
    // zip = new ZipInputStream(fsin);
    tar = new TarInputStream(fsin);
    System.out.println("tar input stream is : " + tar.toString());
}

From source file:com.bigdata.mapreduce.seqtotext.beta1.ZipFileRecordReader.java

License:Apache License

public ZipFileRecordReader(Configuration conf, org.apache.hadoop.mapred.FileSplit split)
        throws IOException, InterruptedException {
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    System.out.println("file system replication : " + fs.getDefaultReplication());
    // Open the stream
    fsin = fs.open(path);
    // zip = new ZipInputStream(fsin);
    tar = new TarInputStream(fsin);
    System.out.println("tar input stream is : " + tar.toString());
}

From source file:com.blackberry.logdriver.mapreduce.boom.BoomIndividualRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;

    LOG.info("Initializing {}:{}+{}",
            new Object[] { fileSplit.getPath(), fileSplit.getStart(), fileSplit.getLength() });

    // Check for zero length files
    if (fileSplit.getPath().getFileSystem(context.getConfiguration()).getFileStatus(fileSplit.getPath())
            .getLen() == 0) {
        reader = null;
        return;
    }

    GenericDatumReader<Record> datumReader = new GenericDatumReader<Record>(Schemas.getSchema("logBlock"));
    reader = new DataFileReader<Record>(new FsInput(fileSplit.getPath(), context.getConfiguration()),
            datumReader);
    datumReader.setExpected(Schemas.getSchema("logBlock"));
    datumReader.setSchema(reader.getSchema());

    long size = fileSplit.getLength();
    start = fileSplit.getStart();
    end = start + size;

    reader.sync(start);
}