Example usage for org.apache.hadoop.mapreduce.lib.input FileSplit getPath

Introduction

On this page you can find usage examples of org.apache.hadoop.mapreduce.lib.input.FileSplit#getPath().

Prototype

public Path getPath() 

Documentation

Returns the file containing this split's data.
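Most callers follow the same pattern: inside RecordReader#initialize, cast the generic InputSplit to a FileSplit, call getPath() to obtain the file backing the split, and open that file through its FileSystem. The sketch below illustrates this pattern; it is not taken from any of the projects listed under Usage, and the method and variable names are illustrative.

// Minimal sketch of the typical getPath() pattern. Assumes the usual imports:
// org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.FSDataInputStream,
// org.apache.hadoop.fs.FileSystem, org.apache.hadoop.fs.Path,
// org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext,
// org.apache.hadoop.mapreduce.lib.input.FileSplit.
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;      // file-based input formats provide FileSplits
    Configuration conf = context.getConfiguration();
    Path file = split.getPath();                     // the file containing this split's data
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream in = fs.open(file);            // open the file...
    in.seek(split.getStart());                       // ...and position it at the split's start offset
}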

Usage

From source file:com.blm.orc.OrcNewInputFormat.java

License:Apache License

@Override
public RecordReader<NullWritable, OrcStruct> createRecordReader(InputSplit inputSplit,
        TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) inputSplit;
    Path path = fileSplit.getPath();
    Configuration conf = ShimLoader.getHadoopShims().getConfiguration(context);
    // The split's path identifies the ORC file; the record reader is then limited
    // to the split's start offset and length.
    return new OrcRecordReader(OrcFile.createReader(path, OrcFile.readerOptions(conf)),
            ShimLoader.getHadoopShims().getConfiguration(context), fileSplit.getStart(), fileSplit.getLength());
}

From source file:com.bonc.mr_roamRecognition_hjpt.comm.PathRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    path = split.getPath().toString();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
                    this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:com.ci.backports.avro.mapreduce.AvroRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    // Create a DataFileReader that can read records out of the input split.
    SeekableInput fileInput = new FsInput(fileSplit.getPath(), context.getConfiguration());
    // We have to get the schema to read the records, which we assume was set in
    // the job's configuration.
    Schema schema = AvroJob.getInputSchema(context.getConfiguration());
    reader = new DataFileReader<T>(fileInput, new SpecificDatumReader<T>(schema));

    // Initialize the start and end offsets into the input file.
    // The reader syncs to the first record after the start of the input split.
    reader.sync(fileSplit.getStart());
    start = reader.previousSync();
    end = fileSplit.getStart() + fileSplit.getLength();

    currentRecord = new AvroKey<T>(null);
}

From source file:com.cloudera.bigdata.analysis.dataload.mapreduce.SplitableRecordReader.java

License:Apache License

/**
 * Decide the start of the reader.
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // if (codec instanceof CryptoCodec && job instanceof JobConf)
    // CryptoContextHelper.resetInputCryptoContext((CryptoCodec) codec,
    // (JobConf) job, file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    LOG.info("Read from " + split.getPath().toString());
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));

        // Read another line as previous.

        Text current = new Text();

        int newSize = in.readLine(previous, maxLineLength, maxBytesToConsume(start));

        LOG.info("Skip line " + previous + " for last split.");

        start += newSize;

        // Keep reading until a splitable point is found.
        while (start <= end) {
            newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
            if (canSplit(previous.getBytes(), current.getBytes())) {
                break;
            }
            start += newSize;
            previous.set(current.getBytes());
            LOG.info("Skip line " + previous + " for last split.");
        }

        // If exceed the end, still read one extra line.
        if (start > end) {
            if (isContinue) {
                newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
                if (!canSplit(previous.getBytes(), current.getBytes())) {
                    // Still not splitable. So skip the block.
                    start += newSize;
                    isContinue = false;
                }
            }
        }
        LOG.info("Split between: \n" + previous + "\n" + current);

        // Restart at the last read line.
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }

        this.pos = start;
    } else {
        Text skip = new Text();
        start += in.readLine(skip, maxLineLength, maxBytesToConsume(start));
        // start += in.readLine(skip, 0, maxBytesToConsume(start));
        LOG.info("Skip line " + skip + ". Start at " + start);
    }

    // Restart at the start index.
}

From source file:com.cloudera.ByteBufferRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.context = context;
    final Path file = split.getPath();
    initialize(job, split.getStart(), split.getLength(), file);
}

From source file:com.cloudera.crunch.type.avro.AvroRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration conf = context.getConfiguration();
    SeekableInput in = new FsInput(split.getPath(), conf);
    DatumReader<T> datumReader = new GenericDatumReader<T>();
    this.reader = DataFileReader.openReader(in, datumReader);
    reader.sync(split.getStart()); // sync to start
    this.start = reader.tell();
    this.end = split.getStart() + split.getLength();
}

From source file:com.cloudera.fts.spark.format.RawFileRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {

    Configuration conf = context.getConfiguration();
    FileSplit split = (FileSplit) inputSplit;
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    fileIn = fs.open(path);
    key = new Text(path.toString());
    finished = false;
}

From source file:com.cloudera.sa.ExcelRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    Configuration conf = context.getConfiguration();
    Path file = fileSplit.getPath();
    FileSystem fs = file.getFileSystem(conf);
    this.in = fs.open(file);
    // Read the first sheet of the XLSX workbook and iterate over its rows.
    XSSFWorkbook workbook = new XSSFWorkbook(this.in);
    XSSFSheet sheet = workbook.getSheetAt(0);
    this.totalRows = sheet.getPhysicalNumberOfRows();
    this.processedRows = 0;
    this.rowIterator = sheet.rowIterator();
}

From source file:com.cloudera.sqoop.mapreduce.MergeMapperBase.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    keyColName = conf.get(MergeJob.MERGE_KEY_COL_KEY);

    InputSplit is = context.getInputSplit();
    FileSplit fs = (FileSplit) is;
    Path splitPath = fs.getPath();

    // Decide whether this split's file belongs to the new or the old dataset
    // by comparing its path against the configured path prefixes.
    if (splitPath.toString().startsWith(conf.get(MergeJob.MERGE_NEW_PATH_KEY))) {
        this.isNew = true;
    } else if (splitPath.toString().startsWith(conf.get(MergeJob.MERGE_OLD_PATH_KEY))) {
        this.isNew = false;
    } else {
        throw new IOException(
                "File " + splitPath + " is not under new path " + conf.get(MergeJob.MERGE_NEW_PATH_KEY)
                        + " or old path " + conf.get(MergeJob.MERGE_OLD_PATH_KEY));
    }
}

From source file:com.cotdp.hadoop.BrotliFileRecordReader.java

License:Apache License

/**
 * Initialize and open the ZIP file from the FileSystem
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) inputSplit;
    Configuration conf = taskAttemptContext.getConfiguration();
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);

    // Set the file path as the key
    currentKey.set(path.getName());
    // Open the stream
    fsin = fs.open(path);

    String cmd = "/bin/cat";
    ProcessBuilder pb = new ProcessBuilder();
    pb.redirectOutput();
    pb.command(cmd);

    try {
        decompressor = pb.start();

    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}