Example usage for org.apache.hadoop.mapred FileSplit getPath

Introduction

This page collects example usages of org.apache.hadoop.mapred.FileSplit.getPath() drawn from open-source projects.

Prototype

public Path getPath() 

Document

The file containing this split's data.
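
Before the per-project examples, here is a minimal sketch of the typical pattern: resolve the split's Path, obtain its FileSystem, open the file, and seek to the split's start offset. The helper name openSplit is illustrative and not taken from any project below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public static FSDataInputStream openSplit(FileSplit split, Configuration conf) throws IOException {
    final Path file = split.getPath(); // the file containing this split's data
    final FileSystem fs = file.getFileSystem(conf); // the filesystem that owns the file
    final FSDataInputStream in = fs.open(file);
    in.seek(split.getStart()); // position the stream at the split's first byte
    return in;
}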

Usage

From source file:DeprecatedBAMInputFormat.java

License:Open Source License

public static List<org.apache.hadoop.mapreduce.InputSplit> undeprecateSplits(InputSplit[] splits)
        throws IOException {
    final List<org.apache.hadoop.mapreduce.InputSplit> undeprecated = new ArrayList<org.apache.hadoop.mapreduce.InputSplit>(
        splits.length);
    for (final InputSplit s : splits) {
        final FileSplit f = (FileSplit) s;
        undeprecated.add(new org.apache.hadoop.mapreduce.lib.input.FileSplit(f.getPath(), f.getStart(),
                f.getLength(), f.getLocations()));
    }
    return undeprecated;
}
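
This helper bridges the deprecated mapred API to the newer mapreduce API by copying the four fields the two FileSplit classes share: getPath(), getStart(), getLength(), and getLocations(). A hypothetical call site, with inputFormat, job, and numSplits assumed:

InputSplit[] oldSplits = inputFormat.getSplits(job, numSplits);
List<org.apache.hadoop.mapreduce.InputSplit> newSplits =
        DeprecatedBAMInputFormat.undeprecateSplits(oldSplits);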

From source file:StreamWikiDumpInputFormat.java

License:Apache License

public RecordReader<Text, Text> getRecordReader(final InputSplit genericSplit, JobConf job, Reporter reporter)
        throws IOException {
    // handling non-standard record reader (likely StreamXmlRecordReader)
    FileSplit split = (FileSplit) genericSplit;
    LOG.info("getRecordReader start.....split=" + split);
    reporter.setStatus(split.toString());

    // Resolve the filesystem holding the split's file; the record reader opens it
    FileSystem fs = split.getPath().getFileSystem(job);
    String patt = job.get(KEY_EXCLUDE_PAGE_PATTERN);
    boolean prev = job.getBoolean(KEY_PREVIOUS_REVISION, true);
    return new MyRecordReader(split, reporter, job, fs,
            patt != null && !"".equals(patt) ? Pattern.compile(patt) : null, prev);
}

From source file:DeprecatedBAMRecordReader.java

License:Open Source License

public DeprecatedBAMRecordReader(InputSplit split, final JobConf job, Reporter reporter) throws IOException {
    if (split instanceof DeprecatedFileVirtualSplit) {
        rr.initialize(((DeprecatedFileVirtualSplit) split).vs, new FakeTaskAttemptContext(job));

        splitLength = split.getLength();
        return;

    }
    if (split instanceof FileSplit) {
        // XXX             XXX
        //     XXX     XXX
        //         XXX
        //     XXX     XXX
        // XXX             XXX
        //
        // Hive gives us its own custom FileSplits for some reason, so we have
        // to do our own split alignment. (Sometimes, anyway; for "select
        // count(*) from table" we get FileSplits here, but for "select * from
        // table" our input format is used directly. Perhaps it's only because
        // the latter doesn't spawn a MapReduce job, so getting a FileSplit
        // here is the common case.)
        //
        // Since we get only one split at a time here, this is very poor: we
        // have to open the file for every split, even if it's the same file
        // every time.
        //
        // This should always work, but might be /very/ slow. I can't think of
        // a better way.

        final FileSplit fspl = (FileSplit) split;
        final Path path = fspl.getPath();

        final long beg = fspl.getStart();
        final long end = beg + fspl.getLength();

        final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(job), path);
        final BAMSplitGuesser guesser = new BAMSplitGuesser(sin);

        final long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);
        sin.close();

        if (alignedBeg == end)
            throw new IOException("Guesser found nothing after pos " + beg);

        final long alignedEnd = end << 16 | 0xffff;
        splitLength = (alignedEnd - alignedBeg) >> 16;

        rr.initialize(new FileVirtualSplit(path, alignedBeg, alignedEnd, fspl.getLocations()),
                new FakeTaskAttemptContext(job));
        return;
    }

    throw new ClassCastException("Can only handle DeprecatedFileVirtualSplit and FileSplit");
}
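
The shift-by-16 arithmetic above reflects BGZF virtual file offsets as used by BAM: the upper 48 bits of a long hold a compressed block's byte offset in the file, and the lower 16 bits hold an offset within that block's uncompressed data. Thus end << 16 | 0xffff is the largest virtual offset still inside the split, and (alignedEnd - alignedBeg) >> 16 recovers an approximate compressed length. A sketch of the packing, for illustration only:

static long makeVirtualOffset(long blockOffset, int withinBlock) {
    // upper 48 bits: compressed block offset; lower 16 bits: intra-block offset
    return (blockOffset << 16) | (withinBlock & 0xffff);
}

static long blockOffset(long virtualOffset) {
    return virtualOffset >>> 16;
}

static int withinBlockOffset(long virtualOffset) {
    return (int) (virtualOffset & 0xffff);
}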

From source file:SeekableInputStream.java

License:Apache License

public static SeekableInputStream getInstance(FileSplit split, FileSystem fs,
        CompressionCodecFactory compressionCodecs) throws IOException {
    return getInstance(split.getPath(), split.getStart(), split.getStart() + split.getLength(), fs,
            compressionCodecs);
}

From source file:alluxio.hadoop.HadoopUtils.java

License:Apache License

/**
 * Returns a string representation of a Hadoop {@link FileSplit}.
 *
 * @param fs Hadoop {@link FileSplit}
 * @return its string representation
 */
public static String toStringHadoopFileSplit(FileSplit fs) {
    StringBuilder sb = new StringBuilder();
    sb.append("HadoopFileSplit: Path: ").append(fs.getPath());
    sb.append(" , Start: ").append(fs.getStart());
    sb.append(" , Length: ").append(fs.getLength());
    sb.append(" , Hosts: ");
    String[] locs;
    try {
        locs = fs.getLocations();
    } catch (IOException e) {
        LOG.error(e.getMessage());
        locs = new String[] {};
    }
    for (String loc : locs) {
        sb.append(loc).append("; ");
    }

    return sb.toString();
}
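
A hypothetical use, logging every split before a job runs (the splits array and LOG are assumed):

for (FileSplit split : splits) {
    LOG.info(HadoopUtils.toStringHadoopFileSplit(split));
}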

From source file:br.ufrj.nce.recureco.distributedindex.indexer.IndexerMap.java

License:Open Source License

public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {

    FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
    String filename = fileSplit.getPath().getName();

    List<String> tokenizedLine = lineTokenizer.tokenize(value.toString());

    for (String auxWord : tokenizedLine) {
        output.collect(new Text(auxWord), new Text(filename));
    }
}
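
Note the pattern: an old-API mapper can recover the name of the file it is reading through reporter.getInputSplit(), but the cast to FileSplit is only safe when the input format actually produces FileSplits (as TextInputFormat does); a combining input format such as CombineFileInputFormat would make the cast throw a ClassCastException.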

From source file:ca.sparkera.adapters.mapred.MainframeVBRecordReader.java

License:Apache License

public MainframeVBRecordReader(Configuration job, FileSplit split) throws IOException {
    reader = new ca.sparkera.adapters.mapreduce.MainframeVBRecordReader();
    reader.initialize(job, split.getStart(), split.getLength(), split.getPath());
}

From source file:com.alexholmes.hadooputils.sort.DelimitedLineRecordReader.java

License:Apache License

protected void initialize(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    String rowDelim = job.get("textinputformat.record.delimiter", null);
    if (codec != null) {
        if (rowDelim != null) {
            byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
            in = new DelimitedLineReader(codec.createInputStream(fileIn), job,
                    (hexcode != null) ? hexcode : rowDelim.getBytes());
        } else {
            in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (rowDelim != null) {
            byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
            in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes());
        } else {
            in = new DelimitedLineReader(fileIn, job);
        }
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
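
The skipFirstLine logic implements the standard Hadoop convention for text splits: every reader except the first discards the (possibly partial) line at the head of its split, and every reader consumes one line past its nominal end, so each record is read exactly once across splits. Backing start up by one byte before seeking keeps this correct at exact line boundaries: if the split begins right after a newline, the discarded read consumes only that newline and the reader still starts at the intended line.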

From source file:com.alexholmes.hadooputils.sort.LzoDelimitedLineRecordReader.java

License:Apache License

@Override
protected void initialize(Configuration job, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("No codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    String rowDelim = job.get("textinputformat.record.delimiter", null);
    if (rowDelim != null) {
        byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
        in = new DelimitedLineReader(codec.createInputStream(fileIn), job,
                (hexcode != null) ? hexcode : rowDelim.getBytes());
    } else {
        in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
    }

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}
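
Unlike the byte-backup approach above, this LZO variant seeks the underlying compressed stream directly (presumably relying on splits aligned to LZO block boundaries, as with indexed LZO input formats), discards one line from the decompressed stream, and takes fileIn.getPos() as the logical start.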

From source file:com.aliyun.fs.oss.common.OssRecordReader.java

License:Apache License

public OssRecordReader(Configuration job, FileSplit split, FileSystem fs, byte[] recordDelimiter)
        throws IOException {
    this.maxLineLength = job.getInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH,
            Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job, recordDelimiter);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiter);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job, recordDelimiter);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
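
Worth noting is how filePosition is chosen per branch: a splittable codec adjusts the split to compression-block boundaries and reports positions on the compressed stream; a non-splittable codec falls back to the raw file stream, since such a file must be read as a single split; and uncompressed input simply seeks to the split's start. The final readLine applies the same first-record-skipping convention as the text readers above.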