Example usage for org.apache.hadoop.mapred FileSplit getLocations

List of usage examples for org.apache.hadoop.mapred FileSplit getLocations

Introduction

On this page you can find example usages of org.apache.hadoop.mapred FileSplit getLocations. The method returns the names of the hosts that store the data for the split, and it may throw an IOException.

Prototype

public String[] getLocations() throws IOException 
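
As a quick orientation, here is a minimal, self-contained sketch of calling getLocations() on the splits produced by the old-API TextInputFormat. The input path and class name are hypothetical placeholders; on a purely local file system the returned host list may be empty.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class FileSplitLocationsExample {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(FileSplitLocationsExample.class);
        FileInputFormat.setInputPaths(conf, new Path("/tmp/example.txt")); // hypothetical input
        TextInputFormat format = new TextInputFormat();
        format.configure(conf);

        // Each FileSplit knows which hosts store its portion of the file.
        for (InputSplit split : format.getSplits(conf, 1)) {
            FileSplit fileSplit = (FileSplit) split;
            String[] locations = fileSplit.getLocations();
            System.out.println(fileSplit.getPath() + " [" + fileSplit.getStart() + ", +"
                    + fileSplit.getLength() + ") on " + String.join(", ", locations));
        }
    }
}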

Usage

From source file:DeprecatedBAMInputFormat.java

License:Open Source License

public static List<org.apache.hadoop.mapreduce.InputSplit> undeprecateSplits(InputSplit[] splits)
        throws IOException {
    final List<org.apache.hadoop.mapreduce.InputSplit> undeprecated = new ArrayList<org.apache.hadoop.mapreduce.InputSplit>(
            splits.length);
    for (final InputSplit s : splits) {
        final FileSplit f = (FileSplit) s;
        undeprecated.add(new org.apache.hadoop.mapreduce.lib.input.FileSplit(f.getPath(), f.getStart(),
                f.getLength(), f.getLocations()));
    }
    return undeprecated;
}
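
A hedged sketch of how this helper might be invoked, assuming DeprecatedBAMInputFormat inherits the old-API getSplits from FileInputFormat; the job configuration and input path below are placeholders, not part of the original source.

JobConf job = new JobConf();
FileInputFormat.setInputPaths(job, new Path("/data/example.bam")); // hypothetical path
InputSplit[] oldSplits = new DeprecatedBAMInputFormat().getSplits(job, 4);
List<org.apache.hadoop.mapreduce.InputSplit> newSplits =
        DeprecatedBAMInputFormat.undeprecateSplits(oldSplits);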

From source file:DeprecatedBAMRecordReader.java

License:Open Source License

public DeprecatedBAMRecordReader(InputSplit split, final JobConf job, Reporter reporter) throws IOException {
    if (split instanceof DeprecatedFileVirtualSplit) {
        rr.initialize(((DeprecatedFileVirtualSplit) split).vs, new FakeTaskAttemptContext(job));

        splitLength = split.getLength();
        return;

    }
    if (split instanceof FileSplit) {
        // XXX             XXX
        //     XXX     XXX
        //         XXX
        //     XXX     XXX
        // XXX             XXX
        //
        // Hive gives us its own custom FileSplits for some reason, so we have
        // to do our own split alignment. (Sometimes, anyway; for "select
        // count(*) from table" we get FileSplits here, but for "select * from
        // table" our input format is used directly. Perhaps it's only because
        // the latter doesn't spawn a MapReduce job, so getting a FileSplit
        // here is the common case.)
        //
        // Since we get only one split at a time here, this is very poor: we
        // have to open the file for every split, even if it's the same file
        // every time.
        //
        // This should always work, but might be /very/ slow. I can't think of
        // a better way.

        final FileSplit fspl = (FileSplit) split;
        final Path path = fspl.getPath();

        final long beg = fspl.getStart();
        final long end = beg + fspl.getLength();

        final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(job), path);
        final BAMSplitGuesser guesser = new BAMSplitGuesser(sin);

        final long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);
        sin.close();

        if (alignedBeg == end)
            throw new IOException("Guesser found nothing after pos " + beg);

        final long alignedEnd = end << 16 | 0xffff;
        splitLength = (alignedEnd - alignedBeg) >> 16;

        rr.initialize(new FileVirtualSplit(path, alignedBeg, alignedEnd, fspl.getLocations()),
                new FakeTaskAttemptContext(job));
        return;
    }

    throw new ClassCastException("Can only handle DeprecatedFileVirtualSplit and FileSplit");
}
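
The shift-by-16 arithmetic in the FileSplit branch above packs and unpacks BGZF-style virtual offsets: as the snippet suggests, the low 16 bits address a position inside an uncompressed block while the remaining high bits address the compressed block within the file. A small illustration with made-up values:

// Made-up values for illustration only.
long compressedBlockOffset = 1_000_000L; // byte offset of a BGZF block within the file
int withinBlockOffset = 513;             // offset inside the uncompressed block (fits in 16 bits)

long virtualOffset = compressedBlockOffset << 16 | withinBlockOffset;

// Recovering the two components:
long blockAgain = virtualOffset >>> 16;            // 1000000
int withinAgain = (int) (virtualOffset & 0xffff);  // 513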

From source file:alluxio.hadoop.HadoopUtils.java

License:Apache License

/**
 * Returns a string representation of a Hadoop {@link FileSplit}.
 *
 * @param fs Hadoop {@link FileSplit}
 * @return its string representation
 */
public static String toStringHadoopFileSplit(FileSplit fs) {
    StringBuilder sb = new StringBuilder();
    sb.append("HadoopFileSplit: Path: ").append(fs.getPath());
    sb.append(" , Start: ").append(fs.getStart());
    sb.append(" , Length: ").append(fs.getLength());
    sb.append(" , Hosts: ");
    String[] locs;
    try {
        locs = fs.getLocations();
    } catch (IOException e) {
        LOG.error(e.getMessage());
        locs = new String[] {};
    }
    for (String loc : locs) {
        sb.append(loc).append("; ");
    }

    return sb.toString();
}
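
A hypothetical call to this helper; the path, offsets, and host names are invented for illustration, and the expected output follows directly from the formatting above.

FileSplit split = new FileSplit(new Path("hdfs://nn/data/part-00000"), 0L, 134217728L,
        new String[] { "host1", "host2" });
System.out.println(HadoopUtils.toStringHadoopFileSplit(split));
// Prints (per the code above):
// HadoopFileSplit: Path: hdfs://nn/data/part-00000 , Start: 0 , Length: 134217728 , Hosts: host1; host2; 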

From source file:com.ds.lzo.DeprecatedLzoLineRecordReaderForCombined.java

License:Open Source License

public DeprecatedLzoLineRecordReaderForCombined(Configuration conf, FileSplit split) throws IOException {
    LOG.warn("split start: " + split.getStart());
    LOG.warn("split length: " + split.getLength());
    String[] locs = split.getLocations();
    for (String loc : locs) {
        LOG.warn("location: " + loc);
    }
    start = split.getStart();
    end = start + split.getLength();
    LOG.warn("split end: " + end);
    final Path file = split.getPath();
    LOG.warn("file: " + file.getName());
    LOG.warn("INT split start: " + (int) split.getStart());
    LOG.warn("INT split length: " + (int) split.getLength());
    LOG.warn("INT split end: " + (int) end);

    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);
    LOG.warn("codec: " + codec.toString());
    LOG.warn("config: " + conf.toString());
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }

    // Open the file and seek to the next split.
    fileIn = fs.open(file);
    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);
        LOG.warn("fileIn position: " + fileIn.getPos());
        LOG.warn("buffer size: " + conf.get("io.file.buffer.size"));

        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    pos = start;
}

From source file:com.hadoop.mapred.DeprecatedLzoTextInputFormat.java

License:Open Source License

@Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    FileSplit[] splits = (FileSplit[]) super.getSplits(conf, numSplits);
    // Find new starts/ends of the filesplit that align with the LZO blocks.

    List<FileSplit> result = new ArrayList<FileSplit>();

    for (FileSplit fileSplit : splits) {
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);

        if (!LzoInputFormatCommon.isLzoFile(file.toString())) {
            // non-LZO file, keep the input split as is.
            result.add(fileSplit);
            continue;
        }

        // LZO file, try to split if the .index file was found
        LzoIndex index = indexes.get(file);
        if (index == null) {
            throw new IOException("Index not found for " + file);
        }
        if (index.isEmpty()) {
            // Empty index, keep it as is.
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long lzoStart = index.alignSliceStartToIndex(start, end);
        long lzoEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (lzoStart != LzoIndex.NOT_FOUND && lzoEnd != LzoIndex.NOT_FOUND) {
            result.add(new FileSplit(file, lzoStart, lzoEnd - lzoStart, fileSplit.getLocations()));
        }
    }

    return result.toArray(new FileSplit[result.size()]);
}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForColocatedFileSplit.java

License:Open Source License

public RemoteParForColocatedFileSplit(FileSplit split, String fname, int blen) throws IOException {
    super(split.getPath(), split.getStart(), split.getLength(), split.getLocations());

    _fname = fname;
    _blen = blen;
}

From source file:com.ibm.jaql.fail.io.ErrorSplit.java

License:Apache License

public ErrorSplit(FileSplit split, JobConf job, Error e, int cnt) throws IOException {
    child = new FileSplit(split.getPath(), split.getStart(), split.getLength(), split.getLocations());
    error = e;
    count = cnt;
}

From source file:com.ibm.jaql.lang.expr.io.FileSplitToRecordFn.java

License:Apache License

@Override
public JsonRecord eval(Context context) throws Exception {
    // { path: string, start: long, length: long, locations: [string...] }
    if (in == null) {
        in = new DataInputBuffer();
        jpath = new MutableJsonString();
        jstart = new MutableJsonLong();
        jlength = new MutableJsonLong();
        jlocations = new BufferedJsonArray();
        values = new JsonValue[] { jpath, jstart, jlength, jlocations };
        resultRec = new BufferedJsonRecord();
        resultRec.set(NAMES, values, NAMES.length);
    }

    JsonRecord splitRec = (JsonRecord) exprs[0].eval(context);

    JsonString jsplitClassName = (JsonString) splitRec.get(InputSplitsFn.CLASS_TAG);
    Class<? extends FileSplit> splitCls = (Class<? extends FileSplit>) ClassLoaderMgr
            .resolveClass(jsplitClassName.toString());
    FileSplit split = (FileSplit) ReflectionUtils.newInstance(splitCls, null);
    JsonBinary rawSplit = (JsonBinary) splitRec.get(InputSplitsFn.SPLIT_TAG);
    in.reset(rawSplit.getInternalBytes(), rawSplit.bytesOffset(), rawSplit.bytesLength());
    split.readFields(in);
    JsonArray jlocs = (JsonArray) splitRec.get(InputSplitsFn.LOCATIONS_TAG);

    jpath.setCopy(split.getPath().toString());
    jstart.set(split.getStart());
    jlength.set(split.getLength());
    if (jlocs != null) {
        values[3] = jlocs;
    } else {
        String[] locs = split.getLocations();
        jlocations.resize(locs.length);
        for (int i = 0; i < locs.length; i++) {
            jlocations.set(i, new JsonString(locs[i]));
        }
        values[3] = jlocations;
    }

    return resultRec;
}

From source file:com.mongodb.hadoop.mapred.BSONFileInputFormat.java

License:Apache License

@Override
public FileSplit[] getSplits(final JobConf job, final int numSplits) throws IOException {

    FileStatus[] inputFiles = listStatus(job);
    List<FileSplit> results = new ArrayList<FileSplit>();
    for (FileStatus file : inputFiles) {
        BSONSplitter splitter = new BSONSplitter();
        splitter.setConf(job);
        splitter.setInputPath(file.getPath());
        Path splitFilePath;
        splitFilePath = new Path(file.getPath().getParent(), "." + file.getPath().getName() + ".splits");
        try {
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("No split file for %s; building split file", file.getPath()));
            }
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }

        for (org.apache.hadoop.mapreduce.lib.input.FileSplit split : splitter.getAllSplits()) {
            FileSplit fsplit = new FileSplit(split.getPath(), split.getStart(), split.getLength(),
                    split.getLocations());
            results.add(fsplit);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(format("Total of %d found.", results.size()));
    }
    return results.toArray(new FileSplit[results.size()]);
}

From source file:com.moz.fiji.express.flow.framework.MapredInputFormatWrapper.java

License:Apache License

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    initInputFormat(job);

    try {
        List<org.apache.hadoop.mapreduce.InputSplit> splits = realInputFormat
                .getSplits(HadoopCompat.newJobContext(job, null));

        if (splits == null) {
            return null;
        }

        InputSplit[] resultSplits = new InputSplit[splits.size()];
        int i = 0;
        for (org.apache.hadoop.mapreduce.InputSplit split : splits) {
            if (split.getClass() == org.apache.hadoop.mapreduce.lib.input.FileSplit.class) {
                org.apache.hadoop.mapreduce.lib.input.FileSplit mapreduceFileSplit = ((org.apache.hadoop.mapreduce.lib.input.FileSplit) split);
                resultSplits[i++] = new FileSplit(mapreduceFileSplit.getPath(), mapreduceFileSplit.getStart(),
                        mapreduceFileSplit.getLength(), mapreduceFileSplit.getLocations());
            } else {
                resultSplits[i++] = new InputSplitWrapper(split);
            }
        }

        return resultSplits;

    } catch (InterruptedException e) {
        throw new IOException(e);
    }
}