List of usage examples for org.apache.hadoop.mapreduce.lib.input FileSplit getLocations
@Override public String[] getLocations() throws IOException
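getLocations() returns the hostnames of the nodes holding the split's data; schedulers use it to place map tasks near their input. The examples below all follow the same pattern: when an InputFormat re-slices or combines splits, it copies the original FileSplit's locations into the new split so the locality hints are not lost. A minimal sketch of that pattern (the class, helper name, and start/length values are illustrative, not taken from any of the source files below):

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SplitReslicer {
    // Illustrative helper: build a new split over [newStart, newStart + newLength)
    // of the same file, reusing the original split's host hints.
    static FileSplit reslice(FileSplit original, long newStart, long newLength) throws IOException {
        Path file = original.getPath();
        String[] hosts = original.getLocations(); // nodes holding the underlying blocks
        return new FileSplit(file, newStart, newLength, hosts);
    }
}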
From source file:bsc.spark.examples.terasort.ehiggs.TeraScheduler.java
License:Apache License
public TeraScheduler(FileSplit[] realSplits, Configuration conf) throws IOException {
    this.realSplits = realSplits;
    this.slotsPerHost = conf.getInt(TTConfig.TT_MAP_SLOTS, 4);
    Map<String, Host> hostTable = new HashMap<String, Host>();
    splits = new Split[realSplits.length];
    for (FileSplit realSplit : realSplits) {
        Split split = new Split(realSplit.getPath().toString());
        splits[remainingSplits++] = split;
        for (String hostname : realSplit.getLocations()) {
            Host host = hostTable.get(hostname);
            if (host == null) {
                host = new Host(hostname);
                hostTable.put(hostname, host);
                hosts.add(host);
            }
            host.splits.add(split);
            split.locations.add(host);
        }
    }
}
From source file:com.hadoop.mapreduce.FourMcInputFormat.java
License:BSD License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = HadoopUtils.getConfiguration(job);
    List<InputSplit> defaultSplits = super.getSplits(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    Path prevFile = null;
    FourMcBlockIndex prevIndex = null;

    for (InputSplit genericSplit : defaultSplits) {
        // Load the index.
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);

        FourMcBlockIndex index;
        if (file.equals(prevFile)) {
            index = prevIndex;
        } else {
            index = FourMcBlockIndex.readIndex(fs, file);
            prevFile = file;
            prevIndex = index;
        }

        if (index == null) {
            throw new IOException("BlockIndex unreadable for " + file);
        }

        if (index.isEmpty()) { // leave the default split for empty block index
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long fourMcStart = index.alignSliceStartToIndex(start, end);
        long fourMcEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (fourMcStart != FourMcBlockIndex.NOT_FOUND && fourMcEnd != FourMcBlockIndex.NOT_FOUND) {
            result.add(new FileSplit(file, fourMcStart, fourMcEnd - fourMcStart, fileSplit.getLocations()));
            LOG.debug("Added 4mc split for " + file + "[start=" + fourMcStart + ", length="
                    + (fourMcEnd - fourMcStart) + "]");
        }
    }
    return result;
}
From source file:com.hadoop.mapreduce.FourMzInputFormat.java
License:BSD License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = HadoopUtils.getConfiguration(job);
    List<InputSplit> defaultSplits = super.getSplits(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    Path prevFile = null;
    FourMzBlockIndex prevIndex = null;

    for (InputSplit genericSplit : defaultSplits) {
        // Load the index.
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);

        FourMzBlockIndex index;
        if (file.equals(prevFile)) {
            index = prevIndex;
        } else {
            index = FourMzBlockIndex.readIndex(fs, file);
            prevFile = file;
            prevIndex = index;
        }

        if (index == null) {
            throw new IOException("BlockIndex unreadable for " + file);
        }

        if (index.isEmpty()) { // leave the default split for empty block index
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long fourMcStart = index.alignSliceStartToIndex(start, end);
        long fourMcEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (fourMcStart != FourMzBlockIndex.NOT_FOUND && fourMcEnd != FourMzBlockIndex.NOT_FOUND) {
            result.add(new FileSplit(file, fourMcStart, fourMcEnd - fourMcStart, fileSplit.getLocations()));
            LOG.debug("Added 4mz split for " + file + "[start=" + fourMcStart + ", length="
                    + (fourMcEnd - fourMcStart) + "]");
        }
    }
    return result;
}
From source file:com.hadoop.mapreduce.LzoTextInputFormat.java
License:Open Source License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    // find new start/ends of the filesplit that aligns
    // with the lzo blocks
    List<InputSplit> result = new ArrayList<InputSplit>();
    FileSystem fs = FileSystem.get(job.getConfiguration());

    for (InputSplit genericSplit : splits) {
        // load the index
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        LzoIndex index = indexes.get(file);
        if (index == null) {
            throw new IOException("Index not found for " + file);
        }

        if (index.isEmpty()) {
            // empty index, keep as is
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        if (start != 0) {
            // find the next block position from
            // the start of the split
            long newStart = index.findNextPosition(start);
            if (newStart == -1 || newStart >= end) {
                // just skip this since it will be handled by another split
                continue;
            }
            start = newStart;
        }

        long newEnd = index.findNextPosition(end);
        if (newEnd != -1) {
            end = newEnd;
        } else {
            // didn't find the next position
            // we have hit the end of the file
            end = fs.getFileStatus(file).getLen();
        }

        result.add(new FileSplit(file, start, end - start, fileSplit.getLocations()));
    }
    return result;
}
From source file:com.marklogic.contentpump.CombineDocumentSplit.java
License:Apache License
public void addSplit(FileSplit split) throws IOException, InterruptedException {
    splits.add(split);
    length += split.getLength();
    for (String loc : split.getLocations()) {
        if (!locations.contains(loc)) {
            locations.add(loc);
        }
    }
}
From source file:com.marklogic.contentpump.DelimitedTextInputFormat.java
License:Apache License
public List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean delimSplit = isSplitInput(job.getConfiguration());
    // if delimSplit is true, size of each split is determined by
    // Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat
    List<InputSplit> splits = super.getSplits(job);
    if (!delimSplit) {
        return splits;
    }

    if (splits.size() >= SPLIT_COUNT_LIMIT) {
        // if #splits > 1 million, there is enough parallelism,
        // therefore no point in splitting further
        LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off:" + SPLIT_COUNT_LIMIT);
        DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT);
        return splits;
    }

    // add header info into splits
    List<InputSplit> populatedSplits = new ArrayList<InputSplit>();
    LOG.info(splits.size() + " DelimitedSplits generated");
    Configuration conf = job.getConfiguration();
    char delimiter = 0;
    ArrayList<Text> hlist = new ArrayList<Text>();

    for (InputSplit file : splits) {
        FileSplit fsplit = ((FileSplit) file);
        Path path = fsplit.getPath();
        FileSystem fs = path.getFileSystem(conf);

        if (fsplit.getStart() == 0) {
            // parse the split, get the header
            FSDataInputStream fileIn = fs.open(path);

            String delimStr = conf.get(ConfigConstants.CONF_DELIMITER, ConfigConstants.DEFAULT_DELIMITER);
            if (delimStr.length() == 1) {
                delimiter = delimStr.charAt(0);
            } else {
                LOG.error("Incorrect delimiter: " + delimStr + ". Expects single character.");
            }
            String encoding = conf.get(MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                    MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);
            InputStreamReader instream = new InputStreamReader(fileIn, encoding);

            CSVParser parser = new CSVParser(instream,
                    CSVParserFormatter.getFormat(delimiter, DelimitedTextReader.encapsulator, true, true));
            Iterator<CSVRecord> it = parser.iterator();

            String[] header = null;
            if (it.hasNext()) {
                CSVRecord record = (CSVRecord) it.next();
                Iterator<String> recordIterator = record.iterator();
                int recordSize = record.size();
                header = new String[recordSize];
                for (int i = 0; i < recordSize; i++) {
                    if (recordIterator.hasNext()) {
                        header[i] = (String) recordIterator.next();
                    } else {
                        throw new IOException("Record size doesn't match the real size");
                    }
                }

                EncodingUtil.handleBOMUTF8(header, 0);

                hlist.clear();
                for (String s : header) {
                    hlist.add(new Text(s));
                }
            }
            instream.close();
        }

        DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(hlist.toArray(new Text[hlist.size()])),
                path, fsplit.getStart(), fsplit.getLength(), fsplit.getLocations());
        populatedSplits.add(ds);
    }

    return populatedSplits;
}
From source file:com.scaleoutsoftware.soss.hserver.FileImage.java
License:Apache License
/**
 * Calculates additional splits in case files were modified since the image was last recorded. It relies
 * on the fact that HDFS only allows appending to files. The
 * {@link InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)} method is called to get the split
 * list, and then already recorded splits are removed from that list. In case of a partially recorded split,
 * the split is truncated so that it contains only the appended part.
 *
 * @param context job context
 * @param format  input format
 */
@SuppressWarnings("unchecked")
void addNewSplits(JobContext context, InputFormat format) throws IOException, InterruptedException {
    List<InputSplit> newSplits = format.getSplits(context);

    for (InputSplit inputSplit : newSplits) {
        FileSplit split = (FileSplit) inputSplit;
        String path = split.getPath().toString();

        if (!filesPaths.contains(path))
            throw new IOException("No such file in the recorded image.");

        long currentFileLength = lengths.get(path);
        long begin = split.getStart();

        if (begin >= currentFileLength) { // New split is entirely in the appended part of the file
            splits.add(new ImageInputSplit(inputSplit, getImageIdString(), creationTimestamp, splits.size()));
            continue;
        }

        long end = begin + split.getLength();

        if (end <= currentFileLength)
            continue; // New split is entirely in the recorded area, we don't need it

        FileSplit additionalSplit = new FileSplit(split.getPath(), currentFileLength,
                end - currentFileLength, split.getLocations());
        splits.add(new ImageInputSplit(additionalSplit, getImageIdString(), creationTimestamp, splits.size()));
    }

    List<FileStatus> files = getFiles(context, (FileInputFormat) format);
    for (FileStatus file : files) {
        String filePath = file.getPath().toString();
        modificationDate.put(filePath, file.getModificationTime());
        lengths.put(filePath, file.getLen());
    }
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * Go through each original input split, get its file path, and check the index file:
 * a) keep it when there is no index prebuilt on this file
 *    (or the index file doesn't match the base file's checksum);
 * b) remove it when no matching value is found in the existing index file;
 * c) construct new, smaller input splits using indexed blocks found in the index file.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    String inputformat = job.getConfiguration().get(REALINPUTFORMAT);
    String valueClass = job.getConfiguration().get(VALUECLASS);

    List<InputSplit> filteredList = new ArrayList<InputSplit>();

    FileInputFormat<K, V> realInputFormat = getInputFormatClass(inputformat, valueClass);

    List<InputSplit> splits = realInputFormat.getSplits(job);

    // if indexing jobs, don't skip any input splits.
    // if searching job but no searching filter, skip the index as well.
    if (isIndexingJob(job) || getFilterCondition(job) == null)
        return splits;

    Path prevFile = null; // remember the last input file we saw
    boolean foundIndexedFile = false; // is there an index file for prevFile?
    boolean firstTime = true; // is this the first time we see this file?

    long totalOriginalBytes = 0; // the bytes to be scanned without indexes.
    totalBytesNewSplits = 0;
    long startTime = System.currentTimeMillis();
    LOG.info("start filtering out original input splits (total " + splits.size() + ") using indexes");
    Configuration conf = job.getConfiguration();
    long splitMaxSize;

    // for each original input split check if we can filter it out.
    for (InputSplit split : splits) {
        FileSplit fileSplit = (FileSplit) split;
        Path path = fileSplit.getPath();
        splitLength = fileSplit.getLength();
        totalOriginalBytes += fileSplit.getLength();
        splitMaxSize = Math.max(splitLength,
                conf.getInt(INDEXED_SPLIT_SIZE, conf.getInt("dfs.block.size", 256 * 1024 * 1024)));

        /*
         * for each new file we see, we first check if it has been indexed or not;
         * if not, we just add the original input split; if yes, we use the index
         * file to add filtered splits for the file
         */
        if (prevFile != null && path.equals(prevFile)) {
            firstTime = false;
        } else {
            prevFile = path;
            firstTime = true;
            foundIndexedFile = foundIndexFile(job, path);
        }

        // if no index file, we'll have to read all original input splits
        if (!foundIndexedFile)
            filteredList.add(fileSplit);
        else {
            // for each file we add its filtered input splits only once, using the index file
            if (firstTime) {
                // LOG.info("first time saw " + path
                //     + ", adding filtered splits from index file");
                filteredList.addAll(getFilteredSplits(job, path, fileSplit.getLocations(), splitMaxSize));
            }
        }
    }

    long endTime = System.currentTimeMillis();
    LOG.info("finished filtering out input splits, now total splits:" + filteredList.size()
            + ", seconds used: " + (endTime - startTime) / 1000);
    LOG.info(String.format("total bytes to read before filtering: %s, after filtering %s, bytes ratio: %s",
            totalOriginalBytes, totalBytesNewSplits, totalOriginalBytes / Math.max(1, totalBytesNewSplits)));
    return filteredList;
}
From source file:com.twitter.elephanttwin.retrieval.OneSplitInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = job.getConfiguration();
    FileSplit split = (FileSplit) super.getSplits(job).get(0);
    List<InputSplit> lists = new ArrayList<InputSplit>();
    lists.add(new FileSplit(split.getPath(), conf.getLong(START, 0),
            conf.getLong(END, 0) - conf.getLong(START, 0), split.getLocations()));
    return lists;
}
From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License:Open Source License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    Configuration jobConf = job.getConfiguration();
    if (jobConf.getInt(CombineSplits, 1) > 1) {
        long t1 = System.currentTimeMillis();
        int combine = jobConf.getInt(CombineSplits, 1);
        /*
         * Combine splits to reduce number of map tasks. Currently, this is done
         * using a greedy algorithm that combines splits based on how many hosts
         * they share.
         * TODO: Use a graph clustering algorithm where each vertex represents a
         * split, and each edge is weighted with the number of shared hosts between
         * the two splits.
         */
        Vector<Vector<FileSplit>> openSplits = new Vector<Vector<FileSplit>>();
        int maxNumberOfSplits = (int) Math.ceil((float) splits.size() / combine);
        List<InputSplit> combinedSplits = new Vector<InputSplit>();
        for (InputSplit split : splits) {
            FileSplit fsplit = (FileSplit) split;
            int maxSimilarity = -1; // Best similarity found so far
            int bestFit = -1; // Index of a random open split with max similarity
            int numMatches = 0; // Number of splits with max similarity
            for (int i = 0; i < openSplits.size(); i++) {
                Vector<FileSplit> splitList = openSplits.elementAt(i);
                int similarity = 0;
                for (FileSplit otherSplit : splitList) {
                    for (String host1 : fsplit.getLocations())
                        for (String host2 : otherSplit.getLocations())
                            if (host1.equals(host2))
                                similarity++;
                }
                if (similarity > maxSimilarity) {
                    maxSimilarity = similarity;
                    bestFit = i;
                    numMatches = 1;
                } else if (similarity == maxSimilarity) {
                    numMatches++;
                    // Replace with probability 1/numMatches for a reservoir sample
                    double random = Math.random();
                    if (random < (double) 1 / numMatches) {
                        // Replace the element in the reservoir
                        bestFit = i;
                    }
                }
            }
            if (maxSimilarity > 0 || (openSplits.size() + combinedSplits.size()) >= maxNumberOfSplits) {
                // Good fit || cannot create more open splits,
                // add it to an existing open split.
                Vector<FileSplit> bestList = openSplits.elementAt(bestFit);
                bestList.add(fsplit);
                if (bestList.size() > combine) {
                    // Reached threshold for this list. Add it to combined splits
                    combinedSplits.add(FileSplitUtil.combineFileSplits(bestList, 0, bestList.size()));
                    // Remove it from open splits
                    openSplits.remove(bestFit);
                }
            } else {
                // Bad fit && can add a new split
                // Create a new open split just for this one
                Vector<FileSplit> newOpenSplit = new Vector<FileSplit>();
                newOpenSplit.add(fsplit);
                openSplits.addElement(newOpenSplit);
            }
        }

        // Add all remaining open splits to the list of combined splits
        for (Vector<FileSplit> openSplit : openSplits) {
            combinedSplits.add(FileSplitUtil.combineFileSplits(openSplit, 0, openSplit.size()));
        }

        String msg = String.format("Combined %d splits into %d combined splits", splits.size(),
                combinedSplits.size());
        splits.clear();
        splits.addAll(combinedSplits);
        long t2 = System.currentTimeMillis();
        LOG.info(msg + " in " + ((t2 - t1) / 1000.0) + " seconds");
    }
    return splits;
}