Example usage for org.apache.hadoop.mapreduce InputSplit getLength

List of usage examples for org.apache.hadoop.mapreduce InputSplit getLength

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce InputSplit getLength.

Prototype

public abstract long getLength() throws IOException, InterruptedException;

Document

Get the size of the split, so that the input splits can be sorted by size.
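
Before the usage examples below, here is a minimal, hypothetical sketch of that documented purpose: ordering a list of splits by size, largest first. The SplitSorter class and its helper are illustrative names, not taken from any of the sources on this page.

import java.io.IOException;
import java.util.Comparator;
import java.util.List;

import org.apache.hadoop.mapreduce.InputSplit;

public class SplitSorter {
    /** Sorts splits in place, largest first, so big splits can be scheduled early. */
    public static void sortBySizeDescending(List<InputSplit> splits) {
        splits.sort(Comparator.comparingLong(SplitSorter::lengthOf).reversed());
    }

    // getLength() declares checked exceptions, so wrap them for use in a comparator.
    private static long lengthOf(InputSplit split) {
        try {
            return split.getLength();
        } catch (IOException e) {
            throw new RuntimeException("Unable to read split length", e);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException("Interrupted while reading split length", e);
        }
    }
}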

Usage

From source file: edu.umn.cs.spatialHadoop.mapreduce.SpatialRecordReader3.java

License: Open Source License

public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    FileSplit fsplit = (FileSplit) split;
    if (compressionCodecFactory == null)
        compressionCodecFactory = new CompressionCodecFactory(conf);

    LOG.info("Open a SpatialRecordReader to split: " + split);
    this.path = fsplit.getPath();
    this.start = fsplit.getStart();
    this.end = this.start + split.getLength();
    this.fs = this.path.getFileSystem(conf);
    this.directIn = fs.open(this.path);
    codec = compressionCodecFactory.getCodec(this.path);

    if (codec != null) {
        // Input is compressed, create a decompressor to decompress it
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            // A splittable compression codec, can seek to the desired input pos
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    directIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = cIn;
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            // take the position from the compressed stream, since we adjusted both
            // start and end to match the compressed file
            progressPosition = cIn;
        } else {
            // Non-splittable input, need to start from the beginning
            CompressionInputStream cIn = codec.createInputStream(directIn, decompressor);
            in = cIn;
            progressPosition = cIn;
        }
    } else {
        // Non-compressed file, seek to the desired position and use this stream
        // to get the progress and position
        directIn.seek(start);
        in = directIn;
        progressPosition = directIn;
    }
    this.stockShape = (V) OperationsParams.getShape(conf, "shape");
    this.tempLine = new Text();

    this.lineReader = new LineReader(in);
    bytesRead = 0;

    if (this.start != 0) {
        // Skip until first end-of-line reached
        bytesRead += lineReader.readLine(tempLine);
    }
    if (conf.get(SpatialInputFormat3.InputQueryRange) != null) {
        // Retrieve the input query range to apply on all records
        this.inputQueryRange = OperationsParams.getShape(conf, SpatialInputFormat3.InputQueryRange);
        this.inputQueryMBR = this.inputQueryRange.getMBR();
    }

    // Check if there is an associated global index to read cell boundaries
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, path.getParent());
    if (gindex == null) {
        cellMBR = new Partition();
        cellMBR.filename = path.getName();
        cellMBR.invalidate();
    } else {
        // Set from the associated partition in the global index
        for (Partition p : gindex) {
            if (p.filename.equals(this.path.getName()))
                cellMBR = p;
        }
    }

    this.value = new ShapeIterator<V>();
    value.setShape(stockShape);
}

From source file: edu.umn.cs.spatialHadoop.OperationsParams.java

License: Open Source License

/**
 * Checks whether the operation should work in local or MapReduce mode. If
 * the job explicitly specifies whether to run in local or MapReduce mode,
 * the specified option is returned. Otherwise, it automatically detects
 * whether to use local or MapReduce based on the input size.
 *
 * @return <code>true</code> to run in local mode, <code>false</code> to run
 *         in MapReduce mode.
 * @throws IOException If the underlying job fails with an IOException 
 * @throws InterruptedException If the underlying job was interrupted
 */
public static boolean isLocal(Configuration jobConf, Path... input) throws IOException, InterruptedException {
    final boolean LocalProcessing = true;
    final boolean MapReduceProcessing = false;

    // Whatever is explicitly set has the highest priority
    if (jobConf.get("local") != null)
        return jobConf.getBoolean("local", false);

    // If any of the input files are hidden, use local processing
    for (Path inputFile : input) {
        if (!SpatialSite.NonHiddenFileFilter.accept(inputFile))
            return LocalProcessing;
    }

    if (input.length > MaxSplitsForLocalProcessing) {
        LOG.info("Too many files. Using MapReduce");
        return MapReduceProcessing;
    }

    Job job = new Job(jobConf); // To ensure we don't change the original
    SpatialInputFormat3.setInputPaths(job, input);
    SpatialInputFormat3<Partition, Shape> inputFormat = new SpatialInputFormat3<Partition, Shape>();

    try {
        List<InputSplit> splits = inputFormat.getSplits(job);
        if (splits.size() > MaxSplitsForLocalProcessing)
            return MapReduceProcessing;

        long totalSize = 0;
        for (InputSplit split : splits)
            totalSize += split.getLength();
        if (totalSize > MaxSizeForLocalProcessing) {
            LOG.info("Input size is too large. Using MapReduce");
            return MapReduceProcessing;
        }
        LOG.info("Input size is small enough to use local machine");
        return LocalProcessing;
    } catch (IOException e) {
        LOG.warn("Cannot get splits for input");
        return MapReduceProcessing;
    }
}
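
A hedged usage sketch for the isLocal method above; the configuration and input path here are placeholders, not taken from the original source.

// Hypothetical caller: choose between local and MapReduce execution.
Configuration conf = new Configuration();
Path input = new Path("/data/input"); // placeholder path
if (OperationsParams.isLocal(conf, input)) {
    // input is small enough: process it on the local machine
} else {
    // otherwise submit a MapReduce job
}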

From source file: eu.scape_project.pt.mets.hadoop.MetsInputFormat.java

License: Apache License

@Override
public RecordReader<Text, DTO> createRecordReader(InputSplit split, TaskAttemptContext context) {

    try {
        LOG.debug("split.length = " + split.getLength());
        LOG.debug("split.string = " + split.toString());
    } catch (IOException ex) {
        LOG.error(ex.getMessage());
    } catch (InterruptedException ex) {
        LOG.error(ex.getMessage());
    }
    String tag = context.getConfiguration().get(MetsInputFormat.TAG);
    return new MetsRecordReader(tag);
}

From source file: ml.shifu.guagua.mapreduce.GuaguaInputFormat.java

License: Apache License

public static List<List<InputSplit>> getCombineGuaguaSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize) throws IOException, InterruptedException {
    List<Node> nodes = new ArrayList<Node>();
    Map<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    List<Long> resultLengths = new ArrayList<Long>();
    long comparableSplitId = 0;

    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;
    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            comparableSplitId++;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }

    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split, since an empty directory
        // is currently not handled properly elsewhere
        List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for the next split of a suitable size to combine
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            List<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            List<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        List<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        Set<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // Remove duplicates. The set has to hold the raw input splits, not the
                    // comparable splits: the latter override compareTo, so their equality
                    // semantics differ from what we want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }

        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            List<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();

            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small; see if it can be squeezed into any existing split
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // the last piece cannot be squeezed in; create a new combined split for it
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }
    LOG.info("Total input paths (combined) to process : {}", result.size());
    return result;
}
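
A hedged sketch of how the combiner above might be invoked. It assumes rawSplits was already obtained from an InputFormat; the 256 MB limit is illustrative.

// Hypothetical usage: pack raw splits into groups of at most 256 MB each.
long maxCombinedSplitSize = 256L * 1024 * 1024; // illustrative limit
List<List<InputSplit>> combined =
        GuaguaInputFormat.getCombineGuaguaSplits(rawSplits, maxCombinedSplitSize);
// Each inner list is intended to be processed as one combined unit of work.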

From source file: ml.shifu.guagua.yarn.GuaguaSplitWriter.java

License: Apache License

private static SplitMetaInfo[] writeOldSplits(org.apache.hadoop.mapred.InputSplit[] splits,
        FSDataOutputStream out, Configuration conf) throws IOException {
    SplitMetaInfo[] info = new SplitMetaInfo[splits.length];
    if (splits.length != 0) {
        int i = 0;
        long offset = out.getPos();
        for (org.apache.hadoop.mapred.InputSplit split : splits) {
            long prevLen = out.getPos();
            Text.writeString(out, split.getClass().getName());
            split.write(out);
            long currLen = out.getPos();
            String[] locations = split.getLocations();
            final int max_loc = conf.getInt(MAX_SPLIT_LOCATIONS, 10);
            if (locations.length > max_loc) {
                LOG.warn("Max block location exceeded for split: " + split + " splitsize: " + locations.length
                        + " maxsize: " + max_loc);
                locations = Arrays.copyOf(locations, max_loc);
            }
            info[i++] = new JobSplit.SplitMetaInfo(locations, offset, split.getLength());
            offset += currLen - prevLen;
        }
    }
    return info;
}

From source file: ml.shifu.guagua.yarn.util.InputSplitUtils.java

License: Apache License

public static List<List<InputSplit>> getCombineGuaguaSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize) throws IOException, InterruptedException {
    ArrayList<Node> nodes = new ArrayList<Node>();
    HashMap<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    List<Long> resultLengths = new ArrayList<Long>();
    long comparableSplitId = 0;

    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;
    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            comparableSplitId++;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }

    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split, since an empty directory
        // is currently not handled properly elsewhere
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for the next split of a suitable size to combine
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            ArrayList<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        HashSet<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // Remove duplicates. The set has to hold the raw input splits, not the
                    // comparable splits: the latter override compareTo, so their equality
                    // semantics differ from what we want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }

        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();

            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small; see if it can be squeezed into any existing split
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // the last piece cannot be squeezed in; create a new combined split for it
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }
    LOG.info("Total input paths (combined) to process : {}", result.size());
    return result;
}

From source file: ml.shifu.shifu.core.mr.input.CombineInputFormat.java

License: Apache License

public static List<List<InputSplit>> getCombineVarSelectSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize) throws IOException, InterruptedException {
    List<Node> nodes = new ArrayList<Node>();
    Map<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    List<Long> resultLengths = new ArrayList<Long>();
    long comparableSplitId = 0;

    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;
    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            comparableSplitId++;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }

    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split, since an empty directory
        // is currently not handled properly elsewhere
        List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for the next split of a suitable size to combine
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            List<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            List<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        List<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        Set<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // Remove duplicates. The set has to hold the raw input splits, not the
                    // comparable splits: the latter override compareTo, so their equality
                    // semantics differ from what we want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }

        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            List<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();

            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small; see if it can be squeezed into any existing split
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // the last piece cannot be squeezed in; create a new combined split for it
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }
    LOG.info("Total input paths (combined) to process : {}", result.size());
    return result;
}

From source file: nl.surfsara.warcutils.WarcSequenceFileRecordReader.java

License: Apache License

@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) inputSplit;
    Configuration conf = context.getConfiguration();
    final Path path = split.getPath();

    Option optPath = SequenceFile.Reader.file(path);
    in = new SequenceFile.Reader(conf, optPath);

    this.end = split.getStart() + inputSplit.getLength();
    if (split.getStart() > in.getPosition()) {
        in.sync(split.getStart());
    }
    start = in.getPosition();
    done = start >= end;
}

From source file: org.apache.carbondata.hadoop.CarbonMultiBlockSplit.java

License: Apache License

@Override
public long getLength() throws IOException, InterruptedException {
    long total = 0;
    for (InputSplit split : splitList) {
        total += split.getLength();
    }
    return total;
}

From source file: org.apache.hawq.pxf.plugins.hdfs.ParquetDataFragmenter.java

License: Apache License

private List<InputSplit> getSplits(Path path) throws IOException {
    ParquetInputFormat<Group> parquetInputFormat = new ParquetInputFormat<Group>();
    ParquetInputFormat.setInputPaths(job, path);
    List<InputSplit> splits = parquetInputFormat.getSplits(job);
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();

    if (splits != null) {
        for (InputSplit split : splits) {
            try {
                if (split.getLength() > 0) {
                    result.add(split);
                }
            } catch (InterruptedException e) {
                throw new RuntimeException("Unable to read split's length", e);
            }
        }
    }

    return result;
}