Example usage for org.apache.hadoop.mapreduce InputSplit getLocations

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.InputSplit.getLocations().

Prototype

public abstract String[] getLocations() throws IOException, InterruptedException;

Document

Get the list of nodes by name where the data for the split would be local.
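
A minimal sketch of calling this method (assuming a TextInputFormat job whose input path comes from the first program argument; the class name SplitLocations is hypothetical):

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitLocations {
    public static void main(String[] args) throws IOException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.addInputPath(job, new Path(args[0])); // hypothetical input path
        List<InputSplit> splits = new TextInputFormat().getSplits(job);
        for (InputSplit split : splits) {
            // getLocations() names the hosts where this split's data is local;
            // the array may be empty and the call may throw InterruptedException
            for (String host : split.getLocations()) {
                System.out.println(split + " -> " + host);
            }
        }
    }
}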

Usage

From source file: org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil.java

License: Apache License
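
This Pig utility packs splits smaller than maxCombinedSplitSize into combined splits. It calls getLocations() on each small split to index the splits per host (sorting the locations first to stabilize the number of maps, per PIG-1757), then bin-packs the splits on each node and finally sweeps up the leftovers.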

public static List<List<InputSplit>> getCombinePigSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize, Configuration conf) throws IOException, InterruptedException {
    ArrayList<Node> nodes = new ArrayList<Node>();
    HashMap<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    List<Long> resultLengths = new ArrayList<Long>();
    long comparableSplitId = 0;

    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;
    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            comparableSplitId++;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }
    /* verification code: debug purpose
    {
      ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
      HashSet<InputSplit> seen = new HashSet<InputSplit>();
      for (Node node : nodes) {
    if (node.getLength() > 0)
    {
      ArrayList<ComparableSplit> splits = node.getSplits();
      for (ComparableSplit split : splits) {
        if (!seen.contains(split.getSplit())) {
          // remove duplicates. The set has to be on the raw input split not the
          // comparable input split as the latter overrides the compareTo method
          // so its equality semantics are changed, which is not what we want here
          seen.add(split.getSplit());
          leftoverSplits.add(split);
        }
      }
    }
      }
            
      int combinedSplitLen = 0;
      for (PigSplit split : result)
    combinedSplitLen += split.getNumPaths();
      if (combinedSplitLen + leftoverSplits.size()!= nSplits-emptyCnt) {
    throw new AssertionError("number of combined splits {"+combinedSplitLen+"+"+leftoverSplits.size()+"-"+size+"} does not match the number of original splits ["+nSplits+"].");
      }
    }
    */
    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split as currently an empty directory is
        // not properly handled somewhere
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for next split of suitable size to be combined
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            ArrayList<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        HashSet<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // remove duplicates. The set has to be on the raw input split not the
                    // comparable input split as the latter overrides the compareTo method
                    // so its equality semantics are changed, which is not what we want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }

        /* verification code
        int combinedSplitLen = 0;
        for (PigSplit split : result)
          combinedSplitLen += split.getNumPaths();
        if (combinedSplitLen + leftoverSplits.size()!= nSplits-emptyCnt)
          throw new AssertionError("number of combined splits ["+combinedSplitLen+"+"+leftoverSplits.size()+"] does not match the number of original splits ["+nSplits+"].");
        */
        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();

            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small; try to see if it can be squeezed into any existing split
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // the last pieces cannot be squeezed in; create a new combined split for them.
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }
    /* verification codes
    int combinedSplitLen = 0;
    for (PigSplit split : result)
      combinedSplitLen += split.getNumPaths();
    if (combinedSplitLen != nSplits-emptyCnt)
      throw new AssertionError("number of combined splits ["+combinedSplitLen+"] does not match the number of original splits ["+nSplits+"].");
            
    long totalLen = 0;
    for (PigSplit split : result)
      totalLen += split.getLength();
            
    long origTotalLen = 0;
    for (InputSplit split : oneInputSplits)
      origTotalLen += split.getLength();
    if (totalLen != origTotalLen)
      throw new AssertionError("The total length ["+totalLen+"] does not match the original ["+origTotalLen+"]");
    */
    log.info("Total input paths (combined) to process : " + result.size());
    return result;
}

From source file: org.deeplearning4j.iterativereduce.impl.reader.CanovaInputSplit.java

License: Apache License
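
This adapter wraps a Hadoop InputSplit and converts each location returned by getLocations() into a URI, rethrowing any failure as a RuntimeException.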

public CanovaInputSplit(InputSplit split) {
    this.split = split;
    try {
        String[] locations = split.getLocations();
        uris = new URI[locations.length];
        for (int i = 0; i < locations.length; i++) {
            uris[i] = URI.create(locations[i]);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

}

From source file: org.gridgain.grid.kernal.processors.hadoop.v2.GridHadoopV2Splitter.java

License: Open Source License
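
This splitter asks the job's InputFormat for its splits and wraps each one as a GridGain split, passing the locations from getLocations() along; note that it restores the interrupt flag before rethrowing when the call is interrupted.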

/**
 * @param ctx Job context.
 * @return Collection of mapped splits.
 * @throws GridException If mapping failed.
 */
public static Collection<GridHadoopInputSplit> splitJob(JobContext ctx) throws GridException {
    try {
        InputFormat<?, ?> format = ReflectionUtils.newInstance(ctx.getInputFormatClass(),
                ctx.getConfiguration());

        assert format != null;

        List<InputSplit> splits = format.getSplits(ctx);

        Collection<GridHadoopInputSplit> res = new ArrayList<>(splits.size());

        int id = 0;

        for (InputSplit nativeSplit : splits) {
            if (nativeSplit instanceof FileSplit) {
                FileSplit s = (FileSplit) nativeSplit;

                res.add(new GridHadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(),
                        s.getLength()));
            } else
                res.add(GridHadoopUtils.wrapSplit(id, nativeSplit, nativeSplit.getLocations()));

            id++;
        }

        return res;
    } catch (IOException | ClassNotFoundException e) {
        throw new GridException(e);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();

        throw new GridInterruptedException(e);
    }
}

From source file: org.janusgraph.hadoop.formats.cassandra.CqlBridgeRecordReader.java

License: Apache License
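
This record reader uses the locations of its ColumnFamilySplit as the contact points for building a Cassandra Cluster instance.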

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    this.split = (ColumnFamilySplit) split;
    Configuration conf = HadoopCompat.getConfiguration(context);
    totalRowCount = (this.split.getLength() < Long.MAX_VALUE) ? (int) this.split.getLength()
            : ConfigHelper.getInputSplitSize(conf);
    cfName = ConfigHelper.getInputColumnFamily(conf);
    keyspace = ConfigHelper.getInputKeyspace(conf);
    partitioner = ConfigHelper.getInputPartitioner(conf);
    inputColumns = CqlConfigHelper.getInputcolumns(conf);
    userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);

    try {
        if (cluster != null) {
            return;
        }
        // create a Cluster instance
        String[] locations = split.getLocations();
        //            cluster = CqlConfigHelper.getInputCluster(locations, conf);
        // disregard the conf as it brings some unforeseen issues.
        cluster = Cluster.builder().addContactPoints(locations).build();
    } catch (Exception e) {
        throw new RuntimeException(
                "Unable to create cluster for table: " + cfName + ", in keyspace: " + keyspace, e);
    }
    // cluster should now represent a valid cluster
    session = cluster.connect(quote(keyspace));
    Preconditions.checkNotNull(session, "Can't create connection session");
    //get negotiated serialization protocol
    nativeProtocolVersion = cluster.getConfiguration().getProtocolOptions().getProtocolVersion().toInt();

    // If the user provides a CQL query then we will use it without validation
    // otherwise we will fall back to building a query using the:
    //   inputColumns
    //   whereClauses
    cqlQuery = CqlConfigHelper.getInputCql(conf);
    // validate that the user hasn't tried to give us a custom query along with input columns
    // and where clauses
    if (StringUtils.isNotEmpty(cqlQuery)
            && (StringUtils.isNotEmpty(inputColumns) || StringUtils.isNotEmpty(userDefinedWhereClauses))) {
        throw new AssertionError("Cannot define a custom query with input columns and / or where clauses");
    }

    if (StringUtils.isEmpty(cqlQuery)) {
        cqlQuery = buildQuery();
    }
    log.trace("cqlQuery {}", cqlQuery);
    distinctKeyIterator = new DistinctKeyIterator();
    log.trace("created {}", distinctKeyIterator);
}

From source file: org.mrgeo.data.accumulo.image.AccumuloMrsImagePyramidInputFormat.java

License: Apache License
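
This record reader logs the locations of the incoming split and, when given a TiledInputSplit, rebuilds an Accumulo RangeInputSplit from the wrapped split's key range and locations before delegating to the base initializer.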

public static RecordReader<TileIdWritable, RasterWritable> makeRecordReader() {
    return new RecordReaderBase<TileIdWritable, RasterWritable>() {

        @Override
        public void initialize(InputSplit inSplit, TaskAttemptContext attempt) throws IOException {

            //        RangeInputSplit ris = (RangeInputSplit) ((TiledInputSplit)inSplit).getWrappedSplit();
            //
            //        log.info("initializing with instance of " + ris.getInstanceName());
            //        log.info("initializing with auths of " + ris.getAuths().toString());
            //        
            //        super.initialize(((TiledInputSplit)inSplit).getWrappedSplit(), attempt);

            log.info("initializing input splits of type " + inSplit.getClass().getCanonicalName());
            String[] locs;
            try {
                locs = inSplit.getLocations();
                for (int x = 0; x < locs.length; x++) {
                    log.info("location " + x + " -> " + locs[x]);
                }
            } catch (InterruptedException ie) {
                log.error("Exception thrown", ie);
                return;
            }
            if (inSplit instanceof TiledInputSplit) {

                // deal with this
                org.apache.accumulo.core.client.mapreduce.RangeInputSplit ris = new org.apache.accumulo.core.client.mapreduce.RangeInputSplit();
                InputSplit inS = ((TiledInputSplit) inSplit).getWrappedSplit();
                log.info("input split class: " + inS.getClass().getCanonicalName());
                long startId = ((TiledInputSplit) inSplit).getStartTileId();
                long endId = ((TiledInputSplit) inSplit).getEndTileId();
                Key startKey = AccumuloUtils.toKey(startId);
                Key endKey = AccumuloUtils.toKey(endId);
                int zoomL = ((TiledInputSplit) inSplit).getZoomLevel();
                Range r = new Range(startKey, endKey);

                log.info("Zoom Level = " + zoomL);
                log.info("Range " + startId + " to " + endId);

                try {
                    locs = inS.getLocations();
                    for (int x = 0; x < locs.length; x++) {
                        log.info("split " + x + " -> " + locs[x]);
                    }
                    ris.setRange(r);
                    ris.setLocations(locs);
                    ris.setTableName(
                            ((org.apache.accumulo.core.client.mapreduce.RangeInputSplit) inS).getTableName());
                    ris.setTableId(
                            ((org.apache.accumulo.core.client.mapreduce.RangeInputSplit) inS).getTableId());

                    // there can be more added here

                } catch (InterruptedException ie) {
                    throw new RuntimeErrorException(new Error(ie.getMessage()));
                }
                log.info("table " + ris.getTableName() + " is offline: " + ris.isOffline());
                super.initialize(ris, attempt);

                //super.initialize(((TiledInputSplit) inSplit).getWrappedSplit(), attempt);

            } else {
                super.initialize(inSplit, attempt);
            }

        } // end initialize

        @Override
        public void close() {
            log.info("Record Reader closing!");
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (scannerIterator.hasNext()) {
                ++numKeysRead;
                Entry<Key, Value> entry = scannerIterator.next();
                // transform key and value
                long id = AccumuloUtils.toLong(entry.getKey().getRow());
                currentKey = entry.getKey();
                //currentValue = entry.getValue();

                log.info("Processing " + id + " -> " + entry.getValue().getSize());

                currentK = new TileIdWritable(id);
                currentV = new RasterWritable(entry.getValue().get());

                //log.info("current key = " + id);
                //          if (log.isTraceEnabled())
                //            log.trace("Processing key/value pair: " + DefaultFormatter.formatEntry(entry, true));
                return true;
            }
            return false;
        }
    }; //end RecordReaderBase
}

From source file: org.mrgeo.data.accumulo.image.AccumuloMrsPyramidInputFormat.java

License: Apache License
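
This reader is nearly identical to the previous one; the main difference is in nextKeyValue(), which deserializes the RasterWritable value through a DataInputBuffer instead of constructing it directly from the raw bytes.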

public static RecordReader<TileIdWritable, RasterWritable> makeRecordReader() {
    return new RecordReaderBase<TileIdWritable, RasterWritable>() {

        @Override
        public void initialize(InputSplit inSplit, TaskAttemptContext attempt) throws IOException {

            //        RangeInputSplit ris = (RangeInputSplit) ((TiledInputSplit)inSplit).getWrappedSplit();
            //
            //        log.info("initializing with instance of " + ris.getInstanceName());
            //        log.info("initializing with auths of " + ris.getAuths().toString());
            //
            //        super.initialize(((TiledInputSplit)inSplit).getWrappedSplit(), attempt);

            log.info("initializing input splits of type " + inSplit.getClass().getCanonicalName());
            String[] locs;
            try {
                locs = inSplit.getLocations();
                for (int x = 0; x < locs.length; x++) {
                    log.info("location " + x + " -> " + locs[x]);
                }
            } catch (InterruptedException ie) {
                log.error("Exception thrown", ie);
                return;
            }
            if (inSplit instanceof TiledInputSplit) {

                // deal with this
                org.apache.accumulo.core.client.mapreduce.RangeInputSplit ris = new org.apache.accumulo.core.client.mapreduce.RangeInputSplit();
                InputSplit inS = ((TiledInputSplit) inSplit).getWrappedSplit();
                log.info("input split class: " + inS.getClass().getCanonicalName());
                long startId = ((TiledInputSplit) inSplit).getStartTileId();
                long endId = ((TiledInputSplit) inSplit).getEndTileId();
                Key startKey = AccumuloUtils.toKey(startId);
                Key endKey = AccumuloUtils.toKey(endId);
                int zoomL = ((TiledInputSplit) inSplit).getZoomLevel();
                Range r = new Range(startKey, endKey);

                log.info("Zoom Level = " + zoomL);
                log.info("Range " + startId + " to " + endId);

                try {
                    locs = inS.getLocations();
                    for (int x = 0; x < locs.length; x++) {
                        log.info("split " + x + " -> " + locs[x]);
                    }
                    ris.setRange(r);
                    ris.setLocations(locs);
                    ris.setTableName(
                            ((org.apache.accumulo.core.client.mapreduce.RangeInputSplit) inS).getTableName());
                    ris.setTableId(
                            ((org.apache.accumulo.core.client.mapreduce.RangeInputSplit) inS).getTableId());

                    // there can be more added here

                } catch (InterruptedException ie) {
                    throw new RuntimeErrorException(new Error(ie.getMessage()));
                }
                log.info("table " + ris.getTableName() + " is offline: " + ris.isOffline());
                super.initialize(ris, attempt);

                //super.initialize(((TiledInputSplit) inSplit).getWrappedSplit(), attempt);

            } else {
                super.initialize(inSplit, attempt);
            }

        } // end initialize

        @Override
        public void close() {
            log.info("Record Reader closing!");
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (scannerIterator.hasNext()) {
                ++numKeysRead;
                Entry<Key, Value> entry = scannerIterator.next();
                // transform key and value
                long id = AccumuloUtils.toLong(entry.getKey().getRow());
                currentKey = entry.getKey();
                //currentValue = entry.getValue();

                log.info("Processing " + id + " -> " + entry.getValue().getSize());

                currentK = new TileIdWritable(id);
                DataInputBuffer dib = new DataInputBuffer();
                byte[] data = entry.getValue().get();
                dib.reset(data, data.length);

                currentV = new RasterWritable();
                currentV.readFields(dib);

                //log.info("current key = " + id);
                //          if (log.isTraceEnabled())
                //            log.trace("Processing key/value pair: " + DefaultFormatter.formatEntry(entry, true));
                return true;
            }
            return false;
        }
    }; //end RecordReaderBase
}