List of usage examples for org.apache.hadoop.mapreduce.InputSplit.getLocations()
public abstract String[] getLocations() throws IOException, InterruptedException;
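Before the per-project examples, here is a minimal, self-contained sketch of the calling pattern they all share: obtain a list of splits, then ask each split for its preferred hosts. getLocations() returns hostname hints used for data-local task scheduling; the array may be empty, and the call may block or fail, hence the checked IOException and InterruptedException. The class and method names below (SplitLocationsDump, dump) are illustrative only and not part of any project listed here.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.mapreduce.InputSplit;

public class SplitLocationsDump {
    // Logs each split's length and the hosts that hold its data.
    // Propagates the checked exceptions; several of the examples below
    // instead catch them and wrap them in a RuntimeException.
    public static void dump(List<InputSplit> splits) throws IOException, InterruptedException {
        for (InputSplit split : splits) {
            String[] locations = split.getLocations(); // hostname hints, possibly empty
            System.out.println("split length=" + split.getLength()
                    + " locations=" + String.join(",", locations));
        }
    }
}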
From source file:org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil.java
License:Apache License
public static List<List<InputSplit>> getCombinePigSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize, Configuration conf) throws IOException, InterruptedException {
    ArrayList<Node> nodes = new ArrayList<Node>();
    HashMap<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    List<Long> resultLengths = new ArrayList<Long>();
    long comparableSplitId = 0;

    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;

    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            comparableSplitId++;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }

    /* verification code: debug purpose
    {
        ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        HashSet<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            if (node.getLength() > 0) {
                ArrayList<ComparableSplit> splits = node.getSplits();
                for (ComparableSplit split : splits) {
                    if (!seen.contains(split.getSplit())) {
                        // remove duplicates. The set has to be on the raw input split, not the
                        // comparable input split, as the latter overrides the compareTo method,
                        // so its equality semantics are changed and not what we want here
                        seen.add(split.getSplit());
                        leftoverSplits.add(split);
                    }
                }
            }
        }
        int combinedSplitLen = 0;
        for (PigSplit split : result)
            combinedSplitLen += split.getNumPaths();
        if (combinedSplitLen + leftoverSplits.size() != nSplits - emptyCnt) {
            throw new AssertionError("number of combined splits {" + combinedSplitLen + "+"
                + leftoverSplits.size() + "-" + size
                + "} does not match the number of original splits [" + nSplits + "].");
        }
    }
    */

    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split as currently an empty directory is
        // not properly handled somewhere
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for the next split of suitable size to be combined
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            ArrayList<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        HashSet<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // remove duplicates. The set has to be on the raw input split, not the
                    // comparable input split, as the latter overrides the compareTo method,
                    // so its equality semantics are changed and not what we want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }

        /* verification code
        int combinedSplitLen = 0;
        for (PigSplit split : result)
            combinedSplitLen += split.getNumPaths();
        if (combinedSplitLen + leftoverSplits.size() != nSplits - emptyCnt)
            throw new AssertionError("number of combined splits [" + combinedSplitLen + "+"
                + leftoverSplits.size() + "] does not match the number of original splits ["
                + nSplits + "].");
        */

        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small, so try to see if it can be squeezed
                    // into any existing split
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // last piece cannot be squeezed in; create a new combined split for it
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }

    /* verification code
    int combinedSplitLen = 0;
    for (PigSplit split : result)
        combinedSplitLen += split.getNumPaths();
    if (combinedSplitLen != nSplits - emptyCnt)
        throw new AssertionError("number of combined splits [" + combinedSplitLen
            + "] does not match the number of original splits [" + nSplits + "].");
    long totalLen = 0;
    for (PigSplit split : result)
        totalLen += split.getLength();
    long origTotalLen = 0;
    for (InputSplit split : oneInputSplits)
        origTotalLen += split.getLength();
    if (totalLen != origTotalLen)
        throw new AssertionError("The total length [" + totalLen + "] does not match the original ["
            + origTotalLen + "]");
    */

    log.info("Total input paths (combined) to process : " + result.size());
    return result;
}
From source file:org.deeplearning4j.iterativereduce.impl.reader.CanovaInputSplit.java
License:Apache License
public CanovaInputSplit(InputSplit split) {
    this.split = split;
    try {
        String[] locations = split.getLocations();
        uris = new URI[locations.length];
        for (int i = 0; i < locations.length; i++) {
            uris[i] = URI.create(locations[i]);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:org.gridgain.grid.kernal.processors.hadoop.v2.GridHadoopV2Splitter.java
License:Open Source License
/**
 * @param ctx Job context.
 * @return Collection of mapped splits.
 * @throws GridException If mapping failed.
 */
public static Collection<GridHadoopInputSplit> splitJob(JobContext ctx) throws GridException {
    try {
        InputFormat<?, ?> format = ReflectionUtils.newInstance(ctx.getInputFormatClass(),
                ctx.getConfiguration());

        assert format != null;

        List<InputSplit> splits = format.getSplits(ctx);

        Collection<GridHadoopInputSplit> res = new ArrayList<>(splits.size());

        int id = 0;

        for (InputSplit nativeSplit : splits) {
            if (nativeSplit instanceof FileSplit) {
                FileSplit s = (FileSplit) nativeSplit;

                res.add(new GridHadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(),
                        s.getLength()));
            } else
                res.add(GridHadoopUtils.wrapSplit(id, nativeSplit, nativeSplit.getLocations()));

            id++;
        }

        return res;
    } catch (IOException | ClassNotFoundException e) {
        throw new GridException(e);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();

        throw new GridInterruptedException(e);
    }
}
From source file:org.janusgraph.hadoop.formats.cassandra.CqlBridgeRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    this.split = (ColumnFamilySplit) split;
    Configuration conf = HadoopCompat.getConfiguration(context);
    totalRowCount = (this.split.getLength() < Long.MAX_VALUE) ? (int) this.split.getLength()
            : ConfigHelper.getInputSplitSize(conf);
    cfName = ConfigHelper.getInputColumnFamily(conf);
    keyspace = ConfigHelper.getInputKeyspace(conf);
    partitioner = ConfigHelper.getInputPartitioner(conf);
    inputColumns = CqlConfigHelper.getInputcolumns(conf);
    userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);

    try {
        if (cluster != null) {
            return;
        }
        // create a Cluster instance
        String[] locations = split.getLocations();
        // cluster = CqlConfigHelper.getInputCluster(locations, conf);
        // disregard the conf as it brings some unforeseen issues.
        cluster = Cluster.builder().addContactPoints(locations).build();
    } catch (Exception e) {
        throw new RuntimeException(
                "Unable to create cluster for table: " + cfName + ", in keyspace: " + keyspace, e);
    }

    // the cluster should now refer to a valid cluster
    session = cluster.connect(quote(keyspace));
    Preconditions.checkNotNull(session, "Can't create connection session");

    // get the negotiated serialization protocol
    nativeProtocolVersion = cluster.getConfiguration().getProtocolOptions().getProtocolVersion().toInt();

    // If the user provides a CQL query then we will use it without validation;
    // otherwise we will fall back to building a query using:
    //   inputColumns
    //   whereClauses
    cqlQuery = CqlConfigHelper.getInputCql(conf);

    // validate that the user hasn't tried to give us a custom query along with input columns
    // and where clauses
    if (StringUtils.isNotEmpty(cqlQuery)
            && (StringUtils.isNotEmpty(inputColumns) || StringUtils.isNotEmpty(userDefinedWhereClauses))) {
        throw new AssertionError("Cannot define a custom query with input columns and / or where clauses");
    }

    if (StringUtils.isEmpty(cqlQuery)) {
        cqlQuery = buildQuery();
    }
    log.trace("cqlQuery {}", cqlQuery);
    distinctKeyIterator = new DistinctKeyIterator();
    log.trace("created {}", distinctKeyIterator);
}
From source file:org.mrgeo.data.accumulo.image.AccumuloMrsImagePyramidInputFormat.java
License:Apache License
public static RecordReader<TileIdWritable, RasterWritable> makeRecordReader() {
    return new RecordReaderBase<TileIdWritable, RasterWritable>() {
        @Override
        public void initialize(InputSplit inSplit, TaskAttemptContext attempt) throws IOException {
            // RangeInputSplit ris = (RangeInputSplit) ((TiledInputSplit) inSplit).getWrappedSplit();
            //
            // log.info("initializing with instance of " + ris.getInstanceName());
            // log.info("initializing with auths of " + ris.getAuths().toString());
            //
            // super.initialize(((TiledInputSplit) inSplit).getWrappedSplit(), attempt);

            log.info("initializing input splits of type " + inSplit.getClass().getCanonicalName());

            String[] locs;
            try {
                locs = inSplit.getLocations();
                for (int x = 0; x < locs.length; x++) {
                    log.info("location " + x + " -> " + locs[x]);
                }
            } catch (InterruptedException ie) {
                ie.printStackTrace();
                return;
            }

            if (inSplit instanceof TiledInputSplit) {
                // deal with this
                org.apache.accumulo.core.client.mapreduce.RangeInputSplit ris =
                        new org.apache.accumulo.core.client.mapreduce.RangeInputSplit();
                InputSplit inS = ((TiledInputSplit) inSplit).getWrappedSplit();
                log.info("input split class: " + inS.getClass().getCanonicalName());

                long startId = ((TiledInputSplit) inSplit).getStartTileId();
                long endId = ((TiledInputSplit) inSplit).getEndTileId();
                Key startKey = AccumuloUtils.toKey(startId);
                Key endKey = AccumuloUtils.toKey(endId);
                int zoomL = ((TiledInputSplit) inSplit).getZoomLevel();
                Range r = new Range(startKey, endKey);
                log.info("Zoom Level = " + zoomL);
                log.info("Range " + startId + " to " + endId);

                try {
                    locs = inS.getLocations();
                    for (int x = 0; x < locs.length; x++) {
                        log.info("split " + x + " -> " + locs[x]);
                    }

                    ris.setRange(r);
                    ris.setLocations(locs);
                    ris.setTableName(
                            ((org.apache.accumulo.core.client.mapreduce.RangeInputSplit) inS).getTableName());
                    ris.setTableId(
                            ((org.apache.accumulo.core.client.mapreduce.RangeInputSplit) inS).getTableId());
                    // there can be more added here
                } catch (InterruptedException ie) {
                    throw new RuntimeErrorException(new Error(ie.getMessage()));
                }

                if (ris == null) {
                    log.info("range input split is null");
                } else {
                    log.info("table " + ris.getTableName() + " is offline: " + ris.isOffline());
                }

                super.initialize(ris, attempt);
                // super.initialize(((TiledInputSplit) inSplit).getWrappedSplit(), attempt);
            } else {
                super.initialize(inSplit, attempt);
            }
        } // end initialize

        @Override
        public void close() {
            log.info("Record Reader closing!");
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (scannerIterator.hasNext()) {
                ++numKeysRead;
                Entry<Key, Value> entry = scannerIterator.next();

                // transform key and value
                long id = AccumuloUtils.toLong(entry.getKey().getRow());
                currentKey = entry.getKey();
                // currentValue = entry.getValue();
                log.info("Processing " + id + " -> " + entry.getValue().getSize());
                currentK = new TileIdWritable(id);
                currentV = new RasterWritable(entry.getValue().get());
                // log.info("current key = " + id);

                // if (log.isTraceEnabled())
                //     log.trace("Processing key/value pair: " + DefaultFormatter.formatEntry(entry, true));

                return true;
            }
            return false;
        }
    }; // end RecordReaderBase
}
From source file:org.mrgeo.data.accumulo.image.AccumuloMrsPyramidInputFormat.java
License:Apache License
public static RecordReader<TileIdWritable, RasterWritable> makeRecordReader() {
    return new RecordReaderBase<TileIdWritable, RasterWritable>() {
        @Override
        public void initialize(InputSplit inSplit, TaskAttemptContext attempt) throws IOException {
            // RangeInputSplit ris = (RangeInputSplit) ((TiledInputSplit) inSplit).getWrappedSplit();
            //
            // log.info("initializing with instance of " + ris.getInstanceName());
            // log.info("initializing with auths of " + ris.getAuths().toString());
            //
            // super.initialize(((TiledInputSplit) inSplit).getWrappedSplit(), attempt);

            log.info("initializing input splits of type " + inSplit.getClass().getCanonicalName());

            String[] locs;
            try {
                locs = inSplit.getLocations();
                for (int x = 0; x < locs.length; x++) {
                    log.info("location " + x + " -> " + locs[x]);
                }
            } catch (InterruptedException ie) {
                log.error("Exception thrown", ie);
                return;
            }

            if (inSplit instanceof TiledInputSplit) {
                // deal with this
                org.apache.accumulo.core.client.mapreduce.RangeInputSplit ris =
                        new org.apache.accumulo.core.client.mapreduce.RangeInputSplit();
                InputSplit inS = ((TiledInputSplit) inSplit).getWrappedSplit();
                log.info("input split class: " + inS.getClass().getCanonicalName());

                long startId = ((TiledInputSplit) inSplit).getStartTileId();
                long endId = ((TiledInputSplit) inSplit).getEndTileId();
                Key startKey = AccumuloUtils.toKey(startId);
                Key endKey = AccumuloUtils.toKey(endId);
                int zoomL = ((TiledInputSplit) inSplit).getZoomLevel();
                Range r = new Range(startKey, endKey);
                log.info("Zoom Level = " + zoomL);
                log.info("Range " + startId + " to " + endId);

                try {
                    locs = inS.getLocations();
                    for (int x = 0; x < locs.length; x++) {
                        log.info("split " + x + " -> " + locs[x]);
                    }

                    ris.setRange(r);
                    ris.setLocations(locs);
                    ris.setTableName(
                            ((org.apache.accumulo.core.client.mapreduce.RangeInputSplit) inS).getTableName());
                    ris.setTableId(
                            ((org.apache.accumulo.core.client.mapreduce.RangeInputSplit) inS).getTableId());
                    // there can be more added here
                } catch (InterruptedException ie) {
                    throw new RuntimeErrorException(new Error(ie.getMessage()));
                }

                log.info("table " + ris.getTableName() + " is offline: " + ris.isOffline());

                super.initialize(ris, attempt);
                // super.initialize(((TiledInputSplit) inSplit).getWrappedSplit(), attempt);
            } else {
                super.initialize(inSplit, attempt);
            }
        } // end initialize

        @Override
        public void close() {
            log.info("Record Reader closing!");
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (scannerIterator.hasNext()) {
                ++numKeysRead;
                Entry<Key, Value> entry = scannerIterator.next();

                // transform key and value
                long id = AccumuloUtils.toLong(entry.getKey().getRow());
                currentKey = entry.getKey();
                // currentValue = entry.getValue();
                log.info("Processing " + id + " -> " + entry.getValue().getSize());
                currentK = new TileIdWritable(id);
                DataInputBuffer dib = new DataInputBuffer();
                byte[] data = entry.getValue().get();
                dib.reset(data, data.length);
                currentV = new RasterWritable();
                currentV.readFields(dib);
                // log.info("current key = " + id);

                // if (log.isTraceEnabled())
                //     log.trace("Processing key/value pair: " + DefaultFormatter.formatEntry(entry, true));

                return true;
            }
            return false;
        }
    }; // end RecordReaderBase
}