Example usage for org.apache.hadoop.mapreduce InputSplit getLocations

List of usage examples for org.apache.hadoop.mapreduce InputSplit getLocations

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce InputSplit getLocations.

Prototype

public abstract String[] getLocations() throws IOException, InterruptedException;

Document

Get the list of nodes by name where the data for the split would be local.
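
Before the project examples below, here is a minimal, self-contained sketch of the typical consumer-side pattern: gathering the distinct host names reported by getLocations() across a job's splits so that scheduling can prefer data-local nodes. The class and method names are illustrative assumptions, not taken from any of the projects listed.

import java.io.IOException;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.mapreduce.InputSplit;

public class SplitLocalityUtil {

    // Collects the distinct host names reported by each split's getLocations().
    // The hints are best-effort: Hadoop does not guarantee that a task will
    // actually run on one of the returned nodes.
    public static Set<String> preferredHosts(List<InputSplit> splits)
            throws IOException, InterruptedException {
        Set<String> hosts = new LinkedHashSet<String>();
        for (InputSplit split : splits) {
            for (String host : split.getLocations()) {
                hosts.add(host);
            }
        }
        return hosts;
    }
}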

Usage

From source file:com.couchbase.sqoop.mapreduce.db.CouchbaseInputFormat.java

License:Apache License

@Override
/** {@inheritDoc} */
public RecordReader<Text, T> createRecordReader(final InputSplit split, final TaskAttemptContext context)
        throws IOException, InterruptedException {
    // Arrays.toString is needed here; calling toString() on a String[] would only print an array reference.
    System.out.printf("Creating Record Reader %s%n", Arrays.toString(split.getLocations()));
    return createRecordReader(split, context.getConfiguration());
}

From source file:com.google.cloud.hadoop.util.HadoopToStringUtil.java

License:Open Source License

public static String toString(InputSplit input) throws IOException, InterruptedException {
    if (input == null) {
        return "null";
    }

    String result = "InputSplit::";
    result += " length:" + input.getLength();
    result += " locations: " + Arrays.toString(input.getLocations());
    result += " toString(): " + input.toString();
    return result;
}

From source file:com.marklogic.contentpump.CombineDocumentSplit.java

License:Apache License

public CombineDocumentSplit(List<FileSplit> splits) throws IOException, InterruptedException {
    this.splits = splits;
    locations = new HashSet<String>();
    for (InputSplit split : splits) {
        length += split.getLength();
        for (String loc : split.getLocations()) {
            if (!locations.contains(loc)) {
                locations.add(loc);
            }
        }
    }
}

From source file:com.tuplejump.calliope.hadoop.cql3.CqlPagingRecordReader.java

License:Apache License

public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    this.split = (ColumnFamilySplit) split;
    Configuration conf = HadoopCompat.getConfiguration(context);
    totalRowCount = (this.split.getLength() < Long.MAX_VALUE) ? (int) this.split.getLength()
            : ConfigHelper.getInputSplitSize(conf);
    cfName = ConfigHelper.getInputColumnFamily(conf);
    consistencyLevel = ConsistencyLevel.valueOf(ConfigHelper.getReadConsistencyLevel(conf));
    keyspace = ConfigHelper.getInputKeyspace(conf);
    columns = CqlConfigHelper.getInputcolumns(conf);
    userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);

    Optional<Integer> pageRowSizeOptional = CqlConfigHelper.getInputPageRowSize(conf);
    try {
        pageRowSize = pageRowSizeOptional.isPresent() ? pageRowSizeOptional.get() : DEFAULT_CQL_PAGE_LIMIT;
    } catch (NumberFormatException e) {
        pageRowSize = DEFAULT_CQL_PAGE_LIMIT;
    }

    partitioner = ConfigHelper.getInputPartitioner(HadoopCompat.getConfiguration(context));

    try {
        if (client != null)
            return;

        // create connection using thrift
        String[] locations = split.getLocations();
        Exception lastException = null;
        for (String location : locations) {
            int port = ConfigHelper.getInputRpcPort(conf);
            try {
                client = CqlPagingInputFormat.createAuthenticatedClient(location, port, conf);
                break;
            } catch (Exception e) {
                lastException = e;
                logger.warn("Failed to create authenticated client to {}:{}", location, port);
            }
        }
        if (client == null && lastException != null)
            throw lastException;

        // retrieve partition keys and cluster keys from system.schema_columnfamilies table
        retrieveKeys();

        client.set_keyspace(keyspace);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    rowIterator = new RowIterator();

    logger.debug("created {}", rowIterator);
}

From source file:com.tuplejump.calliope.hadoop.cql3.CqlRecordReader.java

License:Apache License

private void initializeWithColumnFamilySplit(InputSplit split, TaskAttemptContext context) throws IOException {
    this.split = split;
    ColumnFamilySplit cfSplit = (ColumnFamilySplit) split;
    Configuration conf = context.getConfiguration();
    totalRowCount = (cfSplit.getLength() < Long.MAX_VALUE) ? (int) cfSplit.getLength()
            : ConfigHelper.getInputSplitSize(conf);
    cfName = quote(ConfigHelper.getInputColumnFamily(conf));
    keyspace = quote(ConfigHelper.getInputKeyspace(conf));
    cqlQuery = CqlConfigHelper.getInputCql(conf);
    partitioner = ConfigHelper.getInputPartitioner(context.getConfiguration());

    try {
        if (cluster != null)
            return;
        // create connection using thrift
        String[] locations = split.getLocations();

        Exception lastException = null;
        for (String location : locations) {
            try {
                cluster = CqlConfigHelper.getInputCluster(location, conf);
                break;
            } catch (Exception e) {
                lastException = e;
                logger.warn("Failed to create authenticated client to {}", location);
            }
        }
        if (cluster == null && lastException != null)
            throw lastException;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    if (cluster != null) {
        try {
            session = cluster.connect(keyspace);
        } catch (NoHostAvailableException nha) {
            Map<InetSocketAddress, Throwable> errors = nha.getErrors();
            logger.error(errors.toString());
            for (InetSocketAddress isa : errors.keySet()) {
                logger.error("ERROR ON HOST [" + isa.getAddress() + "/" + isa.getPort() + "] ");
                logger.error(errors.get(isa).getMessage());
                logger.error("Connection Timeout:  "
                        + cluster.getConfiguration().getSocketOptions().getConnectTimeoutMillis());
                logger.error("Local connection limit:  " + cluster.getConfiguration().getPoolingOptions()
                        .getCoreConnectionsPerHost(HostDistance.LOCAL));
                logger.error("Remote connection limit:  " + cluster.getConfiguration().getPoolingOptions()
                        .getCoreConnectionsPerHost(HostDistance.REMOTE));
                //logger.error("Connection Timeout:  " + cluster.getConfiguration().getSocketOptions().);
            }
            throw nha;
        }
    }
    rowIterator = new SingleRangeRowIterator();
    logger.debug("created {}", rowIterator);
}

From source file:com.tuplejump.calliope.hadoop.cql3.CqlRecordReader.java

License:Apache License

private void initializeWithMultiRangeSplit(InputSplit split, TaskAttemptContext context) throws IOException {
    this.split = split;
    MultiRangeSplit cfSplit = (MultiRangeSplit) split;
    Configuration conf = context.getConfiguration();
    totalRowCount = (cfSplit.getLength() < Long.MAX_VALUE) ? (int) cfSplit.getLength()
            : ConfigHelper.getInputSplitSize(conf);
    cfName = quote(ConfigHelper.getInputColumnFamily(conf));
    keyspace = quote(ConfigHelper.getInputKeyspace(conf));
    cqlQuery = CqlConfigHelper.getInputCql(conf);
    partitioner = ConfigHelper.getInputPartitioner(context.getConfiguration());

    try {
        if (cluster != null)
            return;
        // create connection using thrift
        String[] locations = split.getLocations();

        Exception lastException = null;
        for (String location : locations) {
            try {
                cluster = CqlConfigHelper.getInputCluster(location, conf);
                break;
            } catch (Exception e) {
                lastException = e;
                logger.warn("Failed to create authenticated client to {}", location);
            }
        }
        if (cluster == null && lastException != null)
            throw lastException;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    if (cluster != null) {
        try {
            session = cluster.connect(keyspace);
        } catch (NoHostAvailableException nha) {
            Map<InetSocketAddress, Throwable> errors = nha.getErrors();
            logger.error(errors.toString());
            for (InetSocketAddress isa : errors.keySet()) {
                logger.error("ERROR ON HOST [" + isa.getAddress() + "/" + isa.getPort() + "] ");
                logger.error(errors.get(isa).getMessage());
                logger.error("Connection Timeout:  "
                        + cluster.getConfiguration().getSocketOptions().getConnectTimeoutMillis());
                logger.error("Local connection limit:  " + cluster.getConfiguration().getPoolingOptions()
                        .getCoreConnectionsPerHost(HostDistance.LOCAL));
                logger.error("Remote connection limit:  " + cluster.getConfiguration().getPoolingOptions()
                        .getCoreConnectionsPerHost(HostDistance.REMOTE));
                //logger.error("Connection Timeout:  " + cluster.getConfiguration().getSocketOptions().);
            }
            throw nha;
        }
    }
    rowIterator = new MultiRangeRowIterator();
    logger.debug("created {}", rowIterator);
}

From source file:edu.american.student.redis.hadoop.RedisBigTableRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    String[] locations = split.getLocations();
    RedisBigTableKey key = RedisBigTableKey.inflate(locations[0].getBytes());
    byte[] row = key.getRow();
    byte[] cf = key.getColumnFamily();
    byte[] cq = key.getColumnQualifier();
    RedisForeman foreman = new RedisForeman();
    foreman.connect();
    Map<RedisBigTableKey, byte[]> keyValues;
    try {
        keyValues = foreman.getByQualifier(locations[1].getBytes(), row, cf, cq);
        totalKVs = keyValues.size();
        keyValueMapIter = keyValues.entrySet().iterator();
    } catch (RedisForemanException e) {
        throw new IOException(MessageFactory.objective("Read Split").toString(), e);
    }
}

From source file:grakn.core.server.session.reader.GraknCqlBridgeRecordReader.java

License:Open Source License

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    this.split = (ColumnFamilySplit) split;
    Configuration conf = HadoopCompat.getConfiguration(context);
    totalRowCount = (this.split.getLength() < Long.MAX_VALUE) ? (int) this.split.getLength()
            : ConfigHelper.getInputSplitSize(conf);
    cfName = ConfigHelper.getInputColumnFamily(conf);
    keyspace = ConfigHelper.getInputKeyspace(conf);
    partitioner = ConfigHelper.getInputPartitioner(conf);
    inputColumns = CqlConfigHelper.getInputcolumns(conf);
    userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);

    try {
        if (cluster != null) {
            return;
        }
        // create a Cluster instance
        String[] locations = split.getLocations();
        // Previous implementation of this class was instantiating a new Cluster with the following comment:
        // "disregard the conf as it brings some unforeseen issues."
        // Cluster.builder().addContactPoints(locations).build();

        // The above ignores the config so it's not possible to use it when we need to change default ports
        // as they won't be correctly propagated. So now we create Cluster using conf.
        // If this keeps breaking we might need to investigate further.
        cluster = CqlConfigHelper.getInputCluster(ConfigHelper.getInputInitialAddress(conf).split(","), conf);
    } catch (Exception e) {
        throw new RuntimeException(
                "Unable to create cluster for table: " + cfName + ", in keyspace: " + keyspace, e);
    }
    // cluster should now refer to a valid Cluster instance
    session = cluster.connect(quote(keyspace));
    Preconditions.checkState(session != null, "Can't create connection session");
    //get negotiated serialization protocol
    nativeProtocolVersion = cluster.getConfiguration().getProtocolOptions().getProtocolVersion().toInt();

    // If the user provides a CQL query then we will use it without validation
    // otherwise we will fall back to building a query using the:
    //   inputColumns
    //   whereClauses
    cqlQuery = CqlConfigHelper.getInputCql(conf);
    // validate that the user hasn't tried to give us a custom query along with input columns
    // and where clauses
    if (StringUtils.isNotEmpty(cqlQuery)
            && (StringUtils.isNotEmpty(inputColumns) || StringUtils.isNotEmpty(userDefinedWhereClauses))) {
        throw new AssertionError("Cannot define a custom query with input columns and / or where clauses");
    }

    if (StringUtils.isEmpty(cqlQuery)) {
        cqlQuery = buildQuery();
    }
    log.trace("cqlQuery {}", cqlQuery);
    distinctKeyIterator = new DistinctKeyIterator();
    log.trace("created {}", distinctKeyIterator);
}

From source file:ml.shifu.guagua.mapreduce.GuaguaInputFormat.java

License:Apache License

public static List<List<InputSplit>> getCombineGuaguaSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize) throws IOException, InterruptedException {
    List<Node> nodes = new ArrayList<Node>();
    Map<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    List<Long> resultLengths = new ArrayList<Long>();
    long comparableSplitId = 0;

    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;
    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            comparableSplitId++;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }

    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split as currently an empty directory is
        // not properly handled somewhere
        List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for next split of suitable size to be combined
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            List<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            List<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        List<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        Set<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // remove duplicates. The set has to be on the raw input split not the
                    // comparable input split as the latter overrides the compareTo method
                // so its equality semantics are changed, which is not what we want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }

        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            List<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();

            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small, try to see if it can be squeezed into any existing splits
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // last piece cannot be squeezed in; create a new combined split for them.
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }
    LOG.info("Total input paths (combined) to process : {}", result.size());
    return result;
}

From source file:ml.shifu.guagua.yarn.GuaguaSplitWriter.java

License:Apache License

private static SplitMetaInfo[] writeOldSplits(org.apache.hadoop.mapred.InputSplit[] splits,
        FSDataOutputStream out, Configuration conf) throws IOException {
    SplitMetaInfo[] info = new SplitMetaInfo[splits.length];
    if (splits.length != 0) {
        int i = 0;
        long offset = out.getPos();
        for (org.apache.hadoop.mapred.InputSplit split : splits) {
            long prevLen = out.getPos();
            Text.writeString(out, split.getClass().getName());
            split.write(out);
            long currLen = out.getPos();
            String[] locations = split.getLocations();
            final int max_loc = conf.getInt(MAX_SPLIT_LOCATIONS, 10);
            if (locations.length > max_loc) {
                LOG.warn("Max block location exceeded for split: " + split + " splitsize: " + locations.length
                        + " maxsize: " + max_loc);
                locations = Arrays.copyOf(locations, max_loc);
            }
            info[i++] = new JobSplit.SplitMetaInfo(locations, offset, split.getLength());
            offset += currLen - prevLen;
        }
    }
    return info;
}