Usage examples for org.apache.hadoop.mapreduce.InputSplit.getLocations()
public abstract String[] getLocations() throws IOException, InterruptedException;
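getLocations() returns the names of the nodes where the split's data would be local. The framework uses these names purely as placement hints when scheduling map tasks; the array may be empty, and the locations are not serialized as part of the split. A minimal sketch of a typical caller is shown below; the choice of TextInputFormat and an input path taken from args[0] are illustrative assumptions, not details drawn from the examples that follow.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitLocationsDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "split-locations-demo");
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // Compute the splits the framework would hand to map tasks.
        TextInputFormat format = new TextInputFormat();
        List<InputSplit> splits = format.getSplits(job);

        for (InputSplit split : splits) {
            // Locality hints: hosts holding a replica of this split's data.
            String[] locations = split.getLocations();
            System.out.printf("split of length %d on [%s]%n",
                    split.getLength(), String.join(", ", locations));
        }
    }
}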
From source file: ml.shifu.guagua.yarn.util.InputSplitUtils.java
License: Apache License
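Combines small input splits into locality-aware groups: getLocations() buckets each split by host before splits are greedily packed up to maxCombinedSplitSize (logic adapted from Pig's split combination, per the PIG-1757 reference in the code).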
public static List<List<InputSplit>> getCombineGuaguaSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize) throws IOException, InterruptedException {
    ArrayList<Node> nodes = new ArrayList<Node>();
    HashMap<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    List<Long> resultLengths = new ArrayList<Long>();
    long comparableSplitId = 0;
    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;
    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            comparableSplitId++;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }
    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split as currently an empty directory is
        // not properly handled somewhere
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for the next split of suitable size to combine
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            ArrayList<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        HashSet<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // remove duplicates. The set has to be keyed on the raw input split, not the
                    // comparable input split, because the latter overrides compareTo, changing its
                    // equality semantics to something we do not want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }
        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small; see if it can be squeezed into any existing split
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // the last piece cannot be squeezed in; create a new combined split for it
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }
    LOG.info("Total input paths (combined) to process : {}", result.size());
    return result;
}
From source file: ml.shifu.shifu.core.mr.input.CombineInputFormat.java
License: Apache License
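A near-identical split-combination routine in Shifu; it differs from the Guagua version above mainly in declaring its collections against the List/Map/Set interfaces.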
public static List<List<InputSplit>> getCombineVarSelectSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize) throws IOException, InterruptedException {
    List<Node> nodes = new ArrayList<Node>();
    Map<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    List<Long> resultLengths = new ArrayList<Long>();
    long comparableSplitId = 0;
    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;
    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            comparableSplitId++;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }
    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split as currently an empty directory is
        // not properly handled somewhere
        List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for the next split of suitable size to combine
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            List<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            List<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        List<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        Set<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // remove duplicates. The set has to be keyed on the raw input split, not the
                    // comparable input split, because the latter overrides compareTo, changing its
                    // equality semantics to something we do not want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }
        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            List<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small; see if it can be squeezed into any existing split
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // the last piece cannot be squeezed in; create a new combined split for it
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }
    LOG.info("Total input paths (combined) to process : {}", result.size());
    return result;
}
From source file: org.apache.cassandra.hadoop.cql3.CqlPagingRecordReader.java
License: Apache License
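Uses the split's locations as candidate Cassandra hosts, trying each in turn until an authenticated Thrift client can be created.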
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    this.split = (ColumnFamilySplit) split;
    Configuration conf = context.getConfiguration();
    totalRowCount = (this.split.getLength() < Long.MAX_VALUE) ? (int) this.split.getLength()
            : ConfigHelper.getInputSplitSize(conf);
    cfName = ConfigHelper.getInputColumnFamily(conf);
    consistencyLevel = ConsistencyLevel.valueOf(ConfigHelper.getReadConsistencyLevel(conf));
    keyspace = ConfigHelper.getInputKeyspace(conf);
    columns = CqlConfigHelper.getInputcolumns(conf);
    userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);
    try {
        pageRowSize = Integer.parseInt(CqlConfigHelper.getInputPageRowSize(conf));
    } catch (NumberFormatException e) {
        pageRowSize = DEFAULT_CQL_PAGE_LIMIT;
    }
    partitioner = ConfigHelper.getInputPartitioner(context.getConfiguration());
    try {
        if (client != null)
            return;
        // create connection using thrift
        String[] locations = split.getLocations();
        Exception lastException = null;
        for (String location : locations) {
            int port = ConfigHelper.getInputRpcPort(conf);
            try {
                client = CqlPagingInputFormat.createAuthenticatedClient(location, port, conf);
                break;
            } catch (Exception e) {
                lastException = e;
                logger.warn("Failed to create authenticated client to {}:{}", location, port);
            }
        }
        if (client == null && lastException != null)
            throw lastException;
        // retrieve partition keys and cluster keys from system.schema_columnfamilies table
        retrieveKeys();
        client.set_keyspace(keyspace);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    rowIterator = new RowIterator();
    logger.debug("created {}", rowIterator);
}
From source file: org.apache.cassandra.hadoop.cql3.CqlRecordReader.java
License: Apache License
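Passes the split's locations to CqlConfigHelper.getInputCluster() so the Java driver connects to a host holding the split's data.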
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    this.split = (ColumnFamilySplit) split;
    Configuration conf = HadoopCompat.getConfiguration(context);
    totalRowCount = (this.split.getLength() < Long.MAX_VALUE) ? (int) this.split.getLength()
            : ConfigHelper.getInputSplitSize(conf);
    cfName = ConfigHelper.getInputColumnFamily(conf);
    keyspace = ConfigHelper.getInputKeyspace(conf);
    partitioner = ConfigHelper.getInputPartitioner(conf);
    inputColumns = CqlConfigHelper.getInputcolumns(conf);
    userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);
    try {
        if (cluster != null)
            return;
        // create a Cluster instance
        String[] locations = split.getLocations();
        cluster = CqlConfigHelper.getInputCluster(locations, conf);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    if (cluster != null)
        session = cluster.connect(quote(keyspace));
    if (session == null)
        throw new RuntimeException("Can't create connection session");
    // get negotiated serialization protocol
    nativeProtocolVersion = cluster.getConfiguration().getProtocolOptions().getProtocolVersion().toInt();
    // If the user provides a CQL query then we will use it without validation;
    // otherwise we will fall back to building a query using the inputColumns and whereClauses
    cqlQuery = CqlConfigHelper.getInputCql(conf);
    // validate that the user hasn't tried to give us a custom query along with input columns
    // and where clauses
    if (StringUtils.isNotEmpty(cqlQuery)
            && (StringUtils.isNotEmpty(inputColumns) || StringUtils.isNotEmpty(userDefinedWhereClauses))) {
        throw new AssertionError("Cannot define a custom query with input columns and / or where clauses");
    }
    if (StringUtils.isEmpty(cqlQuery))
        cqlQuery = buildQuery();
    logger.debug("cqlQuery {}", cqlQuery);
    rowIterator = new RowIterator();
    logger.debug("created {}", rowIterator);
}
From source file: org.apache.druid.indexer.hadoop.DatasourceInputFormat.java
License: Apache License
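Flattens the locations of every mapred split of every segment into a single Stream of host names, logging and skipping any split whose getLocations() call fails.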
@VisibleForTesting
static Stream<String> getLocations(final List<WindowedDataSegment> segments,
        final org.apache.hadoop.mapred.InputFormat fio, final JobConf conf) {
    return segments.stream().sequential().flatMap((final WindowedDataSegment segment) -> {
        FileInputFormat.setInputPaths(conf, new Path(JobHelper.getURIFromSegment(segment.getSegment())));
        try {
            return Arrays.stream(fio.getSplits(conf, 1))
                    .flatMap((final org.apache.hadoop.mapred.InputSplit split) -> {
                        try {
                            return Arrays.stream(split.getLocations());
                        } catch (final Exception e) {
                            logger.error(e, "Exception getting locations");
                            return Stream.empty();
                        }
                    });
        } catch (final Exception e) {
            logger.error(e, "Exception getting splits");
            return Stream.empty();
        }
    });
}
From source file: org.apache.druid.indexer.hadoop.DatasourceInputFormatTest.java
License: Apache License
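Verifies the failure path of the method above: a split whose getLocations() throws IOException contributes no locations to the stream.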
@Test
public void testGetLocationsSplitException() throws IOException {
    final InputFormat fio = EasyMock.mock(InputFormat.class);
    final org.apache.hadoop.mapred.InputSplit split = EasyMock.mock(org.apache.hadoop.mapred.InputSplit.class);

    EasyMock.expect(fio.getSplits(config, 1)).andReturn(new org.apache.hadoop.mapred.InputSplit[] { split });
    EasyMock.expect(split.getLocations()).andThrow(new IOException("testing"));

    EasyMock.replay(fio, split);

    Assert.assertEquals(0, DatasourceInputFormat.getLocations(segments1.subList(0, 1), fio, config).count());
}
From source file: org.apache.druid.indexer.hadoop.DatasourceInputFormatTest.java
License: Apache License
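Verifies the happy path: locations from successive splits are concatenated in order, with duplicates preserved.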
@Test
public void testGetLocations() throws IOException {
    final InputFormat fio = EasyMock.mock(InputFormat.class);
    final org.apache.hadoop.mapred.InputSplit split = EasyMock.mock(org.apache.hadoop.mapred.InputSplit.class);

    EasyMock.expect(fio.getSplits(config, 1)).andReturn(new org.apache.hadoop.mapred.InputSplit[] { split });
    EasyMock.expect(split.getLocations()).andReturn(new String[] { "s1", "s2" });

    EasyMock.expect(fio.getSplits(config, 1)).andReturn(new org.apache.hadoop.mapred.InputSplit[] { split });
    EasyMock.expect(split.getLocations()).andReturn(new String[] { "s3" });

    EasyMock.expect(fio.getSplits(config, 1)).andReturn(new org.apache.hadoop.mapred.InputSplit[] { split });
    EasyMock.expect(split.getLocations()).andReturn(new String[] { "s4", "s2" });

    EasyMock.replay(fio, split);

    Assert.assertArrayEquals(new String[] { "s1", "s2", "s3", "s4", "s2" },
            DatasourceInputFormat.getLocations(segments1, fio, config).toArray(String[]::new));
}
From source file: org.apache.ignite.internal.processors.hadoop.impl.v2.HadoopV2Splitter.java
License: Apache License
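Maps native Hadoop splits to Ignite's split representation; for a FileSplit the locations feed a HadoopFileBlock, otherwise getLocations() supplies the hosts for a generic wrapper.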
/**
 * @param ctx Job context.
 * @return Collection of mapped splits.
 * @throws IgniteCheckedException If mapping failed.
 */
public static Collection<HadoopInputSplit> splitJob(JobContext ctx) throws IgniteCheckedException {
    try {
        InputFormat<?, ?> format = ReflectionUtils.newInstance(ctx.getInputFormatClass(), ctx.getConfiguration());

        assert format != null;

        List<InputSplit> splits = format.getSplits(ctx);

        Collection<HadoopInputSplit> res = new ArrayList<>(splits.size());

        int id = 0;

        for (InputSplit nativeSplit : splits) {
            if (nativeSplit instanceof FileSplit) {
                FileSplit s = (FileSplit) nativeSplit;

                res.add(new HadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(), s.getLength()));
            } else
                res.add(HadoopUtils.wrapSplit(id, nativeSplit, nativeSplit.getLocations()));

            id++;
        }

        return res;
    } catch (IOException | ClassNotFoundException e) {
        throw new IgniteCheckedException(e);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();

        throw new IgniteInterruptedCheckedException(e);
    }
}
From source file: org.apache.ignite.internal.processors.hadoop.v2.GridHadoopV2Splitter.java
License: Apache License
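The same mapping in the Grid-prefixed variant of the Ignite API.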
/**
 * @param ctx Job context.
 * @return Collection of mapped splits.
 * @throws IgniteCheckedException If mapping failed.
 */
public static Collection<GridHadoopInputSplit> splitJob(JobContext ctx) throws IgniteCheckedException {
    try {
        InputFormat<?, ?> format = ReflectionUtils.newInstance(ctx.getInputFormatClass(), ctx.getConfiguration());

        assert format != null;

        List<InputSplit> splits = format.getSplits(ctx);

        Collection<GridHadoopInputSplit> res = new ArrayList<>(splits.size());

        int id = 0;

        for (InputSplit nativeSplit : splits) {
            if (nativeSplit instanceof FileSplit) {
                FileSplit s = (FileSplit) nativeSplit;

                res.add(new GridHadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(), s.getLength()));
            } else
                res.add(GridHadoopUtils.wrapSplit(id, nativeSplit, nativeSplit.getLocations()));

            id++;
        }

        return res;
    } catch (IOException | ClassNotFoundException e) {
        throw new IgniteCheckedException(e);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();

        throw new IgniteInterruptedCheckedException(e);
    }
}
From source file: org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit.java
License: Apache License
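Implements getLocations() for a combined split: the hosts of all wrapped splits are aggregated, weighted by split length, sorted in descending order of total bytes, and at most the top five hosts are returned (PIG-1648).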
@Override
@SuppressWarnings("unchecked")
public String[] getLocations() throws IOException, InterruptedException {
    if (locations == null) {
        HashMap<String, Long> locMap = new HashMap<String, Long>();
        Long lenInMap;
        for (InputSplit split : wrappedSplits) {
            String[] locs = split.getLocations();
            for (String loc : locs) {
                if ((lenInMap = locMap.get(loc)) == null)
                    locMap.put(loc, split.getLength());
                else
                    locMap.put(loc, lenInMap + split.getLength());
            }
        }
        Set<Map.Entry<String, Long>> entrySet = locMap.entrySet();
        Map.Entry<String, Long>[] hostSize = entrySet.toArray(new Map.Entry[entrySet.size()]);
        Arrays.sort(hostSize, new Comparator<Map.Entry<String, Long>>() {
            @Override
            public int compare(Entry<String, Long> o1, Entry<String, Long> o2) {
                long diff = o1.getValue() - o2.getValue();
                if (diff < 0) return 1;
                if (diff > 0) return -1;
                return 0;
            }
        });
        // at most 5 locations are kept in the list: refer to PIG-1648 for more details
        int nHost = Math.min(hostSize.length, 5);
        locations = new String[nHost];
        for (int i = 0; i < nHost; ++i) {
            locations[i] = hostSize[i].getKey();
        }
    }
    return locations;
}