List of usage examples for org.apache.hadoop.mapreduce.InputSplit.getLocations()
public abstract String[] getLocations() throws IOException, InterruptedException;
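Before the per-project examples, here is a minimal, self-contained sketch of the calling pattern they all share: obtain a list of splits, then ask each split for its preferred hosts. getLocations() returns hostname hints used for data-local task scheduling; the array may be empty, and the call may block or fail, hence the checked IOException and InterruptedException. The class and method names below (SplitLocationsDump, dump) are illustrative only and not part of any project listed here.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.mapreduce.InputSplit;

public class SplitLocationsDump {
    // Logs each split's length and the hosts that hold its data.
    // Propagates the checked exceptions; several of the examples below
    // instead catch them and wrap them in a RuntimeException.
    public static void dump(List<InputSplit> splits) throws IOException, InterruptedException {
        for (InputSplit split : splits) {
            String[] locations = split.getLocations(); // hostname hints, possibly empty
            System.out.println("split length=" + split.getLength()
                    + " locations=" + String.join(",", locations));
        }
    }
}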
From source file:org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil.java
License:Apache License
public static List<List<InputSplit>> getCombinePigSplits(List<InputSplit> oneInputSplits,
        long maxCombinedSplitSize, Configuration conf) throws IOException, InterruptedException {
    ArrayList<Node> nodes = new ArrayList<Node>();
    HashMap<String, Node> nodeMap = new HashMap<String, Node>();
    List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
    List<Long> resultLengths = new ArrayList<Long>();
    long comparableSplitId = 0;

    int size = 0, nSplits = oneInputSplits.size();
    InputSplit lastSplit = null;
    int emptyCnt = 0;

    for (InputSplit split : oneInputSplits) {
        if (split.getLength() == 0) {
            emptyCnt++;
            continue;
        }
        if (split.getLength() >= maxCombinedSplitSize) {
            comparableSplitId++;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(split);
            result.add(combinedSplits);
            resultLengths.add(split.getLength());
        } else {
            ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
            String[] locations = split.getLocations();
            // sort the locations to stabilize the number of maps: PIG-1757
            Arrays.sort(locations);
            HashSet<String> locationSeen = new HashSet<String>();
            for (String location : locations) {
                if (!locationSeen.contains(location)) {
                    Node node = nodeMap.get(location);
                    if (node == null) {
                        node = new Node();
                        nodes.add(node);
                        nodeMap.put(location, node);
                    }
                    node.add(csplit);
                    csplit.add(node);
                    locationSeen.add(location);
                }
            }
            lastSplit = split;
            size++;
        }
    }

    /* verification code: debug purpose
    {
        ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        HashSet<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            if (node.getLength() > 0) {
                ArrayList<ComparableSplit> splits = node.getSplits();
                for (ComparableSplit split : splits) {
                    if (!seen.contains(split.getSplit())) {
                        // remove duplicates. The set has to be on the raw input split, not the
                        // comparable input split, as the latter overrides the compareTo method,
                        // so its equality semantics are changed and not what we want here
                        seen.add(split.getSplit());
                        leftoverSplits.add(split);
                    }
                }
            }
        }
        int combinedSplitLen = 0;
        for (PigSplit split : result)
            combinedSplitLen += split.getNumPaths();
        if (combinedSplitLen + leftoverSplits.size() != nSplits - emptyCnt) {
            throw new AssertionError("number of combined splits {" + combinedSplitLen + "+"
                + leftoverSplits.size() + "-" + size
                + "} does not match the number of original splits [" + nSplits + "].");
        }
    }
    */

    if (nSplits > 0 && emptyCnt == nSplits) {
        // if all splits are empty, add a single empty split as currently an empty directory is
        // not properly handled somewhere
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(oneInputSplits.get(0));
        result.add(combinedSplits);
    } else if (size == 1) {
        ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        combinedSplits.add(lastSplit);
        result.add(combinedSplits);
    } else if (size > 1) {
        // combine small splits
        Collections.sort(nodes, nodeComparator);
        DummySplit dummy = new DummySplit();
        // dummy is used to search for the next split of suitable size to be combined
        ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
        for (Node node : nodes) {
            // sort the splits on this node in descending order
            node.sort();
            long totalSize = 0;
            ArrayList<ComparableSplit> splits = node.getSplits();
            int idx;
            int lenSplits;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            while (!splits.isEmpty()) {
                combinedSplits.add(splits.get(0).getSplit());
                combinedComparableSplits.add(splits.get(0));
                int startIdx = 1;
                lenSplits = splits.size();
                totalSize += splits.get(0).getSplit().getLength();
                long spaceLeft = maxCombinedSplitSize - totalSize;
                dummy.setLength(spaceLeft);
                idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                        dummyComparableSplit);
                idx = -idx - 1 + startIdx;
                while (idx < lenSplits) {
                    long thisLen = splits.get(idx).getSplit().getLength();
                    combinedSplits.add(splits.get(idx).getSplit());
                    combinedComparableSplits.add(splits.get(idx));
                    totalSize += thisLen;
                    spaceLeft -= thisLen;
                    if (spaceLeft <= 0)
                        break;
                    // find next combinable chunk
                    startIdx = idx + 1;
                    if (startIdx >= lenSplits)
                        break;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                }
                if (totalSize > maxCombinedSplitSize / 2) {
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    removeSplits(combinedComparableSplits);
                    totalSize = 0;
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    splits = node.getSplits();
                } else {
                    if (combinedSplits.size() != lenSplits)
                        throw new AssertionError("Combined split logic error!");
                    break;
                }
            }
        }
        // handle leftovers
        ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
        HashSet<InputSplit> seen = new HashSet<InputSplit>();
        for (Node node : nodes) {
            for (ComparableSplit split : node.getSplits()) {
                if (!seen.contains(split.getSplit())) {
                    // remove duplicates. The set has to be on the raw input split, not the
                    // comparable input split, as the latter overrides the compareTo method,
                    // so its equality semantics are changed and not what we want here
                    seen.add(split.getSplit());
                    leftoverSplits.add(split);
                }
            }
        }

        /* verification code
        int combinedSplitLen = 0;
        for (PigSplit split : result)
            combinedSplitLen += split.getNumPaths();
        if (combinedSplitLen + leftoverSplits.size() != nSplits - emptyCnt)
            throw new AssertionError("number of combined splits [" + combinedSplitLen + "+"
                + leftoverSplits.size() + "] does not match the number of original splits ["
                + nSplits + "].");
        */

        if (!leftoverSplits.isEmpty()) {
            long totalSize = 0;
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
            int splitLen = leftoverSplits.size();
            for (int i = 0; i < splitLen; i++) {
                ComparableSplit split = leftoverSplits.get(i);
                long thisLen = split.getSplit().getLength();
                if (totalSize + thisLen >= maxCombinedSplitSize) {
                    removeSplits(combinedComparableSplits);
                    result.add(combinedSplits);
                    resultLengths.add(totalSize);
                    combinedSplits = new ArrayList<InputSplit>();
                    combinedComparableSplits.clear();
                    totalSize = 0;
                }
                combinedSplits.add(split.getSplit());
                combinedComparableSplits.add(split);
                totalSize += split.getSplit().getLength();
                if (i == splitLen - 1) {
                    // last piece: it could be very small, so try to see if it can be squeezed
                    // into any existing split
                    for (int j = 0; j < result.size(); j++) {
                        if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                            List<InputSplit> isList = result.get(j);
                            for (InputSplit csplit : combinedSplits) {
                                isList.add(csplit);
                            }
                            removeSplits(combinedComparableSplits);
                            combinedSplits.clear();
                            break;
                        }
                    }
                    if (!combinedSplits.isEmpty()) {
                        // last piece cannot be squeezed in; create a new combined split for it
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                    }
                }
            }
        }
    }

    /* verification code
    int combinedSplitLen = 0;
    for (PigSplit split : result)
        combinedSplitLen += split.getNumPaths();
    if (combinedSplitLen != nSplits - emptyCnt)
        throw new AssertionError("number of combined splits [" + combinedSplitLen
            + "] does not match the number of original splits [" + nSplits + "].");
    long totalLen = 0;
    for (PigSplit split : result)
        totalLen += split.getLength();
    long origTotalLen = 0;
    for (InputSplit split : oneInputSplits)
        origTotalLen += split.getLength();
    if (totalLen != origTotalLen)
        throw new AssertionError("The total length [" + totalLen + "] does not match the original ["
            + origTotalLen + "]");
    */

    log.info("Total input paths (combined) to process : " + result.size());
    return result;
}
From source file:org.deeplearning4j.iterativereduce.impl.reader.CanovaInputSplit.java
License:Apache License
public CanovaInputSplit(InputSplit split) {
    this.split = split;
    try {
        String[] locations = split.getLocations();
        uris = new URI[locations.length];
        for (int i = 0; i < locations.length; i++) {
            uris[i] = URI.create(locations[i]);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:org.gridgain.grid.kernal.processors.hadoop.v2.GridHadoopV2Splitter.java
License:Open Source License
/**
 * @param ctx Job context.
 * @return Collection of mapped splits.
 * @throws GridException If mapping failed.
 */
public static Collection<GridHadoopInputSplit> splitJob(JobContext ctx) throws GridException {
    try {
        InputFormat<?, ?> format = ReflectionUtils.newInstance(ctx.getInputFormatClass(),
                ctx.getConfiguration());

        assert format != null;

        List<InputSplit> splits = format.getSplits(ctx);

        Collection<GridHadoopInputSplit> res = new ArrayList<>(splits.size());

        int id = 0;

        for (InputSplit nativeSplit : splits) {
            if (nativeSplit instanceof FileSplit) {
                FileSplit s = (FileSplit) nativeSplit;

                res.add(new GridHadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(),
                        s.getLength()));
            } else
                res.add(GridHadoopUtils.wrapSplit(id, nativeSplit, nativeSplit.getLocations()));

            id++;
        }

        return res;
    } catch (IOException | ClassNotFoundException e) {
        throw new GridException(e);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();

        throw new GridInterruptedException(e);
    }
}
From source file:org.janusgraph.hadoop.formats.cassandra.CqlBridgeRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    this.split = (ColumnFamilySplit) split;
    Configuration conf = HadoopCompat.getConfiguration(context);
    totalRowCount = (this.split.getLength() < Long.MAX_VALUE) ? (int) this.split.getLength()
            : ConfigHelper.getInputSplitSize(conf);
    cfName = ConfigHelper.getInputColumnFamily(conf);
    keyspace = ConfigHelper.getInputKeyspace(conf);
    partitioner = ConfigHelper.getInputPartitioner(conf);
    inputColumns = CqlConfigHelper.getInputcolumns(conf);
    userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);

    try {
        if (cluster != null) {
            return;
        }
        // create a Cluster instance
        String[] locations = split.getLocations();
        // cluster = CqlConfigHelper.getInputCluster(locations, conf);
        // disregard the conf as it brings some unforeseen issues.
        cluster = Cluster.builder().addContactPoints(locations).build();
    } catch (Exception e) {
        throw new RuntimeException(
                "Unable to create cluster for table: " + cfName + ", in keyspace: " + keyspace, e);
    }

    // the cluster should now refer to a valid cluster
    session = cluster.connect(quote(keyspace));
    Preconditions.checkNotNull(session, "Can't create connection session");

    // get the negotiated serialization protocol
    nativeProtocolVersion = cluster.getConfiguration().getProtocolOptions().getProtocolVersion().toInt();

    // If the user provides a CQL query then we will use it without validation;
    // otherwise we will fall back to building a query using:
    //   inputColumns
    //   whereClauses
    cqlQuery = CqlConfigHelper.getInputCql(conf);

    // validate that the user hasn't tried to give us a custom query along with input columns
    // and where clauses
    if (StringUtils.isNotEmpty(cqlQuery)
            && (StringUtils.isNotEmpty(inputColumns) || StringUtils.isNotEmpty(userDefinedWhereClauses))) {
        throw new AssertionError("Cannot define a custom query with input columns and / or where clauses");
    }

    if (StringUtils.isEmpty(cqlQuery)) {
        cqlQuery = buildQuery();
    }
    log.trace("cqlQuery {}", cqlQuery);
    distinctKeyIterator = new DistinctKeyIterator();
    log.trace("created {}", distinctKeyIterator);
}
From source file:org.mrgeo.data.accumulo.image.AccumuloMrsImagePyramidInputFormat.java
License:Apache License
public static RecordReader<TileIdWritable, RasterWritable> makeRecordReader() {
    return new RecordReaderBase<TileIdWritable, RasterWritable>() {
        @Override
        public void initialize(InputSplit inSplit, TaskAttemptContext attempt) throws IOException {
            // RangeInputSplit ris = (RangeInputSplit) ((TiledInputSplit) inSplit).getWrappedSplit();
            //
            // log.info("initializing with instance of " + ris.getInstanceName());
            // log.info("initializing with auths of " + ris.getAuths().toString());
            //
            // super.initialize(((TiledInputSplit) inSplit).getWrappedSplit(), attempt);

            log.info("initializing input splits of type " + inSplit.getClass().getCanonicalName());

            String[] locs;
            try {
                locs = inSplit.getLocations();
                for (int x = 0; x < locs.length; x++) {
                    log.info("location " + x + " -> " + locs[x]);
                }
            } catch (InterruptedException ie) {
                ie.printStackTrace();
                return;
            }

            if (inSplit instanceof TiledInputSplit) {
                // deal with this
                org.apache.accumulo.core.client.mapreduce.RangeInputSplit ris =
                        new org.apache.accumulo.core.client.mapreduce.RangeInputSplit();
                InputSplit inS = ((TiledInputSplit) inSplit).getWrappedSplit();
                log.info("input split class: " + inS.getClass().getCanonicalName());

                long startId = ((TiledInputSplit) inSplit).getStartTileId();
                long endId = ((TiledInputSplit) inSplit).getEndTileId();
                Key startKey = AccumuloUtils.toKey(startId);
                Key endKey = AccumuloUtils.toKey(endId);
                int zoomL = ((TiledInputSplit) inSplit).getZoomLevel();
                Range r = new Range(startKey, endKey);
                log.info("Zoom Level = " + zoomL);
                log.info("Range " + startId + " to " + endId);

                try {
                    locs = inS.getLocations();
                    for (int x = 0; x < locs.length; x++) {
                        log.info("split " + x + " -> " + locs[x]);
                    }

                    ris.setRange(r);
                    ris.setLocations(locs);
                    ris.setTableName(
                            ((org.apache.accumulo.core.client.mapreduce.RangeInputSplit) inS).getTableName());
                    ris.setTableId(
                            ((org.apache.accumulo.core.client.mapreduce.RangeInputSplit) inS).getTableId());
                    // there can be more added here
                } catch (InterruptedException ie) {
                    throw new RuntimeErrorException(new Error(ie.getMessage()));
                }

                if (ris == null) {
                    log.info("range input split is null");
                } else {
                    log.info("table " + ris.getTableName() + " is offline: " + ris.isOffline());
                }

                super.initialize(ris, attempt);
                // super.initialize(((TiledInputSplit) inSplit).getWrappedSplit(), attempt);
            } else {
                super.initialize(inSplit, attempt);
            }
        } // end initialize

        @Override
        public void close() {
            log.info("Record Reader closing!");
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (scannerIterator.hasNext()) {
                ++numKeysRead;
                Entry<Key, Value> entry = scannerIterator.next();

                // transform key and value
                long id = AccumuloUtils.toLong(entry.getKey().getRow());
                currentKey = entry.getKey();
                // currentValue = entry.getValue();
                log.info("Processing " + id + " -> " + entry.getValue().getSize());
                currentK = new TileIdWritable(id);
                currentV = new RasterWritable(entry.getValue().get());
                // log.info("current key = " + id);

                // if (log.isTraceEnabled())
                //     log.trace("Processing key/value pair: " + DefaultFormatter.formatEntry(entry, true));

                return true;
            }
            return false;
        }
    }; // end RecordReaderBase
}
From source file:org.mrgeo.data.accumulo.image.AccumuloMrsPyramidInputFormat.java
License:Apache License
public static RecordReader<TileIdWritable, RasterWritable> makeRecordReader() {
    return new RecordReaderBase<TileIdWritable, RasterWritable>() {
        @Override
        public void initialize(InputSplit inSplit, TaskAttemptContext attempt) throws IOException {
            // RangeInputSplit ris = (RangeInputSplit) ((TiledInputSplit) inSplit).getWrappedSplit();
            //
            // log.info("initializing with instance of " + ris.getInstanceName());
            // log.info("initializing with auths of " + ris.getAuths().toString());
            //
            // super.initialize(((TiledInputSplit) inSplit).getWrappedSplit(), attempt);

            log.info("initializing input splits of type " + inSplit.getClass().getCanonicalName());

            String[] locs;
            try {
                locs = inSplit.getLocations();
                for (int x = 0; x < locs.length; x++) {
                    log.info("location " + x + " -> " + locs[x]);
                }
            } catch (InterruptedException ie) {
                log.error("Exception thrown", ie);
                return;
            }

            if (inSplit instanceof TiledInputSplit) {
                // deal with this
                org.apache.accumulo.core.client.mapreduce.RangeInputSplit ris =
                        new org.apache.accumulo.core.client.mapreduce.RangeInputSplit();
                InputSplit inS = ((TiledInputSplit) inSplit).getWrappedSplit();
                log.info("input split class: " + inS.getClass().getCanonicalName());

                long startId = ((TiledInputSplit) inSplit).getStartTileId();
                long endId = ((TiledInputSplit) inSplit).getEndTileId();
                Key startKey = AccumuloUtils.toKey(startId);
                Key endKey = AccumuloUtils.toKey(endId);
                int zoomL = ((TiledInputSplit) inSplit).getZoomLevel();
                Range r = new Range(startKey, endKey);
                log.info("Zoom Level = " + zoomL);
                log.info("Range " + startId + " to " + endId);

                try {
                    locs = inS.getLocations();
                    for (int x = 0; x < locs.length; x++) {
                        log.info("split " + x + " -> " + locs[x]);
                    }

                    ris.setRange(r);
                    ris.setLocations(locs);
                    ris.setTableName(
                            ((org.apache.accumulo.core.client.mapreduce.RangeInputSplit) inS).getTableName());
                    ris.setTableId(
                            ((org.apache.accumulo.core.client.mapreduce.RangeInputSplit) inS).getTableId());
                    // there can be more added here
                } catch (InterruptedException ie) {
                    throw new RuntimeErrorException(new Error(ie.getMessage()));
                }

                log.info("table " + ris.getTableName() + " is offline: " + ris.isOffline());

                super.initialize(ris, attempt);
                // super.initialize(((TiledInputSplit) inSplit).getWrappedSplit(), attempt);
            } else {
                super.initialize(inSplit, attempt);
            }
        } // end initialize

        @Override
        public void close() {
            log.info("Record Reader closing!");
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (scannerIterator.hasNext()) {
                ++numKeysRead;
                Entry<Key, Value> entry = scannerIterator.next();

                // transform key and value
                long id = AccumuloUtils.toLong(entry.getKey().getRow());
                currentKey = entry.getKey();
                // currentValue = entry.getValue();
                log.info("Processing " + id + " -> " + entry.getValue().getSize());
                currentK = new TileIdWritable(id);
                DataInputBuffer dib = new DataInputBuffer();
                byte[] data = entry.getValue().get();
                dib.reset(data, data.length);
                currentV = new RasterWritable();
                currentV.readFields(dib);
                // log.info("current key = " + id);

                // if (log.isTraceEnabled())
                //     log.trace("Processing key/value pair: " + DefaultFormatter.formatEntry(entry, true));

                return true;
            }
            return false;
        }
    }; // end RecordReaderBase
}