List of usage examples for org.apache.hadoop.mapred InputSplit getLocations
String[] getLocations() throws IOException;
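getLocations() reports the hostnames on which a split's data is locally available, so schedulers and tools can place work near the input. Before the examples from real projects below, here is a minimal sketch of the typical call pattern; the input path and the use of TextInputFormat are hypothetical placeholders, not taken from any of the sources listed here.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SplitLocationsExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical configuration; replace the path with a real input directory.
        JobConf job = new JobConf();
        job.setInputFormat(TextInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("/tmp/input"));

        // Ask the input format for splits, then print where each split's data lives.
        for (InputSplit split : job.getInputFormat().getSplits(job, 1)) {
            System.out.print("Length: " + split.getLength() + ", Locations: ");
            for (String location : split.getLocations()) {
                System.out.print(location + " ");
            }
            System.out.println();
        }
    }
}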
From source file:alluxio.hadoop.HadoopUtils.java
License:Apache License
/**
 * Returns a string representation of a {@link InputSplit}.
 *
 * @param is Hadoop {@link InputSplit}
 * @return its string representation
 */
public static String toStringHadoopInputSplit(InputSplit is) {
    StringBuilder sb = new StringBuilder("HadoopInputSplit: ");
    try {
        sb.append(" Length: ").append(is.getLength());
        sb.append(" , Locations: ");
        for (String loc : is.getLocations()) {
            sb.append(loc).append(" ; ");
        }
    } catch (IOException e) {
        LOG.error(e.getMessage());
    }
    return sb.toString();
}
From source file:com.ibm.jaql.lang.expr.io.InputSplitsFn.java
License:Apache License
@Override
public JsonIterator iter(Context context) throws Exception {
    JsonValue iod = exprs[0].eval(context);
    Adapter adapter = JaqlUtil.getAdapterStore().input.getAdapter(iod);
    if (!(adapter instanceof HadoopInputAdapter)) {
        throw new ClassCastException("i/o descriptor must be for an input format");
    }
    HadoopInputAdapter hia = (HadoopInputAdapter) adapter;
    JobConf conf = new JobConf(); // TODO: allow configuration
    hia.setParallel(conf); // right thing to do?
    hia.configure(conf); // right thing to do?
    int numSplits = conf.getNumMapTasks(); // TODO: allow override
    final InputSplit[] splits = hia.getSplits(conf, numSplits);
    final MutableJsonString className = new MutableJsonString();
    final MutableJsonBinary rawSplit = new MutableJsonBinary();
    final BufferedJsonRecord rec = new BufferedJsonRecord(3);
    final BufferedJsonArray locArr = new BufferedJsonArray();
    rec.add(CLASS_TAG, className);
    rec.add(SPLIT_TAG, rawSplit);
    rec.add(LOCATIONS_TAG, locArr);

    return new JsonIterator(rec) {
        DataOutputBuffer out = new DataOutputBuffer();
        int i = 0;

        @Override
        public boolean moveNext() throws Exception {
            if (i >= splits.length) {
                return false;
            }
            InputSplit split = splits[i++];
            className.setCopy(split.getClass().getName());
            out.reset();
            split.write(out);
            rawSplit.setCopy(out.getData(), out.getLength());
            locArr.clear();
            String[] locs = split.getLocations();
            if (locs != null) {
                for (String loc : locs) {
                    locArr.add(new JsonString(loc));
                }
            }
            return true;
        }
    };
}
From source file:com.moz.fiji.hive.FijiTableInputFormat.java
License:Apache License
/**
 * Returns an object responsible for generating records contained in a
 * given input split.
 *
 * @param split The input split to create a record reader for.
 * @param job The job configuration.
 * @param reporter A job info reporter (for counters, status, etc.).
 * @return The record reader.
 * @throws IOException If there is an error.
 */
@Override
public RecordReader<ImmutableBytesWritable, FijiRowDataWritable> getRecordReader(InputSplit split,
        JobConf job, Reporter reporter) throws IOException {
    LOG.info("Getting record reader {}", split.getLocations());
    return new FijiTableRecordReader((FijiTableInputSplit) split, job);
}
From source file:com.yahoo.druid.hadoop.HiveDatasourceInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    logger.info("checkPost #5");

    String overlordUrl = jobConf.get(CONF_DRUID_OVERLORD_HOSTPORT);
    Preconditions.checkArgument(overlordUrl != null && !overlordUrl.isEmpty(),
            CONF_DRUID_OVERLORD_HOSTPORT + " not defined");
    logger.info("druid overlord url = " + overlordUrl);

    String schemaStr = jobConf.get(CONF_DRUID_SCHEMA);
    Preconditions.checkArgument(schemaStr != null && !schemaStr.isEmpty(),
            "schema undefined, provide " + CONF_DRUID_SCHEMA);
    logger.info("schema = " + schemaStr);

    DatasourceIngestionSpec ingestionSpec = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(schemaStr,
            DatasourceIngestionSpec.class);

    String segmentsStr = getSegmentsToLoad(ingestionSpec.getDataSource(), ingestionSpec.getIntervals(),
            overlordUrl);
    logger.info("segments list received from overlord = " + segmentsStr);

    List<DataSegment> segmentsList = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(segmentsStr,
            new TypeReference<List<DataSegment>>() {
            });
    VersionedIntervalTimeline<String, DataSegment> timeline = new VersionedIntervalTimeline<>(
            Ordering.natural());
    for (DataSegment segment : segmentsList) {
        timeline.add(segment.getInterval(), segment.getVersion(), segment.getShardSpec().createChunk(segment));
    }
    final List<TimelineObjectHolder<String, DataSegment>> timeLineSegments = timeline
            .lookup(ingestionSpec.getIntervals().get(0));
    final List<WindowedDataSegment> windowedSegments = new ArrayList<>();
    for (TimelineObjectHolder<String, DataSegment> holder : timeLineSegments) {
        for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
            windowedSegments.add(new WindowedDataSegment(chunk.getObject(), holder.getInterval()));
        }
    }
    jobConf.set(CONF_INPUT_SEGMENTS,
            HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(windowedSegments));

    segmentsStr = Preconditions.checkNotNull(jobConf.get(CONF_INPUT_SEGMENTS), "No segments found to read");
    List<WindowedDataSegment> segments = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(segmentsStr,
            new TypeReference<List<WindowedDataSegment>>() {
            });
    if (segments == null || segments.size() == 0) {
        throw new ISE("No segments found to read");
    }
    logger.info("segments to read " + segmentsStr);

    long maxSize = numSplits;
    if (maxSize > 0) {
        // combining is to happen, let us sort the segments list by size so that
        // they are combined appropriately
        Collections.sort(segments, new Comparator<WindowedDataSegment>() {
            @Override
            public int compare(WindowedDataSegment s1, WindowedDataSegment s2) {
                return Long.compare(s1.getSegment().getSize(), s2.getSegment().getSize());
            }
        });
    }

    List<InputSplit> splits = Lists.newArrayList();
    List<WindowedDataSegment> list = new ArrayList<>();
    long size = 0;

    // JobConf dummyConf = new JobConf();
    Job job = new Job(jobConf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] paths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(jobContext);
    logger.info("dummyPath : " + paths);
    jobConf.set("druid.hive.dummyfilename", paths[0].toString());

    InputFormat fio = supplier.get();
    for (WindowedDataSegment segment : segments) {
        if (size + segment.getSegment().getSize() > maxSize && size > 0) {
            splits.add(toDataSourceSplit(list, fio, jobConf, paths[0]));
            list = Lists.newArrayList();
            size = 0;
        }
        list.add(segment);
        size += segment.getSegment().getSize();
    }
    if (list.size() > 0) {
        splits.add(toDataSourceSplit(list, fio, jobConf, paths[0]));
    }

    logger.info("Number of splits: " + splits.size());
    for (InputSplit split : splits) {
        logger.info(split.getClass().getName());
        for (String location : split.getLocations()) {
            logger.info(location);
        }
    }
    return Iterables.toArray(splits, InputSplit.class);
}
From source file:com.yahoo.druid.hadoop.HiveDatasourceInputFormat.java
License:Apache License
private String[] getFrequentLocations(List<WindowedDataSegment> segments, InputFormat fio, JobConf conf)
        throws IOException {
    Iterable<String> locations = Collections.emptyList();
    for (WindowedDataSegment segment : segments) {
        FileInputFormat.setInputPaths(conf, new Path(JobHelper.getURIFromSegment(segment.getSegment())));
        logger.info("CheckPost 4" + fio.getClass());
        for (InputSplit split : fio.getSplits(conf, 1)) {
            locations = Iterables.concat(locations, Arrays.asList(split.getLocations()));
        }
    }
    return getFrequentLocations(locations);
}
From source file:edu.uci.ics.asterix.external.indexing.dataflow.IndexingScheduler.java
License:Apache License
/**
 * Scan the splits once and build a popularity map
 *
 * @param splits
 *            the split array
 * @param locationToNumOfSplits
 *            the map to be built
 * @throws IOException
 */
private void buildPopularityMap(InputSplit[] splits, Map<String, IntWritable> locationToNumOfSplits)
        throws IOException {
    for (InputSplit split : splits) {
        String[] locations = split.getLocations();
        for (String loc : locations) {
            IntWritable locCount = locationToNumOfSplits.get(loc);
            if (locCount == null) {
                locCount = new IntWritable(0);
                locationToNumOfSplits.put(loc, locCount);
            }
            locCount.set(locCount.get() + 1);
        }
    }
}
From source file:edu.uci.ics.hyracks.hdfs.scheduler.IPProximityNcCollectionBuilder.java
License:Apache License
@Override
public INcCollection build(Map<String, NodeControllerInfo> ncNameToNcInfos,
        final Map<String, List<String>> ipToNcMapping, final Map<String, Integer> ncNameToIndex, String[] NCs,
        final int[] workloads, final int slotLimit) {
    final TreeMap<BytesWritable, IntWritable> availableIpsToSlots = new TreeMap<BytesWritable, IntWritable>();
    for (int i = 0; i < workloads.length; i++) {
        if (workloads[i] < slotLimit) {
            byte[] rawip;
            try {
                rawip = ncNameToNcInfos.get(NCs[i]).getNetworkAddress().lookupIpAddress();
            } catch (UnknownHostException e) {
                // QQQ Should probably have a neater solution than this
                throw new RuntimeException(e);
            }
            BytesWritable ip = new BytesWritable(rawip);
            IntWritable availableSlot = availableIpsToSlots.get(ip);
            if (availableSlot == null) {
                availableSlot = new IntWritable(slotLimit - workloads[i]);
                availableIpsToSlots.put(ip, availableSlot);
            } else {
                availableSlot.set(slotLimit - workloads[i] + availableSlot.get());
            }
        }
    }

    return new INcCollection() {

        @Override
        public String findNearestAvailableSlot(InputSplit split) {
            try {
                String[] locs = split.getLocations();
                int minDistance = Integer.MAX_VALUE;
                BytesWritable currentCandidateIp = null;
                if (locs != null && locs.length > 0) {
                    for (int j = 0; j < locs.length; j++) {
                        /**
                         * get all the IP addresses from the name
                         */
                        InetAddress[] allIps = InetAddress.getAllByName(locs[j]);
                        for (InetAddress ip : allIps) {
                            BytesWritable splitIp = new BytesWritable(ip.getAddress());
                            /**
                             * if the node controller exists
                             */
                            BytesWritable candidateNcIp = availableIpsToSlots.floorKey(splitIp);
                            if (candidateNcIp == null) {
                                candidateNcIp = availableIpsToSlots.ceilingKey(splitIp);
                            }
                            if (candidateNcIp != null) {
                                if (availableIpsToSlots.get(candidateNcIp).get() > 0) {
                                    byte[] candidateIP = candidateNcIp.getBytes();
                                    byte[] splitIP = splitIp.getBytes();
                                    int candidateInt = candidateIP[0] << 24 | (candidateIP[1] & 0xFF) << 16
                                            | (candidateIP[2] & 0xFF) << 8 | (candidateIP[3] & 0xFF);
                                    int splitInt = splitIP[0] << 24 | (splitIP[1] & 0xFF) << 16
                                            | (splitIP[2] & 0xFF) << 8 | (splitIP[3] & 0xFF);
                                    int distance = Math.abs(candidateInt - splitInt);
                                    if (minDistance > distance) {
                                        minDistance = distance;
                                        currentCandidateIp = candidateNcIp;
                                    }
                                }
                            }
                        }
                    }
                } else {
                    for (Entry<BytesWritable, IntWritable> entry : availableIpsToSlots.entrySet()) {
                        if (entry.getValue().get() > 0) {
                            currentCandidateIp = entry.getKey();
                            break;
                        }
                    }
                }

                if (currentCandidateIp != null) {
                    /**
                     * Update the entry of the selected IP
                     */
                    IntWritable availableSlot = availableIpsToSlots.get(currentCandidateIp);
                    availableSlot.set(availableSlot.get() - 1);
                    if (availableSlot.get() == 0) {
                        availableIpsToSlots.remove(currentCandidateIp);
                    }

                    /**
                     * Update the entry of the selected NC
                     */
                    List<String> dataLocations = ipToNcMapping
                            .get(InetAddress.getByAddress(currentCandidateIp.getBytes()).getHostAddress());
                    for (String nc : dataLocations) {
                        int ncIndex = ncNameToIndex.get(nc);
                        if (workloads[ncIndex] < slotLimit) {
                            return nc;
                        }
                    }
                }
                /** not scheduled */
                return null;
            } catch (Exception e) {
                throw new IllegalStateException(e);
            }
        }

        @Override
        public int numAvailableSlots() {
            return availableIpsToSlots.size();
        }
    };
}
From source file:edu.uci.ics.hyracks.hdfs.scheduler.RackAwareNcCollectionBuilder.java
License:Apache License
@Override
public INcCollection build(Map<String, NodeControllerInfo> ncNameToNcInfos,
        final Map<String, List<String>> ipToNcMapping, final Map<String, Integer> ncNameToIndex, String[] NCs,
        final int[] workloads, final int slotLimit) {
    try {
        final Map<List<Integer>, List<String>> pathToNCs = new HashMap<List<Integer>, List<String>>();
        for (int i = 0; i < NCs.length; i++) {
            List<Integer> path = new ArrayList<Integer>();
            String ipAddress = InetAddress
                    .getByAddress(ncNameToNcInfos.get(NCs[i]).getNetworkAddress().lookupIpAddress())
                    .getHostAddress();
            topology.lookupNetworkTerminal(ipAddress, path);
            if (path.size() <= 0) {
                // if the hyracks nc is not in the defined cluster
                path.add(Integer.MIN_VALUE);
                LOGGER.info(NCs[i] + "'s IP address is not in the cluster topology file!");
            }
            List<String> ncs = pathToNCs.get(path);
            if (ncs == null) {
                ncs = new ArrayList<String>();
                pathToNCs.put(path, ncs);
            }
            ncs.add(NCs[i]);
        }

        final TreeMap<List<Integer>, IntWritable> availableIpsToSlots = new TreeMap<List<Integer>, IntWritable>(
                new Comparator<List<Integer>>() {

                    @Override
                    public int compare(List<Integer> l1, List<Integer> l2) {
                        int commonLength = Math.min(l1.size(), l2.size());
                        for (int i = 0; i < commonLength; i++) {
                            Integer value1 = l1.get(i);
                            Integer value2 = l2.get(i);
                            int cmp = value1 > value2 ? 1 : (value1 < value2 ? -1 : 0);
                            if (cmp != 0) {
                                return cmp;
                            }
                        }
                        return l1.size() > l2.size() ? 1 : (l1.size() < l2.size() ? -1 : 0);
                    }
                });
        for (int i = 0; i < workloads.length; i++) {
            if (workloads[i] < slotLimit) {
                List<Integer> path = new ArrayList<Integer>();
                String ipAddress = InetAddress
                        .getByAddress(ncNameToNcInfos.get(NCs[i]).getNetworkAddress().lookupIpAddress())
                        .getHostAddress();
                topology.lookupNetworkTerminal(ipAddress, path);
                if (path.size() <= 0) {
                    // if the hyracks nc is not in the defined cluster
                    path.add(Integer.MIN_VALUE);
                }
                IntWritable availableSlot = availableIpsToSlots.get(path);
                if (availableSlot == null) {
                    availableSlot = new IntWritable(slotLimit - workloads[i]);
                    availableIpsToSlots.put(path, availableSlot);
                } else {
                    availableSlot.set(slotLimit - workloads[i] + availableSlot.get());
                }
            }
        }

        return new INcCollection() {

            @Override
            public String findNearestAvailableSlot(InputSplit split) {
                try {
                    String[] locs = split.getLocations();
                    int minDistance = Integer.MAX_VALUE;
                    List<Integer> currentCandidatePath = null;
                    if (locs != null && locs.length > 0) {
                        for (int j = 0; j < locs.length; j++) {
                            /**
                             * get all the IP addresses from the name
                             */
                            InetAddress[] allIps = InetAddress.getAllByName(locs[j]);
                            boolean inTopology = false;
                            for (InetAddress ip : allIps) {
                                List<Integer> splitPath = new ArrayList<Integer>();
                                boolean inCluster = topology.lookupNetworkTerminal(ip.getHostAddress(), splitPath);
                                if (!inCluster) {
                                    continue;
                                }
                                inTopology = true;
                                /**
                                 * if the node controller exists
                                 */
                                List<Integer> candidatePath = availableIpsToSlots.floorKey(splitPath);
                                if (candidatePath == null) {
                                    candidatePath = availableIpsToSlots.ceilingKey(splitPath);
                                }
                                if (candidatePath != null) {
                                    if (availableIpsToSlots.get(candidatePath).get() > 0) {
                                        int distance = distance(splitPath, candidatePath);
                                        if (minDistance > distance) {
                                            minDistance = distance;
                                            currentCandidatePath = candidatePath;
                                        }
                                    }
                                }
                            }
                            if (!inTopology) {
                                LOGGER.info(locs[j] + "'s IP address is not in the cluster topology file!");
                                /**
                                 * if the machine is not in the topology file
                                 */
                                List<Integer> candidatePath = null;
                                for (Entry<List<Integer>, IntWritable> entry : availableIpsToSlots.entrySet()) {
                                    if (entry.getValue().get() > 0) {
                                        candidatePath = entry.getKey();
                                        break;
                                    }
                                }
                                /** the split path is empty */
                                if (candidatePath != null) {
                                    if (availableIpsToSlots.get(candidatePath).get() > 0) {
                                        currentCandidatePath = candidatePath;
                                    }
                                }
                            }
                        }
                    } else {
                        for (Entry<List<Integer>, IntWritable> entry : availableIpsToSlots.entrySet()) {
                            if (entry.getValue().get() > 0) {
                                currentCandidatePath = entry.getKey();
                                break;
                            }
                        }
                    }

                    if (currentCandidatePath != null && currentCandidatePath.size() > 0) {
                        /**
                         * Update the entry of the selected IP
                         */
                        IntWritable availableSlot = availableIpsToSlots.get(currentCandidatePath);
                        availableSlot.set(availableSlot.get() - 1);
                        if (availableSlot.get() == 0) {
                            availableIpsToSlots.remove(currentCandidatePath);
                        }

                        /**
                         * Update the entry of the selected NC
                         */
                        List<String> candidateNcs = pathToNCs.get(currentCandidatePath);
                        for (String candidate : candidateNcs) {
                            int ncIndex = ncNameToIndex.get(candidate);
                            if (workloads[ncIndex] < slotLimit) {
                                return candidate;
                            }
                        }
                    }
                    /** not scheduled */
                    return null;
                } catch (Exception e) {
                    throw new IllegalStateException(e);
                }
            }

            @Override
            public int numAvailableSlots() {
                return availableIpsToSlots.size();
            }

            private int distance(List<Integer> splitPath, List<Integer> candidatePath) {
                int commonLength = Math.min(splitPath.size(), candidatePath.size());
                int distance = 0;
                for (int i = 0; i < commonLength; i++) {
                    distance = distance * 100 + Math.abs(splitPath.get(i) - candidatePath.get(i));
                }
                List<Integer> restElements = splitPath.size() > candidatePath.size() ? splitPath : candidatePath;
                for (int i = commonLength; i < restElements.size(); i++) {
                    distance = distance * 100 + Math.abs(restElements.get(i));
                }
                return distance;
            }
        };
    } catch (Exception e) {
        throw new IllegalStateException(e);
    }
}
From source file:org.apache.drill.exec.store.hive.HiveInputReader.java
License:Apache License
public static void main(String args[]) throws Exception {
    /*
    String[] columnNames = {"n_nationkey", "n_name", "n_regionkey", "n_comment"};
    String[] columnTypes = {"bigint", "string", "bigint", "string"};

    List<FieldSchema> cols = Lists.newArrayList();
    for (int i = 0; i < columnNames.length; i++) {
      cols.add(new FieldSchema(columnNames[i], columnTypes[i], null));
    }
    String location = "file:///tmp/nation_s";
    String inputFormat = TextInputFormat.class.getCanonicalName();
    String serdeLib = LazySimpleSerDe.class.getCanonicalName();
    // String inputFormat = HiveHBaseTableInputFormat.class.getCanonicalName();
    // String serdeLib = HBaseSerDe.class.getCanonicalName();
    Map<String, String> serdeParams = new HashMap();
    // serdeParams.put("serialization.format", "1");
    // serdeParams.put("hbase.columns.mapping", ":key,f:name,f:regionkey,f:comment");
    serdeParams.put("serialization.format", "|");
    serdeParams.put("field.delim", "|");

    Map<String, String> tableParams = new HashMap();
    tableParams.put("hbase.table.name", "nation");

    SerDeInfo serDeInfo = new SerDeInfo(null, serdeLib, serdeParams);
    StorageDescriptor storageDescriptor = new StorageDescriptor(cols, location, inputFormat, null, false, -1,
        serDeInfo, null, null, null);
    Table table = new Table("table", "default", "sphillips", 0, 0, 0, storageDescriptor,
        new ArrayList<FieldSchema>(), tableParams, null, null, "MANAGED_TABLE");
    Properties properties = MetaStoreUtils.getTableMetadata(table);
    */

    HiveConf conf = new HiveConf();
    conf.set("hive.metastore.uris", "thrift://10.10.31.51:9083");
    HiveMetaStoreClient client = new HiveMetaStoreClient(conf);
    Table table = client.getTable("default", "nation");
    Properties properties = MetaStoreUtils.getTableMetadata(table);

    Path path = new Path(table.getSd().getLocation());
    JobConf job = new JobConf();
    for (Object obj : properties.keySet()) {
        job.set((String) obj, (String) properties.get(obj));
    }
    // job.set("hbase.zookeeper.quorum", "10.10.31.51");
    // job.set("hbase.zookeeper.property.clientPort", "5181");
    InputFormat f = (InputFormat) Class.forName(table.getSd().getInputFormat()).getConstructor().newInstance();
    job.setInputFormat(f.getClass());

    FileInputFormat.addInputPath(job, path);
    InputFormat format = job.getInputFormat();
    SerDe serde = (SerDe) Class.forName(table.getSd().getSerdeInfo().getSerializationLib()).getConstructor()
            .newInstance();
    serde.initialize(job, properties);
    ObjectInspector inspector = serde.getObjectInspector();
    ObjectInspector.Category cat = inspector.getCategory();

    TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromObjectInspector(inspector);

    List<String> columns = null;
    List<TypeInfo> colTypes = null;
    List<ObjectInspector> fieldObjectInspectors = Lists.newArrayList();

    switch (typeInfo.getCategory()) {
    case STRUCT:
        columns = ((StructTypeInfo) typeInfo).getAllStructFieldNames();
        colTypes = ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos();
        for (int i = 0; i < columns.size(); i++) {
            System.out.print(columns.get(i));
            System.out.print(" ");
            System.out.print(colTypes.get(i));
        }
        System.out.println("");
        for (StructField field : ((StructObjectInspector) inspector).getAllStructFieldRefs()) {
            fieldObjectInspectors.add(field.getFieldObjectInspector());
        }
    }

    for (InputSplit split : format.getSplits(job, 1)) {
        String encoded = serializeInputSplit(split);
        System.out.println(encoded);
        InputSplit newSplit = deserializeInputSplit(encoded, split.getClass().getCanonicalName());
        System.out.print("Length: " + newSplit.getLength() + " ");
        System.out.print("Locations: ");
        for (String loc : newSplit.getLocations()) {
            System.out.print(loc + " ");
        }
        System.out.println();
    }

    for (InputSplit split : format.getSplits(job, 1)) {
        RecordReader reader = format.getRecordReader(split, job, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        int count = 0;
        while (reader.next(key, value)) {
            List<Object> values = ((StructObjectInspector) inspector)
                    .getStructFieldsDataAsList(serde.deserialize((Writable) value));
            StructObjectInspector sInsp = (StructObjectInspector) inspector;
            Object obj = sInsp.getStructFieldData(serde.deserialize((Writable) value),
                    sInsp.getStructFieldRef("n_name"));
            System.out.println(obj);
            /*
            for (Object obj : values) {
              PrimitiveObjectInspector.PrimitiveCategory pCat = ((PrimitiveObjectInspector) fieldObjectInspectors.get(count)).getPrimitiveCategory();
              Object pObj = ((PrimitiveObjectInspector) fieldObjectInspectors.get(count)).getPrimitiveJavaObject(obj);
              System.out.print(pObj + " ");
            }
            */
            System.out.println("");
        }
    }
}
From source file:org.apache.hive.jdbc.BaseJdbcWithMiniLlap.java
License:Apache License
private int processQuery(String currentDatabase, String query, int numSplits, RowProcessor rowProcessor)
        throws Exception {
    String url = miniHS2.getJdbcURL();
    String user = System.getProperty("user.name");
    String pwd = user;
    String handleId = UUID.randomUUID().toString();

    InputFormat<NullWritable, Row> inputFormat = getInputFormat();

    // Get splits
    JobConf job = new JobConf(conf);
    job.set(LlapBaseInputFormat.URL_KEY, url);
    job.set(LlapBaseInputFormat.USER_KEY, user);
    job.set(LlapBaseInputFormat.PWD_KEY, pwd);
    job.set(LlapBaseInputFormat.QUERY_KEY, query);
    job.set(LlapBaseInputFormat.HANDLE_ID, handleId);
    if (currentDatabase != null) {
        job.set(LlapBaseInputFormat.DB_KEY, currentDatabase);
    }

    InputSplit[] splits = inputFormat.getSplits(job, numSplits);
    assertTrue(splits.length > 0);

    // Fetch rows from splits
    boolean first = true;
    int rowCount = 0;
    for (InputSplit split : splits) {
        System.out.println("Processing split " + split.getLocations());
        int numColumns = 2;
        RecordReader<NullWritable, Row> reader = inputFormat.getRecordReader(split, job, null);
        Row row = reader.createValue();
        while (reader.next(NullWritable.get(), row)) {
            rowProcessor.process(row);
            ++rowCount;
        }
        // In arrow-mode this will throw exception unless all buffers have been released
        // See org.apache.hadoop.hive.llap.LlapArrowBatchRecordReader
        reader.close();
    }
    LlapBaseInputFormat.close(handleId);

    return rowCount;
}