List of usage examples for org.apache.hadoop.mapreduce Job getJobID
public JobID getJobID()
From source file: org.wonderbee.elasticsearch.hive.ElasticSearchHiveInputFormat.java
License:Apache License
/** The number of splits is specified in the Hadoop configuration object. *///from ww w . j a v a2s . c o m public InputSplit[] getSplits(JobConf conf, int numSplitsHint) throws IOException { this.conf = conf; this.indexName = conf.get(ES_INDEX_NAME); this.requestSize = Integer.parseInt(conf.get(ES_REQUEST_SIZE, "1000")); this.hostPort = conf.get(ElasticSearchStorageHandler.ES_HOSTPORT); System.setProperty(ES_CONFIG, conf.get(ES_CONFIG)); System.setProperty(ES_PLUGINS, conf.get(ES_PLUGINS)); start_embedded_client(); LOG.info("Admin client started"); //Get the mapping of shards to node/hosts with primary ClusterState clusterState = client.admin().cluster().prepareState().execute().actionGet().state(); Map<Integer, String[]> shardToHost = new LinkedHashMap<Integer, String[]>(); for (IndexRoutingTable indexRoutingTable : clusterState.routingTable()) { for (IndexShardRoutingTable indexShardRoutingTable : indexRoutingTable) { for (ShardRouting shardRouting : indexShardRoutingTable.getAssignedShards()) { if (shardRouting.shardId().index().getName().equals(this.indexName) && shardRouting.primary()) { InetSocketTransportAddress address = (InetSocketTransportAddress) clusterState.nodes() .get(shardRouting.currentNodeId()).getAddress(); int shardId = shardRouting.shardId().getId(); String hostPort = address.address().getHostName() + ":" + address.address().getPort(); String nodeName = shardRouting.currentNodeId(); shardToHost.put(shardId, new String[] { hostPort, nodeName }); } } } } this.client.close(); LOG.info("Admin client closed"); List<InputSplit> splits = new ArrayList<InputSplit>(shardToHost.size()); Job job = new Job(conf); JobContext jobContext = new JobContext(job.getConfiguration(), job.getJobID()); Path[] tablePaths = FileInputFormat.getInputPaths(jobContext); //Consultation with kimchy revealed it should be more efficient to just have as many splits //as shards. 
for (Map.Entry<Integer, String[]> pair : shardToHost.entrySet()) { int shard = pair.getKey(); String shardHostPort = pair.getValue()[0]; String nodeName = pair.getValue()[1]; LOG.debug("Created split: shard:" + shard + ", host:" + shardHostPort + ", node:" + nodeName); splits.add(new HiveInputFormat.HiveInputSplit(new ElasticSearchSplit(0, this.requestSize, shardHostPort, nodeName, shard, tablePaths[0].toString()), "ElasticSearchSplit")); } return splits.toArray(new InputSplit[splits.size()]); }