List of usage examples for com.google.common.collect.TreeMultiset#count(Object)
@Override public int count(@Nullable Object element)
From source file: com.metamx.druid.merger.common.task.IndexDeterminePartitionsTask.java
@Override
public TaskStatus run(TaskContext context, TaskToolbox toolbox) throws Exception {
    // Scans all input rows once to choose a partition dimension and cut shard
    // specs of roughly targetPartitionSize rows each, then returns one
    // follow-up IndexGeneratorTask per shard.
    //
    // This is similar to what DeterminePartitionsJob does in the hadoop
    // indexer, but we don't require a preconfigured partition dimension
    // (we'll just pick the one with highest cardinality).
    //
    // XXX - Space-efficiency (stores all unique dimension values, although at
    //       least not all combinations)
    // XXX - Time-efficiency (runs all this on one single node instead of
    //       through map/reduce)
    log.info("Running with targetPartitionSize[%d]", targetPartitionSize);

    // Blacklist dimensions that have multiple values per row; once a dimension
    // lands here it is never tracked again.
    final Set<String> unusableDimensions = Sets.newHashSet();

    // Track values of all non-blacklisted dimensions: per-dimension sorted
    // multiset of observed values (value -> occurrence count).
    final Map<String, TreeMultiset<String>> dimensionValueMultisets = Maps.newHashMap();

    // Pass 1: stream every row in this task's interval and record dimension values.
    final Firehose firehose = firehoseFactory.connect();
    try {
        while (firehose.hasMore()) {
            final InputRow inputRow = firehose.nextRow();
            if (getInterval().contains(inputRow.getTimestampFromEpoch())) {
                // Extract dimensions from event
                for (final String dim : inputRow.getDimensions()) {
                    final List<String> dimValues = inputRow.getDimension(dim);
                    if (!unusableDimensions.contains(dim)) {
                        if (dimValues.size() == 1) {
                            // Track this value, lazily creating the multiset for a new dimension.
                            TreeMultiset<String> dimensionValueMultiset = dimensionValueMultisets.get(dim);
                            if (dimensionValueMultiset == null) {
                                dimensionValueMultiset = TreeMultiset.create();
                                dimensionValueMultisets.put(dim, dimensionValueMultiset);
                            }
                            dimensionValueMultiset.add(dimValues.get(0));
                        } else {
                            // Only single-valued dimensions can be used for partitions
                            unusableDimensions.add(dim);
                            dimensionValueMultisets.remove(dim);
                        }
                    }
                }
            }
        }
    } finally {
        firehose.close();
    }

    // ShardSpecs for index generator tasks
    final List<ShardSpec> shardSpecs = Lists.newArrayList();

    // Orders candidate dimensions by number of distinct values (cardinality).
    Ordering<Map.Entry<String, TreeMultiset<String>>> byCardinalityOrdering = new Ordering<Map.Entry<String, TreeMultiset<String>>>() {
        @Override
        public int compare(Map.Entry<String, TreeMultiset<String>> left,
                Map.Entry<String, TreeMultiset<String>> right) {
            return Ints.compare(left.getValue().elementSet().size(), right.getValue().elementSet().size());
        }
    };

    if (dimensionValueMultisets.isEmpty()) {
        // No suitable partition dimension. We'll make one big segment and hope for the best.
        log.info("No suitable partition dimension found");
        shardSpecs.add(new NoneShardSpec());
    } else {
        // Find best partition dimension (heuristic: highest cardinality).
        final Map.Entry<String, TreeMultiset<String>> partitionEntry = byCardinalityOrdering
                .max(dimensionValueMultisets.entrySet());
        final String partitionDim = partitionEntry.getKey();
        final TreeMultiset<String> partitionDimValues = partitionEntry.getValue();

        log.info("Partitioning on dimension[%s] with cardinality[%d] over rows[%d]", partitionDim,
                partitionDimValues.elementSet().size(), partitionDimValues.size());

        // Pass 2: walk unique partition dimension values in sorted order,
        // accumulating per-value row counts; each time the running total
        // reaches targetPartitionSize, close a shard and start a new one.
        String currentPartitionStart = null;
        int currentPartitionSize = 0;
        for (final String partitionDimValue : partitionDimValues.elementSet()) {
            currentPartitionSize += partitionDimValues.count(partitionDimValue);
            if (currentPartitionSize >= targetPartitionSize) {
                final ShardSpec shardSpec = new SingleDimensionShardSpec(partitionDim, currentPartitionStart,
                        partitionDimValue, shardSpecs.size());
                log.info("Adding shard: %s", shardSpec);
                shardSpecs.add(shardSpec);
                // NOTE(review): the value that trips the threshold is counted
                // into the shard just closed AND seeds the next shard's size,
                // while also becoming the next shard's start — presumably
                // intentional if the closed shard's end bound is exclusive;
                // confirm against SingleDimensionShardSpec semantics.
                currentPartitionSize = partitionDimValues.count(partitionDimValue);
                currentPartitionStart = partitionDimValue;
            }
        }
        if (currentPartitionSize > 0) {
            // One last shard to go
            final ShardSpec shardSpec;
            if (shardSpecs.isEmpty()) {
                shardSpec = new NoneShardSpec();
            } else {
                shardSpec = new SingleDimensionShardSpec(partitionDim, currentPartitionStart, null,
                        shardSpecs.size());
            }
            log.info("Adding shard: %s", shardSpec);
            shardSpecs.add(shardSpec);
        }
    }

    // Hand back one follow-up IndexGeneratorTask per shard.
    return TaskStatus.continued(getId(), Lists.transform(shardSpecs, new Function<ShardSpec, Task>() {
        @Override
        public Task apply(ShardSpec shardSpec) {
            return new IndexGeneratorTask(getGroupId(), getInterval(), firehoseFactory, new Schema(
                    schema.getDataSource(), schema.getAggregators(), schema.getIndexGranularity(), shardSpec));
        }
    }));
}
From source file: io.druid.indexing.common.task.IndexDeterminePartitionsTask.java
@Override
public TaskStatus run(TaskToolbox toolbox) throws Exception {
    // Single-node determine-partitions: scan all rows once, pick the
    // single-valued dimension with the highest cardinality, cut shard specs of
    // roughly targetPartitionSize rows each, then spawn one IndexGeneratorTask
    // per shard through the task action client.
    //
    // The implementation of this determine partitions stuff is less than
    // optimal. Should be done better.
    log.info("Running with targetPartitionSize[%d]", targetPartitionSize);

    // We know this exists
    final Interval interval = getImplicitLockInterval().get();

    // Blacklist dimensions that have multiple values per row; once a dimension
    // lands here it is never tracked again.
    final Set<String> unusableDimensions = Sets.newHashSet();

    // Track values of all non-blacklisted dimensions: per-dimension sorted
    // multiset of observed values (value -> occurrence count).
    final Map<String, TreeMultiset<String>> dimensionValueMultisets = Maps.newHashMap();

    // Pass 1: stream every row in the lock interval and record dimension values.
    final Firehose firehose = firehoseFactory.connect();
    try {
        while (firehose.hasMore()) {
            final InputRow inputRow = firehose.nextRow();
            if (interval.contains(inputRow.getTimestampFromEpoch())) {
                // Extract dimensions from event
                for (final String dim : inputRow.getDimensions()) {
                    final List<String> dimValues = inputRow.getDimension(dim);
                    if (!unusableDimensions.contains(dim)) {
                        if (dimValues.size() == 1) {
                            // Track this value, lazily creating the multiset for a new dimension.
                            TreeMultiset<String> dimensionValueMultiset = dimensionValueMultisets.get(dim);
                            if (dimensionValueMultiset == null) {
                                dimensionValueMultiset = TreeMultiset.create();
                                dimensionValueMultisets.put(dim, dimensionValueMultiset);
                            }
                            dimensionValueMultiset.add(dimValues.get(0));
                        } else {
                            // Only single-valued dimensions can be used for partitions
                            unusableDimensions.add(dim);
                            dimensionValueMultisets.remove(dim);
                        }
                    }
                }
            }
        }
    } finally {
        firehose.close();
    }

    // ShardSpecs for index generator tasks
    final List<ShardSpec> shardSpecs = Lists.newArrayList();

    // Orders candidate dimensions by number of distinct values (cardinality).
    Ordering<Map.Entry<String, TreeMultiset<String>>> byCardinalityOrdering = new Ordering<Map.Entry<String, TreeMultiset<String>>>() {
        @Override
        public int compare(Map.Entry<String, TreeMultiset<String>> left,
                Map.Entry<String, TreeMultiset<String>> right) {
            return Ints.compare(left.getValue().elementSet().size(), right.getValue().elementSet().size());
        }
    };

    if (dimensionValueMultisets.isEmpty()) {
        // No suitable partition dimension. We'll make one big segment and hope for the best.
        log.info("No suitable partition dimension found");
        shardSpecs.add(new NoneShardSpec());
    } else {
        // Find best partition dimension (heuristic: highest cardinality).
        final Map.Entry<String, TreeMultiset<String>> partitionEntry = byCardinalityOrdering
                .max(dimensionValueMultisets.entrySet());
        final String partitionDim = partitionEntry.getKey();
        final TreeMultiset<String> partitionDimValues = partitionEntry.getValue();

        log.info("Partitioning on dimension[%s] with cardinality[%d] over rows[%d]", partitionDim,
                partitionDimValues.elementSet().size(), partitionDimValues.size());

        // Pass 2: walk unique partition dimension values in sorted order,
        // accumulating per-value row counts; each time the running total
        // reaches targetPartitionSize, close a shard and start a new one.
        String currentPartitionStart = null;
        int currentPartitionSize = 0;
        for (final String partitionDimValue : partitionDimValues.elementSet()) {
            currentPartitionSize += partitionDimValues.count(partitionDimValue);
            if (currentPartitionSize >= targetPartitionSize) {
                final ShardSpec shardSpec = new SingleDimensionShardSpec(partitionDim, currentPartitionStart,
                        partitionDimValue, shardSpecs.size());
                log.info("Adding shard: %s", shardSpec);
                shardSpecs.add(shardSpec);
                // NOTE(review): the value that trips the threshold is counted
                // into the shard just closed AND seeds the next shard's size,
                // while also becoming the next shard's start — confirm against
                // SingleDimensionShardSpec's end-bound semantics.
                currentPartitionSize = partitionDimValues.count(partitionDimValue);
                currentPartitionStart = partitionDimValue;
            }
        }
        if (currentPartitionSize > 0) {
            // One last shard to go
            final ShardSpec shardSpec;
            if (shardSpecs.isEmpty()) {
                shardSpec = new NoneShardSpec();
            } else {
                shardSpec = new SingleDimensionShardSpec(partitionDim, currentPartitionStart, null,
                        shardSpecs.size());
            }
            log.info("Adding shard: %s", shardSpec);
            shardSpecs.add(shardSpec);
        }
    }

    // Build one follow-up indexing task per shard and submit them; this task
    // itself reports success once the spawn action is accepted.
    List<Task> nextTasks = Lists.transform(shardSpecs, new Function<ShardSpec, Task>() {
        @Override
        public Task apply(ShardSpec shardSpec) {
            return new IndexGeneratorTask(null, getGroupId(), getImplicitLockInterval().get(), firehoseFactory,
                    new Schema(schema.getDataSource(), schema.getSpatialDimensions(), schema.getAggregators(),
                            schema.getIndexGranularity(), shardSpec),
                    rowFlushBoundary);
        }
    });

    toolbox.getTaskActionClient().submit(new SpawnTasksAction(nextTasks));

    return TaskStatus.success(getId());
}
From source file: com.metamx.druid.indexing.common.task.IndexDeterminePartitionsTask.java
@Override
public TaskStatus run(TaskToolbox toolbox) throws Exception {
    // Single-node determine-partitions: scan all rows once, pick the
    // single-valued dimension with the highest cardinality, cut shard specs of
    // roughly targetPartitionSize rows each, then spawn one IndexGeneratorTask
    // per shard through the task action client.
    //
    // TODO: Replace/merge/whatever with hadoop determine-partitions code
    log.info("Running with targetPartitionSize[%d]", targetPartitionSize);

    // We know this exists
    final Interval interval = getImplicitLockInterval().get();

    // Blacklist dimensions that have multiple values per row; once a dimension
    // lands here it is never tracked again.
    final Set<String> unusableDimensions = Sets.newHashSet();

    // Track values of all non-blacklisted dimensions: per-dimension sorted
    // multiset of observed values (value -> occurrence count).
    final Map<String, TreeMultiset<String>> dimensionValueMultisets = Maps.newHashMap();

    // Pass 1: stream every row in the lock interval and record dimension values.
    final Firehose firehose = firehoseFactory.connect();
    try {
        while (firehose.hasMore()) {
            final InputRow inputRow = firehose.nextRow();
            if (interval.contains(inputRow.getTimestampFromEpoch())) {
                // Extract dimensions from event
                for (final String dim : inputRow.getDimensions()) {
                    final List<String> dimValues = inputRow.getDimension(dim);
                    if (!unusableDimensions.contains(dim)) {
                        if (dimValues.size() == 1) {
                            // Track this value, lazily creating the multiset for a new dimension.
                            TreeMultiset<String> dimensionValueMultiset = dimensionValueMultisets.get(dim);
                            if (dimensionValueMultiset == null) {
                                dimensionValueMultiset = TreeMultiset.create();
                                dimensionValueMultisets.put(dim, dimensionValueMultiset);
                            }
                            dimensionValueMultiset.add(dimValues.get(0));
                        } else {
                            // Only single-valued dimensions can be used for partitions
                            unusableDimensions.add(dim);
                            dimensionValueMultisets.remove(dim);
                        }
                    }
                }
            }
        }
    } finally {
        firehose.close();
    }

    // ShardSpecs for index generator tasks
    final List<ShardSpec> shardSpecs = Lists.newArrayList();

    // Orders candidate dimensions by number of distinct values (cardinality).
    Ordering<Map.Entry<String, TreeMultiset<String>>> byCardinalityOrdering = new Ordering<Map.Entry<String, TreeMultiset<String>>>() {
        @Override
        public int compare(Map.Entry<String, TreeMultiset<String>> left,
                Map.Entry<String, TreeMultiset<String>> right) {
            return Ints.compare(left.getValue().elementSet().size(), right.getValue().elementSet().size());
        }
    };

    if (dimensionValueMultisets.isEmpty()) {
        // No suitable partition dimension. We'll make one big segment and hope for the best.
        log.info("No suitable partition dimension found");
        shardSpecs.add(new NoneShardSpec());
    } else {
        // Find best partition dimension (heuristic: highest cardinality).
        final Map.Entry<String, TreeMultiset<String>> partitionEntry = byCardinalityOrdering
                .max(dimensionValueMultisets.entrySet());
        final String partitionDim = partitionEntry.getKey();
        final TreeMultiset<String> partitionDimValues = partitionEntry.getValue();

        log.info("Partitioning on dimension[%s] with cardinality[%d] over rows[%d]", partitionDim,
                partitionDimValues.elementSet().size(), partitionDimValues.size());

        // Pass 2: walk unique partition dimension values in sorted order,
        // accumulating per-value row counts; each time the running total
        // reaches targetPartitionSize, close a shard and start a new one.
        String currentPartitionStart = null;
        int currentPartitionSize = 0;
        for (final String partitionDimValue : partitionDimValues.elementSet()) {
            currentPartitionSize += partitionDimValues.count(partitionDimValue);
            if (currentPartitionSize >= targetPartitionSize) {
                final ShardSpec shardSpec = new SingleDimensionShardSpec(partitionDim, currentPartitionStart,
                        partitionDimValue, shardSpecs.size());
                log.info("Adding shard: %s", shardSpec);
                shardSpecs.add(shardSpec);
                // NOTE(review): the value that trips the threshold is counted
                // into the shard just closed AND seeds the next shard's size,
                // while also becoming the next shard's start — confirm against
                // SingleDimensionShardSpec's end-bound semantics.
                currentPartitionSize = partitionDimValues.count(partitionDimValue);
                currentPartitionStart = partitionDimValue;
            }
        }
        if (currentPartitionSize > 0) {
            // One last shard to go
            final ShardSpec shardSpec;
            if (shardSpecs.isEmpty()) {
                shardSpec = new NoneShardSpec();
            } else {
                shardSpec = new SingleDimensionShardSpec(partitionDim, currentPartitionStart, null,
                        shardSpecs.size());
            }
            log.info("Adding shard: %s", shardSpec);
            shardSpecs.add(shardSpec);
        }
    }

    // Build one follow-up indexing task per shard and submit them; this task
    // itself reports success once the spawn action is accepted.
    List<Task> nextTasks = Lists.transform(shardSpecs, new Function<ShardSpec, Task>() {
        @Override
        public Task apply(ShardSpec shardSpec) {
            return new IndexGeneratorTask(null, getGroupId(), getImplicitLockInterval().get(), firehoseFactory,
                    new Schema(schema.getDataSource(), schema.getSpatialDimensions(), schema.getAggregators(),
                            schema.getIndexGranularity(), shardSpec),
                    rowFlushBoundary);
        }
    });

    toolbox.getTaskActionClient().submit(new SpawnTasksAction(nextTasks));

    return TaskStatus.success(getId());
}
From source file: org.apache.tez.analyzer.plugins.TaskConcurrencyAnalyzer.java
@Override public void analyze(DagInfo dagInfo) throws TezException { //For each vertex find the concurrent tasks running at any point for (VertexInfo vertexInfo : dagInfo.getVertices()) { List<TaskAttemptInfo> taskAttempts = Lists.newLinkedList(vertexInfo.getTaskAttempts(true, null)); String vertexName = vertexInfo.getVertexName(); /**//from w ww . jav a 2s .co m * - Get sorted multi-set of timestamps (S1, S2,...E1, E2..). Possible to have multiple * tasks starting/ending at same time. * - Walk through the set * - Increment concurrent tasks when start event is encountered * - Decrement concurrent tasks when start event is encountered */ TreeMultiset<TimeInfo> timeInfoSet = TreeMultiset.create(new Comparator<TimeInfo>() { @Override public int compare(TimeInfo o1, TimeInfo o2) { if (o1.timestamp < o2.timestamp) { return -1; } if (o1.timestamp > o2.timestamp) { return 1; } if (o1.timestamp == o2.timestamp) { //check event type if (o1.eventType.equals(o2.eventType)) { return 0; } if (o1.eventType.equals(EventType.START) && o2.eventType.equals(EventType.FINISH)) { return -1; } else { return 1; } } return 0; } }); for (TaskAttemptInfo attemptInfo : taskAttempts) { TimeInfo startTimeInfo = new TimeInfo(EventType.START, attemptInfo.getStartTime()); TimeInfo stopTimeInfo = new TimeInfo(EventType.FINISH, attemptInfo.getFinishTime()); timeInfoSet.add(startTimeInfo); timeInfoSet.add(stopTimeInfo); } //Compute concurrent tasks in the list now. int concurrentTasks = 0; for (TimeInfo timeInfo : timeInfoSet.elementSet()) { switch (timeInfo.eventType) { case START: concurrentTasks += timeInfoSet.count(timeInfo); break; case FINISH: concurrentTasks -= timeInfoSet.count(timeInfo); break; default: break; } timeInfo.concurrentTasks = concurrentTasks; addToResult(vertexName, timeInfo.timestamp, timeInfo.concurrentTasks); } } }