List of usage examples for com.google.common.collect TreeMultiset size
@Override public int size()
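Before the full examples, a minimal self-contained sketch (illustrative names, not taken from the sources below) of what size() returns on a TreeMultiset: it counts every occurrence including duplicates, while elementSet().size() counts distinct elements, a distinction several examples below rely on.

import com.google.common.collect.TreeMultiset;

public class TreeMultisetSizeDemo {
    public static void main(String[] args) {
        // TreeMultiset keeps elements in sorted order and allows duplicates.
        TreeMultiset<String> values = TreeMultiset.create();
        values.add("b");
        values.add("a");
        values.add("a");

        // size() counts every occurrence, including duplicates...
        System.out.println(values.size());              // 3
        // ...while elementSet().size() counts distinct elements.
        System.out.println(values.elementSet().size()); // 2
    }
}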
From source file:org.apache.mahout.math.neighborhood.ProjectionSearch.java
/**
 * Adds a WeightedVector into the set of projections for later searching.
 * @param vector The WeightedVector to add.
 */
@Override
public void add(Vector vector) {
    initialize(vector.size());
    Vector projection = basisMatrix.times(vector);
    // Add the new vector and the projected distance to each set separately.
    int i = 0;
    for (TreeMultiset<WeightedThing<Vector>> s : scalarProjections) {
        s.add(new WeightedThing<Vector>(vector, projection.get(i++)));
    }
    int numVectors = scalarProjections.get(0).size();
    for (TreeMultiset<WeightedThing<Vector>> s : scalarProjections) {
        Preconditions.checkArgument(s.size() == numVectors, "Number of vectors in projection sets differ");
        double firstWeight = s.firstEntry().getElement().getWeight();
        for (WeightedThing<Vector> w : s) {
            Preconditions.checkArgument(firstWeight <= w.getWeight(), "Weights not in non-decreasing order");
            firstWeight = w.getWeight();
        }
    }
}
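The example above leans on two TreeMultiset guarantees: size() reflects every insertion, so parallel multisets can be cross-checked for consistency, and iteration follows the element ordering, so weights come back non-decreasing. A minimal sketch of the same checks using plain doubles (hypothetical data, not the Mahout types):

import com.google.common.base.Preconditions;
import com.google.common.collect.TreeMultiset;
import java.util.Arrays;
import java.util.List;

public class ProjectionInvariantsDemo {
    public static void main(String[] args) {
        // Two parallel sorted multisets standing in for per-projection sets.
        List<TreeMultiset<Double>> projections =
            Arrays.asList(TreeMultiset.<Double>create(), TreeMultiset.<Double>create());
        for (TreeMultiset<Double> s : projections) {
            s.add(0.7);
            s.add(-1.2);
            s.add(0.7); // duplicates are kept, and counted by size()
        }

        int numVectors = projections.get(0).size();
        for (TreeMultiset<Double> s : projections) {
            // Every set must have seen the same number of insertions.
            Preconditions.checkArgument(s.size() == numVectors, "Number of vectors in projection sets differ");
            // Iteration order is the element order, so values arrive non-decreasing.
            double previous = s.firstEntry().getElement();
            for (double w : s) {
                Preconditions.checkArgument(previous <= w, "Weights not in non-decreasing order");
                previous = w;
            }
        }
    }
}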
From source file:com.metamx.druid.merger.common.task.IndexDeterminePartitionsTask.java
@Override
public TaskStatus run(TaskContext context, TaskToolbox toolbox) throws Exception {
    log.info("Running with targetPartitionSize[%d]", targetPartitionSize);

    // This is similar to what DeterminePartitionsJob does in the hadoop indexer, but we don't require
    // a preconfigured partition dimension (we'll just pick the one with highest cardinality).

    // XXX - Space-efficiency (stores all unique dimension values, although at least not all combinations)
    // XXX - Time-efficiency (runs all this on one single node instead of through map/reduce)

    // Blacklist dimensions that have multiple values per row
    final Set<String> unusableDimensions = Sets.newHashSet();

    // Track values of all non-blacklisted dimensions
    final Map<String, TreeMultiset<String>> dimensionValueMultisets = Maps.newHashMap();

    // Load data
    final Firehose firehose = firehoseFactory.connect();
    try {
        while (firehose.hasMore()) {
            final InputRow inputRow = firehose.nextRow();
            if (getInterval().contains(inputRow.getTimestampFromEpoch())) {
                // Extract dimensions from event
                for (final String dim : inputRow.getDimensions()) {
                    final List<String> dimValues = inputRow.getDimension(dim);
                    if (!unusableDimensions.contains(dim)) {
                        if (dimValues.size() == 1) {
                            // Track this value
                            TreeMultiset<String> dimensionValueMultiset = dimensionValueMultisets.get(dim);
                            if (dimensionValueMultiset == null) {
                                dimensionValueMultiset = TreeMultiset.create();
                                dimensionValueMultisets.put(dim, dimensionValueMultiset);
                            }
                            dimensionValueMultiset.add(dimValues.get(0));
                        } else {
                            // Only single-valued dimensions can be used for partitions
                            unusableDimensions.add(dim);
                            dimensionValueMultisets.remove(dim);
                        }
                    }
                }
            }
        }
    } finally {
        firehose.close();
    }

    // ShardSpecs for index generator tasks
    final List<ShardSpec> shardSpecs = Lists.newArrayList();

    // Select highest-cardinality dimension
    Ordering<Map.Entry<String, TreeMultiset<String>>> byCardinalityOrdering =
        new Ordering<Map.Entry<String, TreeMultiset<String>>>() {
            @Override
            public int compare(Map.Entry<String, TreeMultiset<String>> left,
                    Map.Entry<String, TreeMultiset<String>> right) {
                return Ints.compare(left.getValue().elementSet().size(), right.getValue().elementSet().size());
            }
        };

    if (dimensionValueMultisets.isEmpty()) {
        // No suitable partition dimension. We'll make one big segment and hope for the best.
        log.info("No suitable partition dimension found");
        shardSpecs.add(new NoneShardSpec());
    } else {
        // Find best partition dimension (heuristic: highest cardinality).
        final Map.Entry<String, TreeMultiset<String>> partitionEntry =
            byCardinalityOrdering.max(dimensionValueMultisets.entrySet());

        final String partitionDim = partitionEntry.getKey();
        final TreeMultiset<String> partitionDimValues = partitionEntry.getValue();

        log.info("Partitioning on dimension[%s] with cardinality[%d] over rows[%d]", partitionDim,
            partitionDimValues.elementSet().size(), partitionDimValues.size());

        // Iterate over unique partition dimension values in sorted order
        String currentPartitionStart = null;
        int currentPartitionSize = 0;
        for (final String partitionDimValue : partitionDimValues.elementSet()) {
            currentPartitionSize += partitionDimValues.count(partitionDimValue);
            if (currentPartitionSize >= targetPartitionSize) {
                final ShardSpec shardSpec = new SingleDimensionShardSpec(partitionDim, currentPartitionStart,
                    partitionDimValue, shardSpecs.size());
                log.info("Adding shard: %s", shardSpec);
                shardSpecs.add(shardSpec);
                currentPartitionSize = partitionDimValues.count(partitionDimValue);
                currentPartitionStart = partitionDimValue;
            }
        }

        if (currentPartitionSize > 0) {
            // One last shard to go
            final ShardSpec shardSpec;
            if (shardSpecs.isEmpty()) {
                shardSpec = new NoneShardSpec();
            } else {
                shardSpec = new SingleDimensionShardSpec(partitionDim, currentPartitionStart, null,
                    shardSpecs.size());
            }
            log.info("Adding shard: %s", shardSpec);
            shardSpecs.add(shardSpec);
        }
    }

    return TaskStatus.continued(getId(), Lists.transform(shardSpecs, new Function<ShardSpec, Task>() {
        @Override
        public Task apply(ShardSpec shardSpec) {
            return new IndexGeneratorTask(getGroupId(), getInterval(), firehoseFactory, new Schema(
                schema.getDataSource(), schema.getAggregators(), schema.getIndexGranularity(), shardSpec));
        }
    }));
}
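The TreeMultiset usage in this task (and in the two variants below) distills to a small pattern: count values into a sorted multiset, then walk elementSet() in order, using count() per value and size() for the row total, cutting a partition whenever the accumulated count reaches the target. A standalone sketch with made-up data:

import com.google.common.collect.TreeMultiset;

public class PartitionSketch {
    public static void main(String[] args) {
        int targetPartitionSize = 3; // hypothetical threshold

        TreeMultiset<String> dimValues = TreeMultiset.create();
        for (String v : new String[] {"us", "us", "de", "fr", "fr", "fr", "jp"}) {
            dimValues.add(v);
        }

        // size() is the total row count; elementSet().size() is the cardinality.
        System.out.printf("cardinality[%d] over rows[%d]%n",
            dimValues.elementSet().size(), dimValues.size());

        // Walk distinct values in sorted order, cutting a partition whenever the
        // accumulated count reaches the target -- the same shape as the loop above.
        String partitionStart = null;
        int partitionSize = 0;
        for (String value : dimValues.elementSet()) {
            partitionSize += dimValues.count(value);
            if (partitionSize >= targetPartitionSize) {
                System.out.printf("partition [%s, %s]%n", partitionStart, value);
                partitionSize = dimValues.count(value);
                partitionStart = value;
            }
        }
        if (partitionSize > 0) {
            // The tail partition is open-ended, like the final NoneShardSpec/null case above.
            System.out.printf("partition [%s, null]%n", partitionStart);
        }
    }
}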
From source file:io.druid.indexing.common.task.IndexDeterminePartitionsTask.java
@Override
public TaskStatus run(TaskToolbox toolbox) throws Exception {
    log.info("Running with targetPartitionSize[%d]", targetPartitionSize);

    // The implementation of this determine partitions stuff is less than optimal. Should be done better.

    // We know this exists
    final Interval interval = getImplicitLockInterval().get();

    // Blacklist dimensions that have multiple values per row
    final Set<String> unusableDimensions = Sets.newHashSet();

    // Track values of all non-blacklisted dimensions
    final Map<String, TreeMultiset<String>> dimensionValueMultisets = Maps.newHashMap();

    // Load data
    final Firehose firehose = firehoseFactory.connect();
    try {
        while (firehose.hasMore()) {
            final InputRow inputRow = firehose.nextRow();
            if (interval.contains(inputRow.getTimestampFromEpoch())) {
                // Extract dimensions from event
                for (final String dim : inputRow.getDimensions()) {
                    final List<String> dimValues = inputRow.getDimension(dim);
                    if (!unusableDimensions.contains(dim)) {
                        if (dimValues.size() == 1) {
                            // Track this value
                            TreeMultiset<String> dimensionValueMultiset = dimensionValueMultisets.get(dim);
                            if (dimensionValueMultiset == null) {
                                dimensionValueMultiset = TreeMultiset.create();
                                dimensionValueMultisets.put(dim, dimensionValueMultiset);
                            }
                            dimensionValueMultiset.add(dimValues.get(0));
                        } else {
                            // Only single-valued dimensions can be used for partitions
                            unusableDimensions.add(dim);
                            dimensionValueMultisets.remove(dim);
                        }
                    }
                }
            }
        }
    } finally {
        firehose.close();
    }

    // ShardSpecs for index generator tasks
    final List<ShardSpec> shardSpecs = Lists.newArrayList();

    // Select highest-cardinality dimension
    Ordering<Map.Entry<String, TreeMultiset<String>>> byCardinalityOrdering =
        new Ordering<Map.Entry<String, TreeMultiset<String>>>() {
            @Override
            public int compare(Map.Entry<String, TreeMultiset<String>> left,
                    Map.Entry<String, TreeMultiset<String>> right) {
                return Ints.compare(left.getValue().elementSet().size(), right.getValue().elementSet().size());
            }
        };

    if (dimensionValueMultisets.isEmpty()) {
        // No suitable partition dimension. We'll make one big segment and hope for the best.
        log.info("No suitable partition dimension found");
        shardSpecs.add(new NoneShardSpec());
    } else {
        // Find best partition dimension (heuristic: highest cardinality).
        final Map.Entry<String, TreeMultiset<String>> partitionEntry =
            byCardinalityOrdering.max(dimensionValueMultisets.entrySet());

        final String partitionDim = partitionEntry.getKey();
        final TreeMultiset<String> partitionDimValues = partitionEntry.getValue();

        log.info("Partitioning on dimension[%s] with cardinality[%d] over rows[%d]", partitionDim,
            partitionDimValues.elementSet().size(), partitionDimValues.size());

        // Iterate over unique partition dimension values in sorted order
        String currentPartitionStart = null;
        int currentPartitionSize = 0;
        for (final String partitionDimValue : partitionDimValues.elementSet()) {
            currentPartitionSize += partitionDimValues.count(partitionDimValue);
            if (currentPartitionSize >= targetPartitionSize) {
                final ShardSpec shardSpec = new SingleDimensionShardSpec(partitionDim, currentPartitionStart,
                    partitionDimValue, shardSpecs.size());
                log.info("Adding shard: %s", shardSpec);
                shardSpecs.add(shardSpec);
                currentPartitionSize = partitionDimValues.count(partitionDimValue);
                currentPartitionStart = partitionDimValue;
            }
        }

        if (currentPartitionSize > 0) {
            // One last shard to go
            final ShardSpec shardSpec;
            if (shardSpecs.isEmpty()) {
                shardSpec = new NoneShardSpec();
            } else {
                shardSpec = new SingleDimensionShardSpec(partitionDim, currentPartitionStart, null,
                    shardSpecs.size());
            }
            log.info("Adding shard: %s", shardSpec);
            shardSpecs.add(shardSpec);
        }
    }

    List<Task> nextTasks = Lists.transform(shardSpecs, new Function<ShardSpec, Task>() {
        @Override
        public Task apply(ShardSpec shardSpec) {
            return new IndexGeneratorTask(null, getGroupId(), getImplicitLockInterval().get(), firehoseFactory,
                new Schema(schema.getDataSource(), schema.getSpatialDimensions(), schema.getAggregators(),
                    schema.getIndexGranularity(), shardSpec),
                rowFlushBoundary);
        }
    });

    toolbox.getTaskActionClient().submit(new SpawnTasksAction(nextTasks));

    return TaskStatus.success(getId());
}
From source file:com.metamx.druid.indexing.common.task.IndexDeterminePartitionsTask.java
@Override
public TaskStatus run(TaskToolbox toolbox) throws Exception {
    log.info("Running with targetPartitionSize[%d]", targetPartitionSize);

    // TODO: Replace/merge/whatever with hadoop determine-partitions code

    // We know this exists
    final Interval interval = getImplicitLockInterval().get();

    // Blacklist dimensions that have multiple values per row
    final Set<String> unusableDimensions = Sets.newHashSet();

    // Track values of all non-blacklisted dimensions
    final Map<String, TreeMultiset<String>> dimensionValueMultisets = Maps.newHashMap();

    // Load data
    final Firehose firehose = firehoseFactory.connect();
    try {
        while (firehose.hasMore()) {
            final InputRow inputRow = firehose.nextRow();
            if (interval.contains(inputRow.getTimestampFromEpoch())) {
                // Extract dimensions from event
                for (final String dim : inputRow.getDimensions()) {
                    final List<String> dimValues = inputRow.getDimension(dim);
                    if (!unusableDimensions.contains(dim)) {
                        if (dimValues.size() == 1) {
                            // Track this value
                            TreeMultiset<String> dimensionValueMultiset = dimensionValueMultisets.get(dim);
                            if (dimensionValueMultiset == null) {
                                dimensionValueMultiset = TreeMultiset.create();
                                dimensionValueMultisets.put(dim, dimensionValueMultiset);
                            }
                            dimensionValueMultiset.add(dimValues.get(0));
                        } else {
                            // Only single-valued dimensions can be used for partitions
                            unusableDimensions.add(dim);
                            dimensionValueMultisets.remove(dim);
                        }
                    }
                }
            }
        }
    } finally {
        firehose.close();
    }

    // ShardSpecs for index generator tasks
    final List<ShardSpec> shardSpecs = Lists.newArrayList();

    // Select highest-cardinality dimension
    Ordering<Map.Entry<String, TreeMultiset<String>>> byCardinalityOrdering =
        new Ordering<Map.Entry<String, TreeMultiset<String>>>() {
            @Override
            public int compare(Map.Entry<String, TreeMultiset<String>> left,
                    Map.Entry<String, TreeMultiset<String>> right) {
                return Ints.compare(left.getValue().elementSet().size(), right.getValue().elementSet().size());
            }
        };

    if (dimensionValueMultisets.isEmpty()) {
        // No suitable partition dimension. We'll make one big segment and hope for the best.
        log.info("No suitable partition dimension found");
        shardSpecs.add(new NoneShardSpec());
    } else {
        // Find best partition dimension (heuristic: highest cardinality).
        final Map.Entry<String, TreeMultiset<String>> partitionEntry =
            byCardinalityOrdering.max(dimensionValueMultisets.entrySet());

        final String partitionDim = partitionEntry.getKey();
        final TreeMultiset<String> partitionDimValues = partitionEntry.getValue();

        log.info("Partitioning on dimension[%s] with cardinality[%d] over rows[%d]", partitionDim,
            partitionDimValues.elementSet().size(), partitionDimValues.size());

        // Iterate over unique partition dimension values in sorted order
        String currentPartitionStart = null;
        int currentPartitionSize = 0;
        for (final String partitionDimValue : partitionDimValues.elementSet()) {
            currentPartitionSize += partitionDimValues.count(partitionDimValue);
            if (currentPartitionSize >= targetPartitionSize) {
                final ShardSpec shardSpec = new SingleDimensionShardSpec(partitionDim, currentPartitionStart,
                    partitionDimValue, shardSpecs.size());
                log.info("Adding shard: %s", shardSpec);
                shardSpecs.add(shardSpec);
                currentPartitionSize = partitionDimValues.count(partitionDimValue);
                currentPartitionStart = partitionDimValue;
            }
        }

        if (currentPartitionSize > 0) {
            // One last shard to go
            final ShardSpec shardSpec;
            if (shardSpecs.isEmpty()) {
                shardSpec = new NoneShardSpec();
            } else {
                shardSpec = new SingleDimensionShardSpec(partitionDim, currentPartitionStart, null,
                    shardSpecs.size());
            }
            log.info("Adding shard: %s", shardSpec);
            shardSpecs.add(shardSpec);
        }
    }

    List<Task> nextTasks = Lists.transform(shardSpecs, new Function<ShardSpec, Task>() {
        @Override
        public Task apply(ShardSpec shardSpec) {
            return new IndexGeneratorTask(null, getGroupId(), getImplicitLockInterval().get(), firehoseFactory,
                new Schema(schema.getDataSource(), schema.getSpatialDimensions(), schema.getAggregators(),
                    schema.getIndexGranularity(), shardSpec),
                rowFlushBoundary);
        }
    });

    toolbox.getTaskActionClient().submit(new SpawnTasksAction(nextTasks));

    return TaskStatus.success(getId());
}