Example usage for com.mongodb DBCollection getCount

List of usage examples for com.mongodb DBCollection getCount

Introduction

On this page you can find example usage for com.mongodb DBCollection getCount.

Prototype

@Deprecated
public long getCount(@Nullable final DBObject query, @Nullable final DBObject projection, final long limit,
        final long skip) 

Document

Get the count of documents in the collection that would match the given criteria.
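
Before the full examples below, here is a minimal, self-contained sketch of calling this overload directly. The database name ("testdb"), collection name ("docs"), and query field ("status") are hypothetical; only the getCount signature comes from the prototype above.

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.MongoClient;

public class GetCountExample {
    public static void main(String[] args) {
        MongoClient client = new MongoClient();
        // Hypothetical database/collection names, for illustration only
        DBCollection coll = client.getDB("testdb").getCollection("docs");

        BasicDBObject query = new BasicDBObject("status", "published");
        // Count at most 100 matches, skipping the first 10; passing a limit
        // bounds the server-side count instead of counting every match
        long count = coll.getCount(query, null, 100L, 10L);
        System.out.println("Matched (capped at 100): " + count);

        client.close();
    }
}

Note that the calls in the examples below pass null as the projection; the projection argument has no effect on the returned count.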

Usage

From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License: Apache License

/**
 * Checks if the new params MAX_SPLITS and MAX_DOCS_PER_SPLIT are set
 * in the config. If they are, it will use those to do splits via limit/skip;
 * otherwise it will call the previous chunking splitter in MongoSplitter.
 * @param conf the Infinite MongoDB job configuration
 * @return the list of input splits for the job
 */

public static List<InputSplit> calculateSplits(InfiniteMongoConfig conf) {
    // First off: What is our sharding scheme?

    boolean shardingPolicyNew = false;
    try {
        BasicDBObject shardQuery = new BasicDBObject("_id", "doc_metadata.metadata");
        BasicDBObject shardInfo = (BasicDBObject) DbManager.getCollection("config", "collections")
                .findOne(shardQuery);
        if (null != shardInfo) {
            BasicDBObject shardInfoKey = (BasicDBObject) shardInfo.get("key");
            if (null != shardInfoKey) {
                shardingPolicyNew = (shardInfoKey.size() > 1);
            }
        }
    } //TESTED (new and old)
    catch (Exception e) {
    } // stick with the old sharding, it's probably going to die soon after though, honestly

    // conf.getQuery returns a new copy of the query, so get once and use everywhere...
    BasicDBObject confQuery = (BasicDBObject) conf.getQuery();

    BasicDBObject srcTagsQuery = (BasicDBObject) conf.getSourceTags();

    String collection = conf.getInputURI().getCollection();
    if (!collection.equals(DbManager.getDocument().getContent().getName())
            && !collection.equals(DbManager.getDocument().getMetadata().getName())) {
        // Case 1: feature table or custom table
        // Just run legacy code
        return calculateSplits_phase2(conf, confQuery, false, false, null);
    } else { // complex cases...
        boolean simpleOtherIndex = false;
        // Check whether a simple query has been performed on a different indexed field         
        if (null == srcTagsQuery) { // (if srcTags specified, then going to want to use sourceKey as the index)
            for (String s : Arrays.asList(EntityPojo.docQuery_index_, DocumentPojo.url_)) {
                Object selector = confQuery.get(s);
                if (selector instanceof String) {
                    simpleOtherIndex = true;
                    break;
                } else if (selector instanceof DBObject) {
                    DBObject selectorDbo = (DBObject) selector;
                    if (selectorDbo.containsField(DbManager.in_)) {
                        simpleOtherIndex = true;
                        break;
                    }
                }
            } //TESTED (both types, plus check complex indexes don't work)         
              // ALLOWED: {"entities.index": { "$in": [ "xxx", "yyy"] }, {"entities.index": "xxx" }, ditto for "url"
              // NOT ALLOWED: { "entities.index": { "$ne": "xxx" } }
        }
        //TESTED check ignored if eg entity_index specified

        if (simpleOtherIndex) {
            // Case 2: we have a simple query on an indexed field 
            // Just run legacy code

            return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
        } //TESTED
        else if (conf.getLimit() > 0) { // debug
            //Case 3: Ensure we have small sets of sources to search over
            BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery,
                    conf.getMaxDocsPerSplit());
            final List<InputSplit> splits = new ArrayList<InputSplit>();

            boolean queryNonTrivial = isQueryNonTrivial(confQuery);
            if (!queryNonTrivial) {
                //Case 3a: query is trivial, so can just create splits directly from the split pre-calcs
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();

                    int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    int toGet = (docCount > toProcess) ? toProcess : docCount;
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
                                conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                        toProcess -= docCount;
                    }
                } //TESTED
            } else {
                // Case 3b: annoying, some extra query terms, gonna need to do it the hard way...
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                DBCollection coll = InfiniteMongoConfigUtil.getCollection(conf.getInputURI());
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();

                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        int docsCounted = (int) coll.getCount(modQuery, null, toProcess, 0);
                        int toGet = (docsCounted > toProcess) ? toProcess : docsCounted;
                        if (docsCounted > 0) {
                            splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(),
                                    modQuery, conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                            toProcess -= docsCounted;
                        }
                    } //TESTED
                }
            } //TESTED

            return splits;
        } else { // More complex cases:

            if (shardingPolicyNew) {
                // Case 4a: NEW SHARDING SCHEME

                // Always fetch the new sources, eg convert communityId to sourceKeys
                try {
                    splitPrecalculations_newShardScheme(confQuery, srcTagsQuery); // (modifies confQuery if returns true)            
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);

                    return calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null);

                    // (ie trivial query => always use chunks, bypass skip/limit test)
                } //TESTED (trivial + non-trivial)
                catch (Exception e) { // Didn't match any sources, no problem
                    return new ArrayList<InputSplit>();
                } //TESTED

            } //TESTED
            else {

                BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery,
                        srcTagsQuery, conf.getMaxDocsPerSplit());

                if (null == collectionOfSplits) {
                    // Case 4b: OLD SHARDING SCHEME can't get a partition by source keys, just back off to old code
                    return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
                } //TESTED (old code)
                else {
                    conf.setMaxDocsPerSplit(2 * conf.getMaxDocsPerSplit());
                    // (because we stop creating splits when they exceed the size)

                    // Case 4c: OLD SHARDING SCHEME, have a source key partition
                    int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);
                    final List<InputSplit> splits = new ArrayList<InputSplit>();

                    BasicDBObject savedQuery = confQuery;

                    Iterator<Object> itSplit = collectionOfSplits.iterator();
                    BasicDBList bigSplit = null;
                    while (itSplit.hasNext()) {
                        BasicDBObject split = (BasicDBObject) itSplit.next();
                        int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                        if (docCount < nMaxCount) { // small split, will use skip/limit
                            BasicDBObject modQuery = convertQuery(savedQuery,
                                    split.get(DocumentPojo.sourceKey_));
                            if (null != modQuery) {

                                final int SPLIT_THRESHOLD = 3;
                                // A few cases:
                                if ((docCount < (SPLIT_THRESHOLD * conf.getMaxDocsPerSplit()))
                                        || !queryNonTrivial) {
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false,
                                            shardingPolicyNew, (Integer) docCount));
                                } //TESTED (based on limit, based on query)
                                else {
                                    // My guess at the point at which you might as well do the
                                    // full query in the hope of saving some (empty) splits
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false,
                                            shardingPolicyNew, null));
                                } //TESTED
                            } //TESTED
                        } else { // large split, combine all these guys into an array of source keys
                            if (null == bigSplit) {
                                bigSplit = new BasicDBList();
                            }
                            bigSplit.add(split.get(DocumentPojo.sourceKey_));
                            // (guaranteed to be a single element)
                        }
                    } //(end loop over collections)

                    if (null != bigSplit) {

                        // If we have a big left-over community then create a set of splits for that - always chunks if the query is trivial
                        if (1 == bigSplit.size()) {
                            confQuery.put(DocumentPojo.sourceKey_, bigSplit.iterator().next());
                        } else {
                            confQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, bigSplit));
                        }
                        splits.addAll(calculateSplits_phase2(conf, confQuery, !queryNonTrivial,
                                shardingPolicyNew, null));
                    } //TESTED: singleton+trivial (sandy), array+trivial (sentiment/enron), array+non-trivial (sentiment/enron, docGeo), singleton+non-trivial (sandy, docGeo)

                    return splits;

                } //TESTED: end if Cases 4a, 4b, 4c

            } //(end if old vs new sharding policy)

        } //(non-debug case)
    } //(content or metadata table are most complex)
}
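
The key call in Case 3b above is coll.getCount(modQuery, null, toProcess, 0): by passing the remaining document budget as the limit argument, the server stops counting once the budget is reached rather than scanning every match. Below is a condensed, hypothetical sketch of that bounded-count loop; the class name, method name, budget value, and perSourceQueries list are all stand-ins, not part of the original code.

import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import java.util.List;

public class BoundedCountSketch {
    // Hypothetical sketch of the bounded-count pattern from Case 3b above
    public static void countUpToBudget(DBCollection coll, List<DBObject> perSourceQueries) {
        long budget = 1000L; // total documents this job may process (illustrative value)
        for (DBObject perSourceQuery : perSourceQueries) {
            if (budget <= 0) {
                break; // budget exhausted, no more splits needed
            }
            // Cap the count at the remaining budget so the server never
            // counts more documents than this pass can actually consume
            long counted = coll.getCount(perSourceQuery, null, budget, 0L);
            if (counted > 0) {
                // ... create a split sized to 'counted' here ...
                budget -= counted;
            }
        }
    }
}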

From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License: Apache License

@SuppressWarnings("unchecked")
public static List<InputSplit> calculateSplits_phase2(InfiniteMongoConfig conf, BasicDBObject confQuery,
        boolean alwaysUseChunks, boolean newShardScheme, Integer splitDocCount) {
    alwaysUseChunks &= (conf.getMaxSplits() != MAX_SPLITS);
    // (in standalone mode, never use chunks)

    MongoURI uri = conf.getInputURI();
    DBCollection coll = InfiniteMongoConfigUtil.getCollection(uri);
    if (conf.getLimit() > 0) {
        return calculateManualSplits(conf, confQuery, 1, conf.getLimit(), coll);
    } else {
        if (!alwaysUseChunks) {
            int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
            int count = 0;
            if (null == splitDocCount) {
                if (nMaxCount <= 1) {
                    nMaxCount = 0;
                } else {
                    //DEBUG
                    //System.out.println(coll.find(confQuery).limit(1).explain());

                    count = (int) coll.getCount(confQuery, null, nMaxCount, 0);
                    if (0 == count) {
                        return new ArrayList<InputSplit>();
                    }
                } //TESTED
            } else {
                count = splitDocCount;
            }

            // if maxDocsPerSplit and maxSplits are set and there are fewer documents than
            // maxSplits*maxDocsPerSplit then use the new splitter, otherwise use the old splitter
            if (conf.getMaxDocsPerSplit() > 0 && conf.getMaxSplits() > 0 && (count < nMaxCount)) {
                _logger.debug("Calculating splits manually");
                int splits_needed = (count / conf.getMaxDocsPerSplit()) + 1;

                return calculateManualSplits(conf, confQuery, splits_needed, conf.getMaxDocsPerSplit(), coll);
            } //TESTED
        }
        if (newShardScheme && !confQuery.containsField(DocumentPojo.sourceKey_)) {
            // OK if we're going to do the sharded version then we will want to calculate
            splitPrecalculations_newShardScheme(confQuery, null); // (modifies confQuery if returns true)            
        } //TESTED: checked did nothing when had sourceKey, added sourceKey when necessary (eg entities.index case)

        if (!newShardScheme) { // unlike new sharding scheme, in this case the query is fixed, so overwrite now:
            conf.setQuery(confQuery);
        }

        List<InputSplit> splits = MongoSplitter.calculateSplits(conf);
        // (unless manually set, like above, runs with the _original_ query)
        int initialSplitSize = splits.size();

        // We have the MongoDB-calculated splits, now calculate their intersection vs the query
        @SuppressWarnings("rawtypes")
        Map<String, TreeSet<Comparable>> orderedArraySet = new HashMap<String, TreeSet<Comparable>>();
        @SuppressWarnings("rawtypes")
        Map<String, NavigableSet<Comparable>> orderedArraySet_afterMin = new HashMap<String, NavigableSet<Comparable>>();
        BasicDBObject originalQuery = confQuery;

        ArrayList<InputSplit> newsplits = new ArrayList<InputSplit>(splits.size());
        Iterator<InputSplit> splitIt = splits.iterator();
        while (splitIt.hasNext()) {
            try {
                orderedArraySet_afterMin.clear();

                MongoInputSplit mongoSplit = (MongoInputSplit) splitIt.next();
                BasicDBObject min = (BasicDBObject) mongoSplit.getQuerySpec().get("$min");
                BasicDBObject max = (BasicDBObject) mongoSplit.getQuerySpec().get("$max");

                //DEBUG
                //_logger.info("+----------------- NEW SPLIT ----------------: " + min + " /" + max);
                //System.out.println("+----------------- NEW SPLIT ----------------: " + min + " /" + max);

                if (null != min) { // How does the min fit in with the general query
                    try {
                        if (compareFields(-1, originalQuery, min, max, orderedArraySet,
                                orderedArraySet_afterMin) < 0) {
                            splitIt.remove();
                            continue;
                        }
                    } catch (Exception e) {
                    } // do nothing, probably just a Comparable issue
                } //TESTED

                if (null != max) { // How does the max fit in with the general query
                    try {
                        if (compareFields(1, originalQuery, max, min, orderedArraySet,
                                orderedArraySet_afterMin) > 0) {
                            splitIt.remove();
                            continue;
                        }
                    } catch (Exception e) {
                    } // do nothing, probably just a Comparable issue
                } //TESTED

                //DEBUG
                //_logger.info("(retained split)");
                //System.out.println("(retained split)");

                // (don't worry about edge cases, won't happen very often and will just result in a spurious empty mapper)

                ////////////////////////////////

                // Now some infinit.e specific processing...

                if (newShardScheme) {
                    @SuppressWarnings("rawtypes")
                    TreeSet<Comparable> sourceKeyOrderedArray = orderedArraySet.get(DocumentPojo.sourceKey_);
                    if ((null != sourceKeyOrderedArray) && !sourceKeyOrderedArray.isEmpty()) {
                        @SuppressWarnings("rawtypes")
                        Comparable minSourceKey = null;
                        Object minSourceKeyObj = (null == min) ? null : min.get(DocumentPojo.sourceKey_);
                        if (minSourceKeyObj instanceof String) {
                            minSourceKey = (String) minSourceKeyObj;
                        }
                        if (null == minSourceKey) {
                            minSourceKey = sourceKeyOrderedArray.first();
                        } //TESTED
                        @SuppressWarnings("rawtypes")
                        Comparable maxSourceKey = null;
                        Object maxSourceKeyObj = (null == max) ? null : max.get(DocumentPojo.sourceKey_);
                        if (maxSourceKeyObj instanceof String) {
                            maxSourceKey = (String) maxSourceKeyObj;
                        }
                        if (null == maxSourceKey) {
                            maxSourceKey = sourceKeyOrderedArray.last();
                        } //TESTED

                        DBObject splitQuery = mongoSplit.getQuerySpec();
                        BasicDBObject splitQueryQuery = new BasicDBObject(
                                (BasicBSONObject) splitQuery.get("$query"));
                        if (0 == minSourceKey.compareTo(maxSourceKey)) { // single matching sourceKey
                            splitQueryQuery.put(DocumentPojo.sourceKey_, maxSourceKey);
                        } //TESTED (array of sources, only one matches)
                        else { // multiple matching source keys
                            splitQueryQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_,
                                    sourceKeyOrderedArray.subSet(minSourceKey, true, maxSourceKey, true)));
                        } //TESTED (array of sources, multiple match)               
                        newsplits.add(
                                new InfiniteMongoInputSplit(mongoSplit, splitQueryQuery, conf.isNoTimeout()));
                    } else { // original query is of sufficient simplicity
                        newsplits.add(
                                new InfiniteMongoInputSplit(mongoSplit, originalQuery, conf.isNoTimeout()));
                    } //TESTED (no change to existing source)

                } //TESTED
                else { // old sharding scheme, remove min/max and replace with normal _id based query where possible

                    DBObject splitQuery = mongoSplit.getQuerySpec();
                    // Step 1: create a query range for _id:
                    BasicDBObject idRange = null;
                    Object idMin = (min == null) ? null : min.get(DocumentPojo._id_);
                    Object idMax = (max == null) ? null : max.get(DocumentPojo._id_);
                    if (!(idMin instanceof ObjectId))
                        idMin = null;
                    if (!(idMax instanceof ObjectId))
                        idMax = null;

                    if ((null != idMin) || (null != idMax)) {
                        idRange = new BasicDBObject();
                        if (null != idMin) {
                            idRange.put(DbManager.gte_, idMin);
                        }
                        if (null != idMax) {
                            idRange.put(DbManager.lt_, idMax);
                        }
                    } //TESTED                  

                    // Step 2: merge with whatever we have at the moment:
                    if (null != idRange) {
                        BasicDBObject splitQueryQuery = new BasicDBObject(
                                (BasicBSONObject) splitQuery.get("$query"));
                        Object idQueryElement = splitQueryQuery.get(DocumentPojo._id_);
                        boolean convertedAwayFromMinMax = false;
                        if (null == idQueryElement) { // nice and easy, add _id range
                            splitQueryQuery.put(DocumentPojo._id_, idRange);
                            convertedAwayFromMinMax = true;
                        } //TESTED
                        else if (!splitQueryQuery.containsField(DbManager.and_)) { // OK, we're just going to make life easy
                            splitQueryQuery.remove(DocumentPojo._id_);
                            splitQueryQuery.put(DbManager.and_,
                                    Arrays.asList(new BasicDBObject(DocumentPojo._id_, idQueryElement),
                                            new BasicDBObject(DocumentPojo._id_, idRange)));
                            convertedAwayFromMinMax = true;
                        } //TESTED
                          // (else stick with min/max)

                        if (convertedAwayFromMinMax) { // can construct an _id query
                            splitQuery.removeField("$min");
                            splitQuery.removeField("$max");
                        } //TESTED
                        splitQuery.put("$query", splitQueryQuery);
                    }
                    newsplits.add(new InfiniteMongoInputSplit(mongoSplit, conf.isNoTimeout()));
                } //TESTED         
            } catch (Exception e) {
                //DEBUG
                //e.printStackTrace();
            } // do nothing, must be some other type of input split
        } //TESTED

        //DEBUG
        //System.out.println("Calculating splits via mongo-hadoop: " + initialSplitSize + " reduced to " + splits.size());

        _logger.info("Calculating (converted) splits via mongo-hadoop: " + initialSplitSize + " reduced to "
                + newsplits.size());
        return newsplits;
    }
}
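
As noted in the prototype, this getCount overload is deprecated. For comparison, here is a sketch of the equivalent bounded count against the modern com.mongodb.client API (driver 3.8+); the database, collection, and field names are again hypothetical.

import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoClients;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.model.CountOptions;
import org.bson.Document;

public class CountDocumentsExample {
    public static void main(String[] args) {
        // MongoClient is AutoCloseable in the modern driver
        try (MongoClient client = MongoClients.create()) {
            MongoCollection<Document> coll = client
                    .getDatabase("testdb")   // hypothetical database name
                    .getCollection("docs");  // hypothetical collection name

            // Equivalent of getCount(query, null, 100, 10)
            long count = coll.countDocuments(
                    new Document("status", "published"),
                    new CountOptions().limit(100).skip(10));
            System.out.println("Matched (capped at 100): " + count);
        }
    }
}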