Example usage for com.mongodb BasicDBObject getLong

Introduction

This page shows example usage of com.mongodb.BasicDBObject.getLong.

Prototype

public long getLong(final String key, final long def) 

Document

Returns the value of a field as a long; if the field is missing, the supplied default def is returned.
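
As a minimal sketch of the two-argument overload (the field names here are illustrative): a present numeric value is widened to a long via Number.longValue(), while a missing field falls back to the supplied default.

import com.mongodb.BasicDBObject;

public class GetLongExample {
    public static void main(String[] args) {
        // A value stored as an int is still readable as a long:
        BasicDBObject dbo = new BasicDBObject("doccount", 42);
        long count = dbo.getLong("doccount", 0L);       // 42
        // Missing field: the default is returned instead:
        long missing = dbo.getLong("nosuchfield", -1L); // -1
        System.out.println(count + " " + missing);
    }
}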

Usage

From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License: Apache License

/**
 * Checks if the new params MAX_SPLITS and MAX_DOCS_PER_SPLIT are set
 * in the config.  If they are, it will use those to do splits via limit/skip;
 * otherwise it will call the previous chunking splitter in MongoSplitter.
 * @param conf the job configuration
 * @return the list of input splits
 */
public static List<InputSplit> calculateSplits(InfiniteMongoConfig conf) {
    // First off: What is our sharding scheme?

    boolean shardingPolicyNew = false;
    try {
        BasicDBObject shardQuery = new BasicDBObject("_id", "doc_metadata.metadata");
        BasicDBObject shardInfo = (BasicDBObject) DbManager.getCollection("config", "collections")
                .findOne(shardQuery);
        if (null != shardInfo) {
            BasicDBObject shardInfoKey = (BasicDBObject) shardInfo.get("key");
            if (null != shardInfoKey) {
                shardingPolicyNew = (shardInfoKey.size() > 1);
            }
        }
    } //TESTED (new and old)
    catch (Exception e) {
    } // stick with the old sharding, it's probably going to die soon after though, honestly

    // conf.getQuery returns a new copy of the query, so get once and use everywhere...
    BasicDBObject confQuery = (BasicDBObject) conf.getQuery();

    BasicDBObject srcTagsQuery = (BasicDBObject) conf.getSourceTags();

    String collection = conf.getInputURI().getCollection();
    if (!collection.equals(DbManager.getDocument().getContent().getName())
            && !collection.equals(DbManager.getDocument().getMetadata().getName())) {
        // Case 1: feature table or custom table
        // Just run legacy code
        return calculateSplits_phase2(conf, confQuery, false, false, null);
    } else { // complex cases...
        boolean simpleOtherIndex = false;
        // Check whether a simple query has been performed on a different indexed field         
        if (null == srcTagsQuery) { // (if srcTags specified, then going to want to use sourceKey as the index)
            for (String s : Arrays.asList(EntityPojo.docQuery_index_, DocumentPojo.url_)) {
                Object selector = confQuery.get(s);
                if (selector instanceof String) {
                    simpleOtherIndex = true;
                    break;
                } else if (selector instanceof DBObject) {
                    DBObject selectorDbo = (DBObject) selector;
                    if (selectorDbo.containsField(DbManager.in_)) {
                        simpleOtherIndex = true;
                        break;
                    }
                }
            } //TESTED (both types, plus check complex indexes don't work)         
              // ALLOWED: {"entities.index": { "$in": [ "xxx", "yyy"] }, {"entities.index": "xxx" }, ditto for "url"
              // NOT ALLOWED: { "entities.index": { "$ne": "xxx" } }
        }
        //TESTED check ignored if eg entity_index specified

        if (simpleOtherIndex) {
            // Case 2: we have a simple query on an indexed field 
            // Just run legacy code

            return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
        } //TESTED
        else if (conf.getLimit() > 0) { // debug
            //Case 3: Ensure we have small sets of sources to search over
            BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery, srcTagsQuery,
                    conf.getMaxDocsPerSplit());
            final List<InputSplit> splits = new ArrayList<InputSplit>();

            boolean queryNonTrivial = isQueryNonTrivial(confQuery);
            if (!queryNonTrivial) {
                //Case 3a: query is trivial, so can just create splits directly from the split pre-calcs
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();

                    int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    int toGet = (docCount > toProcess) ? toProcess : docCount;
                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(), modQuery,
                                conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                        toProcess -= docCount;
                    }
                } //TESTED
            } else {
                // Case 3b: annoying, some extra query terms, gonna need to do it the hard way...
                int toProcess = conf.getLimit();
                Iterator<Object> itSplit = collectionOfSplits.iterator();
                DBCollection coll = InfiniteMongoConfigUtil.getCollection(conf.getInputURI());
                while ((toProcess > 0) && (itSplit.hasNext())) {
                    BasicDBObject split = (BasicDBObject) itSplit.next();

                    BasicDBObject modQuery = convertQuery(confQuery, split.get(DocumentPojo.sourceKey_));
                    if (null != modQuery) {
                        int docsCounted = (int) coll.getCount(modQuery, null, toProcess, 0);
                        int toGet = (docsCounted > toProcess) ? toProcess : docsCounted;
                        if (docsCounted > 0) {
                            splits.add(new InfiniteMongoInputSplit(conf.getInputURI(), conf.getInputKey(),
                                    modQuery, conf.getFields(), conf.getSort(), toGet, 0, conf.isNoTimeout()));
                            toProcess -= docsCounted;
                        }
                    } //TESTED
                }
            } //TESTED

            return splits;
        } else { // More complex cases:

            if (shardingPolicyNew) {
                // Case 4a: NEW SHARDING SCHEME

                // Always fetch the new sources, eg convert communityId to sourceKeys
                try {
                    splitPrecalculations_newShardScheme(confQuery, srcTagsQuery); // (modifies confQuery if returns true)            
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);

                    return calculateSplits_phase2(conf, confQuery, !queryNonTrivial, shardingPolicyNew, null);

                    // (ie trivial query => always use chunks, bypass skip/limit test)
                } //TESTED (trivial + non-trivial)
                catch (Exception e) { // Didn't match any sources, no problem
                    return new ArrayList<InputSplit>();
                } //TESTED

            } //TESTED
            else {

                BasicDBList collectionOfSplits = splitPrecalculations_oldShardSchemeOrDebug(confQuery,
                        srcTagsQuery, conf.getMaxDocsPerSplit());

                if (null == collectionOfSplits) {
                    // Case 4b: OLD SHARDING SCHEME can't get a partition by source keys, just back off to old code
                    return calculateSplits_phase2(conf, confQuery, false, shardingPolicyNew, null);
                } //TESTED (old code)
                else {
                    conf.setMaxDocsPerSplit(2 * conf.getMaxDocsPerSplit());
                    // (because we stop creating splits when they exceed the size)

                    // Case 4c: OLD SHARDING SCHEME, have a source key partition
                    int nMaxCount = 1 + conf.getMaxDocsPerSplit() * conf.getMaxSplits();
                    boolean queryNonTrivial = isQueryNonTrivial(confQuery);
                    final List<InputSplit> splits = new ArrayList<InputSplit>();

                    BasicDBObject savedQuery = confQuery;

                    Iterator<Object> itSplit = collectionOfSplits.iterator();
                    BasicDBList bigSplit = null;
                    while (itSplit.hasNext()) {
                        BasicDBObject split = (BasicDBObject) itSplit.next();
                        int docCount = (int) split.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                        if (docCount < nMaxCount) { // small split, will use skip/limit
                            BasicDBObject modQuery = convertQuery(savedQuery,
                                    split.get(DocumentPojo.sourceKey_));
                            if (null != modQuery) {

                                final int SPLIT_THRESHOLD = 3;
                                // A few cases:
                                if ((docCount < (SPLIT_THRESHOLD * conf.getMaxDocsPerSplit()))
                                        || !queryNonTrivial) {
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false,
                                            shardingPolicyNew, (Integer) docCount));
                                } //TESTED (based on limit, based on query)
                                else {
                                    // My guess at the point at which you might as well do the full query in the hope you're going
                                    // to save some (empty) splits
                                    splits.addAll(calculateSplits_phase2(conf, modQuery, false,
                                            shardingPolicyNew, null));
                                } //TESTED
                            } //TESTED
                        } else { // large split, combine all these guys into an array of source keys
                            if (null == bigSplit) {
                                bigSplit = new BasicDBList();
                            }
                            bigSplit.add(split.get(DocumentPojo.sourceKey_));
                            // (guaranteed to be a single element)
                        }
                    } //(end loop over collections)

                    if (null != bigSplit) {

                        // If we have a big left over community then create a set of splits for that - always chunks if query trivial
                        if (1 == bigSplit.size()) {
                            confQuery.put(DocumentPojo.sourceKey_, bigSplit.iterator().next());
                        } else {
                            confQuery.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, bigSplit));
                        }
                        splits.addAll(calculateSplits_phase2(conf, confQuery, !queryNonTrivial,
                                shardingPolicyNew, null));
                    } //TESTED: singleton+trivial (sandy), array+trivial (sentiment/enron), array+non-trivial (sentiment/enron, docGeo), singleton+non-trivial (sandy, docGeo)

                    return splits;

                } //TESTED: end if Cases 4a, 4b, 4c

            } //(end if old vs new sharding policy)

        } //(non-debug case)
    } //(content or metadata table are most complex)
}
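
In the splitter above, the focal call is split.getLong(SourceHarvestStatusPojo.doccount_, 0L): each pre-calculated split carries a per-source document count, and the 0L default keeps the budget arithmetic safe when no count was recorded. A condensed sketch of that pattern (the "doccount" key stands in for SourceHarvestStatusPojo.doccount_):

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

public class SplitBudgetSketch {
    public static void main(String[] args) {
        BasicDBList collectionOfSplits = new BasicDBList();
        collectionOfSplits.add(new BasicDBObject("doccount", 30L));
        collectionOfSplits.add(new BasicDBObject());                // no count recorded
        collectionOfSplits.add(new BasicDBObject("doccount", 50L));

        int toProcess = 60; // overall limit, as in Case 3a above
        for (Object o : collectionOfSplits) {
            if (toProcess <= 0) break;
            BasicDBObject split = (BasicDBObject) o;
            int docCount = (int) split.getLong("doccount", 0L);    // 0 when absent
            int toGet = (docCount > toProcess) ? toProcess : docCount;
            System.out.println("split gets " + toGet + " of " + docCount + " docs");
            toProcess -= docCount;
        }
    }
}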

From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter.java

License: Apache License

@SuppressWarnings("unchecked")
public static BasicDBList splitPrecalculations_oldShardSchemeOrDebug(BasicDBObject query,
        BasicDBObject srcTagsQuery, int maxCountPerTask) {
    // Get the communityIds from the query
    Collection<ObjectId> communityIds = null;
    try {
        BasicDBObject communityIdsIn = (BasicDBObject) query.get(DocumentPojo.communityId_);
        communityIds = (Collection<ObjectId>) communityIdsIn.get(DbManager.in_);
        if (null == communityIds) {
            return null;
        }
    } catch (Exception e) {
        return null; // back out
    }

    BasicDBObject keyQuery = new BasicDBObject(SourcePojo.communityIds_,
            new BasicDBObject(DbManager.in_, communityIds));
    BasicDBObject keyFields = new BasicDBObject(SourcePojo.key_, 1);
    keyFields.put(SourceHarvestStatusPojo.sourceQuery_doccount_, 1);
    BasicDBObject sortFields = new BasicDBObject(SourcePojo.key_, 1);

    // Get and remove the sourceKey information, incorporate into source query:
    Object sourceKeyQueryTerm = query.get(DocumentPojo.sourceKey_);
    if (null != sourceKeyQueryTerm) {
        keyQuery.put(SourcePojo.key_, sourceKeyQueryTerm);
    } //TESTED
    if (null != srcTagsQuery) {
        keyQuery.put(SourcePojo.tags_, srcTagsQuery.get(SourcePojo.tags_));
    } //TESTED

    DBCursor dbc = MongoDbManager.getIngest().getSource().find(keyQuery, keyFields).sort(sortFields);
    // (note the sort is needed so that the potentially expensive doc query has a sensibly ordered $in clause)
    if (dbc.count() > 5000) {
        // (too many source keys to process, just going to leave well alone... note this means $srctags will fail open)
        return null;
    } else {
        //TreeMap<String, Long> sourceKeys = new TreeMap<String, Long>();
        // Build collections of objects of format { sourceKey: string or [], totalDocs }
        BasicDBList sourceKeyListCollection = new BasicDBList();
        BasicDBList sourceKeyList = null;
        int runningDocs = 0;
        int runningSources = 0;
        while (dbc.hasNext()) {
            BasicDBObject dbo = (BasicDBObject) dbc.next();
            String sourceKey = (String) dbo.get(SourcePojo.key_);
            if (null != sourceKey) {
                long docCount = 0L;
                try {
                    BasicDBObject harvestStatus = (BasicDBObject) dbo.get(SourcePojo.harvest_);
                    if (null != harvestStatus) {
                        docCount = harvestStatus.getLong(SourceHarvestStatusPojo.doccount_, 0L);
                    }
                } catch (Exception e) {
                }

                //DEBUG
                //System.out.println("SOURCE=" + sourceKey + " DOC_COUNT=" + docCount + " RUNNING=" + runningDocs +"," + runningSources + ": " + sourceKeyList);

                if (docCount > maxCountPerTask) { // source is large enough by itself
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKey);
                    collection.put(SourceHarvestStatusPojo.doccount_, docCount);
                    sourceKeyListCollection.add(collection);
                    // (leaving running* alone, can keep building that)
                } //TESTED (by eye, system community of demo cluster)
                else if ((runningDocs + docCount) > maxCountPerTask) { // have now got a large enough collection of sources 
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else if (runningSources >= 15) { // have a limit on the number of sources per query, to keep the queries manageable
                    sourceKeyList.add(sourceKey);
                    // Create collection
                    BasicDBObject collection = new BasicDBObject();
                    collection.put(DocumentPojo.sourceKey_, sourceKeyList);
                    collection.put(SourceHarvestStatusPojo.doccount_, runningDocs + docCount);
                    sourceKeyListCollection.add(collection);
                    sourceKeyList = null;
                    runningDocs = 0;
                    runningSources = 0;
                } //TESTED (by eye, system community of demo cluster)
                else { // (keep) build(ing) list
                    if (null == sourceKeyList) {
                        sourceKeyList = new BasicDBList();
                    }
                    sourceKeyList.add(sourceKey);
                    runningDocs += docCount;
                    runningSources++;
                } //TESTED (by eye, system community of demo cluster)
            } //(end if has source key)
        } //(end loop over cursor)

        // Finish off:
        if (null != sourceKeyList) {
            // Create collection
            BasicDBObject collection = new BasicDBObject();
            collection.put(DocumentPojo.sourceKey_, sourceKeyList);
            collection.put(SourceHarvestStatusPojo.doccount_, runningDocs);
            sourceKeyListCollection.add(collection);
        } //TESTED (by eye, system community of demo cluster)

        if (sourceKeyListCollection.isEmpty()) { // query returns empty
            throw new RuntimeException("Communities contain no sources");
        }
        return sourceKeyListCollection;

    } // (end if too many source keys across the communities)
}
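
Note why splitPrecalculations_oldShardSchemeOrDebug guards its getLong call with a try/catch: the default only covers a missing field. Based on the BasicBSONObject implementation, a value of the wrong type (e.g. a string) is cast to Number internally and throws a ClassCastException rather than falling back to def, as this sketch illustrates:

import com.mongodb.BasicDBObject;

public class GetLongFailureSketch {
    public static void main(String[] args) {
        BasicDBObject harvestStatus = new BasicDBObject("doccount", "not-a-number");
        long docCount = 0L;
        try {
            // Only a *missing* field yields the default; a non-numeric value
            // fails the internal (Number) cast and throws.
            docCount = harvestStatus.getLong("doccount", 0L);
        } catch (Exception e) {
            // Same defensive fallback as in splitPrecalculations_oldShardSchemeOrDebug.
        }
        System.out.println(docCount); // 0
    }
}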