Example usage for com.mongodb DBCollection getDBDecoderFactory

List of usage examples for com.mongodb DBCollection getDBDecoderFactory

Introduction

On this page you can find example usage for com.mongodb DBCollection getDBDecoderFactory.

Prototype

public synchronized DBDecoderFactory getDBDecoderFactory() 

Document

Get the decoder factory for this collection.
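Both examples below follow the same pattern: read the collection's current decoder factory with getDBDecoderFactory(), install a custom decoder for the duration of a query, then restore the saved default in a finally block (the setting is per-collection, not per-query). A minimal sketch of that pattern, assuming a local MongoDB instance; the "test" database and "docs" collection names are placeholders, and LazyDBDecoder stands in for whatever custom decoder you would actually install:

import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBDecoderFactory;
import com.mongodb.LazyDBDecoder;
import com.mongodb.MongoClient;

public class DecoderFactorySketch {
    public static void main(String[] args) {
        // Placeholder database/collection names for this sketch
        DBCollection coll = new MongoClient().getDB("test").getCollection("docs");

        // Save the collection's current decoder factory so it can be restored afterwards
        DBDecoderFactory defaultDecoder = coll.getDBDecoderFactory();
        try {
            // Temporarily swap in a different decoder (lazy decoding used here only as an example)
            coll.setDBDecoderFactory(LazyDBDecoder.FACTORY);
            DBCursor cursor = coll.find();
            while (cursor.hasNext()) {
                System.out.println(cursor.next());
            }
        } finally {
            // Restore the original factory; the change affects every reader of this collection
            coll.setDBDecoderFactory(defaultDecoder);
        }
    }
}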

Usage

From source file: com.ikanow.infinit.e.api.knowledge.processing.ScoringUtils.java

License: Open Source License

@SuppressWarnings("unchecked")
private void stage1_initialCountingLoop(DBCursor docs, AdvancedQueryPojo.QueryScorePojo scoreParams,
        int toReturn, StatisticsPojo scores, LinkedList<BasicDBObject> standaloneEventsReturn,
        int nCommunities) {
    double s0_nQuerySubsetDocCountInv = 1.0 / (double) _s0_nQuerySubsetDocCount;

    // Some memory management:
    DBCollection dbc = MongoDbManager.getDocument().getMetadata();
    DBDecoderFactory defaultDecoder = dbc.getDBDecoderFactory();

    try {
        SizeReportingBasicBSONDecoder sizeReportingDecoder = new SizeReportingBasicBSONDecoder();
        dbc.setDBDecoderFactory(sizeReportingDecoder);

        long currMemUsage = 0;
        int ndocs = 0;
        long lastBatch = 0L;

        long initialUnusedMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory();
        long initialFreeMemory = Runtime.getRuntime().freeMemory();

        for (DBObject f0 : docs) {
            BasicDBObject f = (BasicDBObject) f0;
            long newMemUsage = sizeReportingDecoder.getSize();
            if ((newMemUsage - currMemUsage) > 0) { // check every batch               
                long now = new Date().getTime();

                //DEBUG
                //logger.warn(ndocs + " : " + (now - lastBatch) + " : " + newMemUsage + " VS " + Runtime.getRuntime().maxMemory() + " UNUSED " + (Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory()) + " FREE " + Runtime.getRuntime().freeMemory());

                // Check vs total memory:
                long runtimeMem = Runtime.getRuntime().maxMemory();
                // note newMemUsage is the input memory ... gets expanded ~6x by the BSON-ification, allowed at most 1/4 of memory...
                // Also if we're taking more than 20s for a batch then limp over the limit and exit...
                if (((newMemUsage * 24) > runtimeMem)
                        || (((now - lastBatch) > 20000L) && (ndocs >= toReturn))) {
                    long finalUnusedMemory = Runtime.getRuntime().maxMemory()
                            - Runtime.getRuntime().totalMemory();
                    long finalFreeMemory = Runtime.getRuntime().freeMemory();

                    logger.error("Query truncated memUsage=" + newMemUsage + ", memory=" + runtimeMem
                            + ", docs=" + ndocs + ", totaldocs=" + scores.found + ", init_free_mem="
                            + initialFreeMemory + ", end_free_mem=" + finalFreeMemory + ", init_unused_mem="
                            + initialUnusedMemory + ", end_unused_mem=" + finalUnusedMemory);
                    break;
                } //TESTED
                currMemUsage = newMemUsage;
                lastBatch = now;
            } //TESTED
            ndocs++;

            // Simple handling for standalone events
            if ((null != _s0_standaloneEventAggregator) && !_s0_bNeedToCalcSig) {
                //if _s0_bNeedToCalcSig then do this elsewhere
                ScoringUtils_Associations.addStandaloneEvents(f, 0.0, 0, _s0_standaloneEventAggregator,
                        _s0_bEntityTypeFilterPositive, _s0_bAssocVerbFilterPositive, _s0_entityTypeFilter,
                        _s0_assocVerbFilter, _s0_bEvents, _s0_bSummaries, _s0_bFacts);
            } //TESTED

            if (!_s0_bNeedToCalcSig) {
                continue;
            } //TESTED

            if (nCommunities > 1) { // (could have pan-community entities)
                ObjectId communityId = (ObjectId) f.get(DocumentPojo.communityId_);
                if (null != communityId) { // (have big problems if so, but anyway!)
                    int retval = _s0_multiCommunityHandler.community_getIdAndInitialize(communityId,
                            _s1_entitiesInDataset);
                    // (returns an int community id but also sets it into the cache, so just use that below)
                    if (Integer.MIN_VALUE == retval) {
                        //this document cannot be viewed from within this set of communities
                        continue;
                    }
                }
            } //TESTED      

            TempDocBucket docBucket = new TempDocBucket();
            docBucket.dbo = f;
            ObjectId id = (ObjectId) f.get(DocumentPojo._id_);

            // If we're going to weight relevance in, or we need the geo temporal decay:
            if ((0 != scoreParams.relWeight) || (null != scoreParams.timeProx)
                    || (null != scoreParams.geoProx)) {
                StatisticsPojo.Score scoreObj = scores.getScore().get(id);
                if (null != scoreObj) {
                    docBucket.explain = scoreObj.explain; // (will normally be null)
                    docBucket.luceneScore = scoreObj.score;
                    if ((null != scoreParams.timeProx) || (null != scoreParams.geoProx)) {
                        if (scoreObj.decay >= 0.0) {
                            docBucket.geoTemporalDecay = scoreObj.decay;
                        }
                        // (see also below for low accuracy geo scoring)
                    }
                } else {
                    docBucket.luceneScore = 1.0;
                }
            } //TESTED
            else if (this._s0_sortingByDate) {
                StatisticsPojo.Score scoreObj = scores.getScore().get(id);
                if (null != scoreObj) {
                    docBucket.nLuceneIndex = scoreObj.nIndex;
                }
            }
            docBucket.manualWeighting = this.getManualScoreWeights(scoreParams, f);

            BasicDBList l = (BasicDBList) (f.get(DocumentPojo.entities_));
            if (null != l) {

                long nEntsInDoc = l.size();
                double dBestGeoScore = 0.0; // (for low accuracy geo only)
                for (Iterator<?> e0 = l.iterator(); e0.hasNext();) {
                    BasicDBObject e = (BasicDBObject) e0.next();
                    BasicDBObject tmpGeotag = null;
                    if (_s3_bLowAccuracyGeo || (null != _s1_dManualGeoDecay_latLonInvdecay)) {
                        // low accuracy geo, need to look for geotag
                        tmpGeotag = (BasicDBObject) e.get(EntityPojo.geotag_);
                    }

                    // Get attributes

                    double freq = -1.0;
                    long ntotaldoccount = -1;
                    String entity_index;
                    Double sentiment = null;
                    try {
                        sentiment = (Double) e.get(EntityPojo.sentiment_);
                        ntotaldoccount = e.getLong(EntityPojo.doccount_);
                        freq = e.getDouble(EntityPojo.frequency_);
                        entity_index = e.getString(EntityPojo.index_);
                        if (null == entity_index) {
                            // Just bypass the entity 
                            e.put(EntityPojo.significance_, 0.0);
                            nEntsInDoc--;
                            continue;
                        }
                    } catch (Exception ex) {
                        try {
                            String sfreq;
                            if (ntotaldoccount < 0) {
                                sfreq = e.getString(EntityPojo.doccount_);
                                ntotaldoccount = Long.valueOf(sfreq);
                            }
                            if (freq < -0.5) {
                                sfreq = e.getString(EntityPojo.frequency_);
                                freq = Long.valueOf(sfreq).doubleValue();
                            }
                            entity_index = e.getString(EntityPojo.index_);
                            if (null == entity_index) {
                                // Just bypass the entity 
                                e.put(EntityPojo.significance_, 0.0);
                                nEntsInDoc--;
                                continue;
                            }
                        } catch (Exception e2) {
                            // Just bypass the entity 
                            e.put(EntityPojo.significance_, 0.0);
                            nEntsInDoc--;
                            continue;
                        }
                    } //TESTED

                    // First loop through is just counting

                    // Retrieve entity (create/initialize if necessary)
                    EntSigHolder shp = _s1_entitiesInDataset.get(entity_index);
                    if (null == shp) {
                        if (ntotaldoccount > (long) _s0_globalDocCount) { // obviously can't have more entities-in-docs than docs...
                            ntotaldoccount = (long) _s0_globalDocCount;
                        }
                        shp = new EntSigHolder(entity_index, ntotaldoccount, _s0_multiCommunityHandler);

                        // Stage 1a alias handling: set up infrastructure, calculate doc overlap
                        if (null != _s1_aliasLookup) {
                            stage1_initAlias(shp);
                        }
                        if ((null != shp.aliasInfo) && (null == shp.masterAliasSH)) { // this is the discard alias
                            nEntsInDoc--;
                            continue;
                        } //TESTED

                        // Check if entity is in type filter list
                        if (null != _s0_entityTypeFilter) {
                            String entType = null;
                            if (null != shp.aliasInfo) {
                                entType = shp.aliasInfo.getType();
                            } else {
                                entType = e.getString(EntityPojo.type_);
                            }
                            if (_s0_bEntityTypeFilterPositive) {
                                if ((null != entType)
                                        && !_s0_entityTypeFilter.contains(entType.toLowerCase())) {
                                    nEntsInDoc--;
                                    continue;
                                }
                            } else if ((null != entType)
                                    && _s0_entityTypeFilter.contains(entType.toLowerCase())) {
                                //(negative filter)
                                nEntsInDoc--;
                                continue;
                            }

                        } //TESTED (end entity filter)

                        // Geo:
                        if (null != shp.aliasInfo) {
                            if (null != shp.aliasInfo.getGeotag()) { //Geo, overwrite/create tmpGeotag
                                if (_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo
                                        || (null != _s1_dManualGeoDecay_latLonInvdecay)) {
                                    // Always capture alias geo, even if not in low accuracy mode because we add it to the 
                                    // legitimate geo:
                                    if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo)
                                            && (null == _s3_geoBuckets)) {
                                        // Initialize the buckets if this is for aggregation not just decay
                                        _s3_geoBuckets = (LinkedList<EntSigHolder>[]) new LinkedList[_s3_nGEO_BUCKETS];
                                    }

                                    if (null == tmpGeotag) {
                                        tmpGeotag = new BasicDBObject();
                                    }
                                    tmpGeotag.put(GeoPojo.lat_, shp.aliasInfo.getGeotag().lat);
                                    tmpGeotag.put(GeoPojo.lon_, shp.aliasInfo.getGeotag().lon);

                                    if (null != shp.aliasInfo.getOntology_type()) {
                                        e.put(EntityPojo.ontology_type_, shp.aliasInfo.getOntology_type());
                                    }
                                }
                            }
                        } //TESTED (end geo for aggregation or decay)

                        _s1_entitiesInDataset.put(entity_index, shp);
                        // end Stage 1a alias handling
                    } //(end if is alias)

                    // Stage 1b alias handling: calculate document counts (taking overlaps into account)
                    if (null != shp.masterAliasSH) {
                        // Counts:
                        shp.masterAliasSH.nTotalDocCount++;
                        // docs including overlaps
                        shp.masterAliasSH.avgFreqOverQuerySubset += freq;

                        // Keep track of overlaps:
                        if (f != shp.masterAliasSH.unusedDbo) {
                            shp.masterAliasSH.unusedDbo = f;
                            // (note this is only used in stage 1, alias.unusedDbo is re-used differently in stage 3/4)
                            shp.masterAliasSH.nDocCountInQuerySubset++;
                            // non-overlapping docs ie < shp.nDocCountInQuerySubset
                        }

                        // Sentiment:
                        shp.masterAliasSH.positiveSentiment += shp.positiveSentiment;
                        shp.masterAliasSH.negativeSentiment += shp.negativeSentiment;
                        if (null != sentiment) {
                            shp.masterAliasSH.nTotalSentimentValues++;
                        }

                    } //TESTED (end if is alias)
                      // end Stage 1b

                    // Pan-community logic (this needs to be before the entity object is updated)
                    if (_s0_multiCommunityHandler.isActive()) {
                        _s0_multiCommunityHandler.community_updateCorrelations(shp, ntotaldoccount,
                                entity_index);
                    } else { // (Once we've started multi-community logic, this is no longer desirable)
                        if ((ntotaldoccount > shp.nTotalDocCount) && (ntotaldoccount <= _s0_globalDocCount)) {
                            shp.nTotalDocCount = ntotaldoccount;
                        }
                        //(note there used to be some cases where we adjusted for dc/tf==0, but the 
                        // underlying issue in the data model that caused this has been fixed, so it's 
                        // now a pathological case that can be ignored)
                    } //(TESTED)

                    // Update counts:
                    _s1_sumFreqInQuerySubset += freq;
                    shp.avgFreqOverQuerySubset += freq;
                    shp.nDocCountInQuerySubset++;
                    shp.decayedDocCountInQuerySubset += docBucket.geoTemporalDecay;
                    // (note this doesn't handle low accuracy geo-decay ... we'll address that via a separate term)

                    TempEntityInDocBucket entBucket = new TempEntityInDocBucket();
                    entBucket.dbo = e;
                    entBucket.freq = freq;
                    entBucket.doc = docBucket;
                    shp.entityInstances.add(entBucket);
                    if (null != tmpGeotag) { // (only needed for low accuracy geo aggregation)

                        if ((_s3_bLowAccuracyGeo || _s3_bExtraAliasGeo) && (null == shp.geotag)) { // (first time for shp only)
                            shp.geotag = tmpGeotag;
                            shp.geotaggedEntity = e; // (ie for onto type, which has been overwritten in the alias case...)
                        }
                        if (null != _s1_dManualGeoDecay_latLonInvdecay) {
                            // Emulate scripted Lucene calculations
                            double minlat = tmpGeotag.getDouble(GeoPojo.lat_);
                            double minlon = tmpGeotag.getDouble(GeoPojo.lon_);
                            double paramlat = _s1_dManualGeoDecay_latLonInvdecay[0];
                            double paramlon = _s1_dManualGeoDecay_latLonInvdecay[1];
                            double gdecay = _s1_dManualGeoDecay_latLonInvdecay[2];
                            char ontCode = GeoOntologyMapping
                                    .encodeOntologyCode(e.getString(EntityPojo.ontology_type_));
                            double dDecay = QueryDecayScript.getGeoDecay(minlat, minlon, paramlat, paramlon,
                                    gdecay, ontCode);
                            if (dDecay > dBestGeoScore) {
                                dBestGeoScore = dDecay;
                            }
                        } //TESTED
                    } //(end if entity has geo and need to process entity geo)

                    if (freq > shp.maxFreq) {
                        shp.maxFreq = freq;
                    }
                    // Sentiment:
                    if ((null != sentiment) && (Math.abs(sentiment) <= 1.1)) { // (actually 1.0)
                        shp.nTotalSentimentValues++;
                        if (sentiment > 0.0) {
                            shp.positiveSentiment += sentiment;
                        } else {
                            shp.negativeSentiment += sentiment;
                        }
                    } else if (null != sentiment) { // corrupt sentiment for some reason?!
                        e.put(EntityPojo.sentiment_, null);
                    }
                    docBucket.docLength += freq;

                } //(end loop over entities)

                docBucket.nLeftToProcess = nEntsInDoc;
                docBucket.nEntsInDoc = (int) nEntsInDoc;

                if (null != this._s1_dManualGeoDecay_latLonInvdecay) { // Low accuracy geo-calculations
                    docBucket.geoTemporalDecay *= dBestGeoScore;
                    docBucket.luceneScore *= dBestGeoScore;
                    _s2_dAvgLowAccuracyGeoDecay += dBestGeoScore * s0_nQuerySubsetDocCountInv;
                } //TESTED            

            } // (end if feed has entities)

            // Handle documents with no entities - can still promote them
            if (0 == docBucket.nLeftToProcess) { // (use this rather than doc length in case all the entities had freq 0)
                _s1_noEntityBuckets.add(docBucket);
            }

        } // (end loop over feeds)
          //TESTED
    } finally {
        dbc.setDBDecoderFactory(defaultDecoder);
    }
}

From source file: com.ikanow.infinit.e.processing.custom.utils.CustomApiUtils.java

License: Apache License

public static void getJobResults(ResponsePojo rp, CustomMapReduceJobPojo cmr, int limit, String fields,
        String findStr, String sortStr, boolean bCsv) {

    BasicDBObject queryDbo = null;
    if (null != findStr) {
        queryDbo = (BasicDBObject) com.mongodb.util.JSON.parse(findStr);
    } else {
        queryDbo = new BasicDBObject();
    } //TOTEST

    BasicDBObject fieldsDbo = new BasicDBObject();
    if (null != fields) {
        fieldsDbo = (BasicDBObject) com.mongodb.util.JSON.parse("{" + fields + "}");
    }

    //return the results:

    // Need to handle sorting...
    BasicDBObject sort = null;
    if (null != sortStr) { //override
        sort = (BasicDBObject) com.mongodb.util.JSON.parse(sortStr);
    } else { //defaults
        String sortField = "_id";
        int sortDir = 1;
        BasicDBObject postProcObject = (BasicDBObject) com.mongodb.util.JSON.parse(
                InfiniteHadoopUtils.getQueryOrProcessing(cmr.query, InfiniteHadoopUtils.QuerySpec.POSTPROC));
        if (postProcObject != null) {
            sortField = postProcObject.getString("sortField", "_id");
            sortDir = postProcObject.getInt("sortDirection", 1);
        } //TESTED (post proc and no post proc)
        sort = new BasicDBObject(sortField, sortDir);
    } //TOTEST

    // Case 1: DB
    rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", true,
            "Map reduce job completed at: " + cmr.lastCompletionTime));
    if ((null == cmr.exportToHdfs) || !cmr.exportToHdfs) {
        DBCursor resultCursor = null;
        DBCollection coll = DbManager.getCollection(cmr.getOutputDatabase(), cmr.outputCollection);
        DBDecoderFactory defaultDecoder = coll.getDBDecoderFactory();
        CsvGeneratingBsonDecoder csvDecoder = null;
        SizeReportingBasicBSONDecoder sizeDecoder = null;
        CustomMapReduceResultPojo cmrr = new CustomMapReduceResultPojo();
        try {
            if (bCsv) {
                coll.setDBDecoderFactory((csvDecoder = new CsvGeneratingBsonDecoder()));
            } else {
                coll.setDBDecoderFactory((sizeDecoder = new SizeReportingBasicBSONDecoder()));
            }
            if (limit > 0) {
                resultCursor = coll.find(queryDbo, fieldsDbo).sort(sort).limit(limit);
            } else {
                resultCursor = coll.find(queryDbo, fieldsDbo).sort(sort);
            }
            LinkedList<BasicDBObject> list = null;
            if (!bCsv) {
                list = new LinkedList<BasicDBObject>();
            }
            final int MAX_SIZE_CSV = 80 * 1024 * 1024; //(80MB)
            final int MAX_SIZE_JSON = 80 * 1024 * 1024; //(80MB)
            while (resultCursor.hasNext()) {
                BasicDBObject x = (BasicDBObject) resultCursor.next();
                if (!bCsv) {
                    list.add(x);
                }
                if (null != csvDecoder) {
                    if (csvDecoder.getCsv().length() > MAX_SIZE_CSV) {
                        break;
                    }
                } else if (null != sizeDecoder) {
                    if (sizeDecoder.getSize() > MAX_SIZE_JSON) {
                        break;
                    }
                }
            }
            cmrr.results = list;
        } finally {
            coll.setDBDecoderFactory(defaultDecoder);
        }
        cmrr.lastCompletionTime = cmr.lastCompletionTime;
        if (null != csvDecoder) {
            StringBuffer header = new StringBuffer();
            for (String field : csvDecoder.getOrderedFields()) {
                if (0 != header.length()) {
                    header.append(',');
                }
                header.append('"');
                header.append(field.replace("\"", "\\\""));
                header.append("\"");
            }
            header.append('\n');
            header.append(csvDecoder.getCsv().toString());
            cmrr.results = header.toString();
        }
        rp.setData(cmrr);
    } //TESTED
    else { // Case 2: HDFS

        if ((null != cmr.outputKey) && (null != cmr.outputValue)
                && cmr.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text")
                && cmr.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) {
            // special case, text file
            try {
                rp.setData(HadoopUtils.getBsonFromTextFiles(cmr, limit, fields),
                        (BasePojoApiMap<BasicDBList>) null);
            } catch (Exception e) {
                rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", false,
                        "Files don't appear to be in text file format, did you run the job before changing the output to Text/Text?"));
            }
        } //TESTED
        else { // sequence file
            try {
                rp.setData(HadoopUtils.getBsonFromSequenceFile(cmr, limit, fields),
                        (BasePojoApiMap<BasicDBList>) null);
            } catch (Exception e) {
                rp.setResponse(new ResponseObject("Custom Map Reduce Job Results", false,
                        "Files don't appear to be in sequence file format, did you run the job with Text/Text?"));
            }
        } //TESTED
    } //TESTED      
}