mongodb mapreduce小試

來源:互聯網
上載者:User

最近由於產品業務的需求,需要對資料量相對較大的資料進行一些計算,順便試試mongodb的mapreduce功能,感覺還不錯

 

下面是官方提供的一個例子:

$ ./mongo
> db.things.insert( { _id : 1, tags : ['dog', 'cat'] } );
> db.things.insert( { _id : 2, tags : ['cat'] } );
> db.things.insert( { _id : 3, tags : ['mouse', 'cat', 'dog'] } );
> db.things.insert( { _id : 4, tags : []  } );
> // map function
> m = function(){
...    this.tags.forEach(
...        function(z){
...            emit( z , { count : 1 } );
...        }
...    );
...};
> // reduce function
> r = function( key , values ){
...    var total = 0;
...    for ( var i=0; i<values.length; i++ )
...        total += values[i].count;
...    return { count : total };
...};
> res = db.things.mapReduce(m,r);
> res
{"timeMillis.emit" : 9 , "result" : "mr.things.1254430454.3" , "numObjects" : 4 , "timeMillis" : 9 , "errmsg" : "" , "ok" : 0}
> db[res.result].find()
{"_id" : "cat" , "value" : {"count" : 3}}
{"_id" : "dog" , "value" : {"count" : 2}}
{"_id" : "mouse" , "value" : {"count" : 1}}
> db[res.result].drop()

mapreduce參數說明

db.runCommand({     mapreduce : <collection>,      map : <mapfunction>,        reduce : <reducefunction>      [, query : <query filter object>]        [, sort : <sort the query.  useful for optimization>]        [, limit : <number of objects to return from collection>]        [, out : <output-collection name>]        [, keeptemp: <true|false>]        [, finalize : <finalizefunction>]        [, scope : <object where fields go into javascript global scope >]        [, verbose : true]  });

    mapreduce:指定要進行mapreduce處理的collection
    map:map函數
    reduce:reduce函數
    query:一個篩選條件,只有滿足條件的行才會加入mapreduce集合,而這個篩選過程是先於整個mapreduce流程而執行的
    sort:和query結合的sort排序參數,這是唯一可以最佳化分組機制的地方
    limit:和sort一樣與query結合使用,限制從collection中取出並進入mapreduce處理的文檔數量
    out:結果輸出的collection的名字,不指定會預設建立一個隨機名字的collection
    keeptemp:true或false,表明是否保留結果輸出到的臨時collection,預設為false;如果為false,臨時collection會在用戶端串連中斷後自動刪除,如果你用的是MongoDB的mongo用戶端串連,那必須exit後才會刪除。如果是指令碼執行,指令碼退出或調用close會自動刪除結果collection
    finalize:和map,reduce一樣是一個函數,它可以在reduce得出一個結果後再對key和value進行一次計算並返回一個最終結果
    scope:設定參數值,在這裡設定的值在map,reduce,finalize函數中可見
    verbose:在執行過程中列印調試資訊。

返回格式:

{ result : <collection_name>,   counts : {input :  <number of objects scanned>, emit  : <number of times emit was called>, output : <number of items in output collection>} ,timeMillis : <job_time>,ok : <1_if_ok>,[, err : <errmsg_if_error>] }

 

 下面來一個略微複雜一點的例子,下面是統計房源列表頁房源的曝光量:

mongodb資料格式:

{ "_id" : ObjectId("50364d9fdec7d5ce4000198d"), "pn" : "Listing_V2_IndexPage_All", "guid" : "E200F425-30E7-0D97-9B3A-E047A08CE47C", "uguid" : "4455754C-B2A0-7EDA-6387-A50F0228DE7F", "url" : "http://shanghai.haozu.com/listing/pudong/?from=in_area", "referer" : "http://shanghai.haozu.com/", "site" : "haozu", "stamp" : "1345691212948", "cip" : "116.231.123.184", "sessid" : "B1197AA0-976C-F6EF-BB6F-9401D8E983DD", "cid" : "11", "cstamp" : "1345691178421", "cstparam" : "{\"found\":\"37695\",\"proids\":\"10290023|10353348|8448223|10310737|10311720|10250125|10320886|8507299|10332158|10341287|10266002|10322302|9185878|10273552|10272872|10282252|10270250|10336122|9350169|10196350|8533446|10250019|10335617|10222489\"}", "rfpn" : "Home_Index8Page", "agent" : "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; 360SE; 360SE)" }

房源id儲存在cstparam欄位裡面,是一個字串,因此需要先用正則篩選出包含proids的記錄,再切分字串取出房源id進行統計,因此對應的map,reduce的寫法為:

map方法:

/**
 * Map step: emits (propertyId, 1) once for every listing id in the
 * document's `cstparam` string, so the reduce step can sum exposures per id.
 *
 * Runs with `this` bound to the current document and relies on the
 * map-reduce runtime providing a global `emit(key, value)`.
 */
var m = function () {
    // Splitting on double quotes leaves the last quoted value as the
    // second-to-last piece. This relies on "proids" being the last field of
    // cstparam, as in the sample document.
    var pieces = this.cstparam.split("\"");
    var joinedIds = pieces[pieces.length - 2];
    var ids = joinedIds.split("|");
    // Indexed loop instead of for...in: for...in would also visit any
    // enumerable property added to Array.prototype and emit a bogus key.
    for (var idx = 0; idx < ids.length; idx++) {
        emit(ids[idx], 1);
    }
};

reduce方法:

/**
 * Reduce step: sums the numeric values emitted for one key.
 *
 * @param {string} key   The property id the values were emitted under (unused).
 * @param {number[]} emits  Values emitted by the map step, or partial sums
 *                          returned by earlier reduce calls for the same key.
 * @returns {number} The total of all values in `emits`.
 */
var reduce = function (key, emits) {
    var count = 0;
    // Indexed loop instead of for...in: for...in would also visit enumerable
    // properties inherited from Array.prototype and corrupt the sum.
    for (var idx = 0; idx < emits.length; idx++) {
        count += emits[idx];
    }
    return count;
};

 執行:

// Run the job with the map function `m` and the reduce function `reduce`
// defined above, writing the output to the `result_tmp` collection.
// The query keeps only documents whose cstparam matches /proids/. The earlier
// `$exists` clause used the same object key and was overwritten by the regex
// clause, so it is omitted here.
db.log_soj.mapReduce(m, reduce, {out: 'result_tmp', query: {'cstparam': /proids/}});

返回結果:

{    "result" : "result_tmp",    "timeMillis" : 18888,    "counts" : {        "input" : 15742,        "emit" : 333011,        "reduce" : 103137,        "output" : 150897    },    "ok" : 1,}

 結果集:

{ "_id" : "10000003", "value" : 1 }{ "_id" : "10000016", "value" : 2 }{ "_id" : "10000032", "value" : 1 }{ "_id" : "10000039", "value" : 1 }{ "_id" : "10000043", "value" : 1 }{ "_id" : "10000059", "value" : 1 }

 

再來一個,和上例類似,但是按照房源所出現的城市進行曝光量的統計

map函數:

/**
 * Map step for per-city exposure counts: for every listing id in the
 * document's `cstparam` string, emits a record keyed by "<propId>_<cid>".
 *
 * Bound to `m1` because that is the name the mapReduce call passes in; a bare
 * anonymous `function () {...}` statement is not valid JavaScript.
 * Runs with `this` bound to the current document and relies on the
 * map-reduce runtime providing a global `emit(key, value)`.
 */
var m1 = function () {
    // Splitting on double quotes leaves the last quoted value as the
    // second-to-last piece. This relies on "proids" being the last field of
    // cstparam, as in the sample document.
    var pieces = this.cstparam.split("\"");
    var joinedIds = pieces[pieces.length - 2];
    var ids = joinedIds.split("|");
    // Indexed loop instead of for...in, which would also visit enumerable
    // properties inherited from Array.prototype.
    for (var idx = 0; idx < ids.length; idx++) {
        // Documents without a cid produce a "<propId>_undefined" key.
        var key = ids[idx] + "_" + this.cid;
        emit(key, {prop_id: ids[idx], city_id: this.cid, count: 1});
    }
};

reduce函數:

/**
 * Reduce step for per-city exposure counts: sums the `count` of every value
 * emitted for one "<propId>_<cid>" key.
 *
 * Bound to `r1` because that is the name the mapReduce call passes in; a bare
 * anonymous `function (...) {...}` statement is not valid JavaScript.
 *
 * @param {string} key  The "<propId>_<cid>" key (unused).
 * @param {Array<{prop_id: string, city_id: *, count: number}>} emits
 *        Values emitted by the map step or returned by earlier reduce calls.
 * @returns {{prop_id: string, city_id: *, count: number}} A record of the same
 *        shape as the inputs: prop_id and city_id from the first value, count
 *        summed over all values.
 */
var r1 = function (key, emits) {
    var total = 0;
    // Indexed loop instead of for...in, which would also visit enumerable
    // properties inherited from Array.prototype.
    for (var idx = 0; idx < emits.length; idx++) {
        total += emits[idx].count;
    }
    // prop_id and city_id are copied from the first value only.
    return {prop_id: emits[0].prop_id, city_id: emits[0].city_id, count: total};
};

執行:

// Run the per-city job with map function `m1` and reduce function `r1`,
// writing the output to the `result_tmp` collection.
// The query keeps only documents whose cstparam matches /proids/. The earlier
// `$exists` clause used the same object key and was overwritten by the regex
// clause, so it is omitted here.
db.log_soj.mapReduce(m1, r1, {out: 'result_tmp', query: {'cstparam': /proids/}});

結果:

{ "_id" : "10000003_undefined", "value" : { "prop_id" : "10000003", "city_id" : null, "count" : 1 } }{ "_id" : "10000016_14", "value" : { "prop_id" : "10000016", "city_id" : "14", "count" : 2 } }{ "_id" : "10000032_15", "value" : { "prop_id" : "10000032", "city_id" : "15", "count" : 1 } }{ "_id" : "10000039_15", "value" : { "prop_id" : "10000039", "city_id" : "15", "count" : 1 } }{ "_id" : "10000043_11", "value" : { "prop_id" : "10000043", "city_id" : "11", "count" : 1 } }{ "_id" : "10000059_17", "value" : { "prop_id" : "10000059", "city_id" : "17", "count" : 1 } }{ "_id" : "10000068_11", "value" : { "prop_id" : "10000068", "city_id" : "11", "count" : 1 } }{ "_id" : "10000099_15", "value" : { "prop_id" : "10000099", "city_id" : "15", "count" : 1 } }{ "_id" : "10000100_18", "value" : { "prop_id" : "10000100", "city_id" : "18", "count" : 1 } }{ "_id" : "10000106_14", "value" : { "prop_id" : "10000106", "city_id" : "14", "count" : 1 } }{ "_id" : "10000109_18", "value" : { "prop_id" : "10000109", "city_id" : "18", "count" : 3 } }{ "_id" : "10000112_15", "value" : { "prop_id" : "10000112", "city_id" : "15", "count" : 1 } }{ "_id" : "10000118_15", "value" : { "prop_id" : "10000118", "city_id" : "15", "count" : 1 } }{ "_id" : "10000156_11", "value" : { "prop_id" : "10000156", "city_id" : "11", "count" : 1 } }{ "_id" : "10000224_14", "value" : { "prop_id" : "10000224", "city_id" : "14", "count" : 1 } }{ "_id" : "10000250_22", "value" : { "prop_id" : "10000250", "city_id" : "22", "count" : 1 } }{ "_id" : "10000262_25", "value" : { "prop_id" : "10000262", "city_id" : "25", "count" : 1 } }{ "_id" : "10000267_14", "value" : { "prop_id" : "10000267", "city_id" : "14", "count" : 3 } }{ "_id" : "10000305_14", "value" : { "prop_id" : "10000305", "city_id" : "14", "count" : 3 } }{ "_id" : "10000323_11", "value" : { "prop_id" : "10000323", "city_id" : "11", "count" : 1 } }

轉載請註明出處:

http://www.cnblogs.com/xiazh/archive/2012/09/05/2671730.html

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.