Spark 2.2.0: how to use each transformation operator, Python version

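Each snippet below assumes the import "from pyspark import SparkContext" (shown in the full file at the end of the article) and a standalone master reachable at spark://node0:7077. The sample data is hard-coded, so the expected result is noted after each example.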
map:
def map():
    # Double every element of the RDD.
    sc = SparkContext("spark://node0:7077", "map")
    nums = [1, 2, 3, 4, 5]
    listRdd = sc.parallelize(nums)
    listmap = listRdd.map(lambda s: s * 2)
    print(listmap.collect())
    sc.stop()
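For this input, collect() returns [2, 4, 6, 8, 10].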

filter:

def filter():
    # Keep only the even numbers.
    sc = SparkContext("spark://node0:7077", "filter")
    nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    listRdd = sc.parallelize(nums)
    result = listRdd.filter(lambda x: x % 2 == 0)
    print(result.collect())
    sc.stop()
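Here collect() returns the even numbers [2, 4, 6, 8, 10].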

flatMap:

def flatMap():
    # Split each line into words and flatten the results into one RDD.
    sc = SparkContext("spark://node0:7077", "flatMap")
    line = ["hello you", "hello me", "hello world"]
    lineRdd = sc.parallelize(line)
    result = lineRdd.flatMap(lambda s: s.split(" "))
    print(result.collect())
    sc.stop()
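collect() returns the flattened word list ['hello', 'you', 'hello', 'me', 'hello', 'world'].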
groupByKey:

def groupbyKey():
    # Group the scores of each class under its key.
    sc = SparkContext("spark://node0:7077", "groupbyKey")
    listtest = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 65)]
    listRdd = sc.parallelize(listtest)
    result = listRdd.groupByKey()
    # groupByKey returns an iterable per key; convert it to a list before printing.
    print(result.map(lambda x: (x[0], list(x[1]))).collect())
    sc.stop()
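The grouped values are [80, 90] for class1 and [75, 65] for class2; the order of the keys, and in general of the values within a group, is not guaranteed.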
reduceByKey:

def reduceByKey():
    # Sum the scores of each class.
    sc = SparkContext("spark://node0:7077", "reduceByKey")
    listtest = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 65)]
    listRdd = sc.parallelize(listtest)
    result = listRdd.reduceByKey(lambda x, y: x + y)
    print(result.collect())
    sc.stop()
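The sums are ('class1', 170) and ('class2', 140); key order may vary.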
sortByKey:

def sortByKey():
    # Sort the (score, name) pairs by score in ascending order.
    sc = SparkContext("spark://node0:7077", "sortByKey")
    listtest = [(65, "ieo"), (50, "tom"), (100, "marry"), (80, "jack")]
    listRdd = sc.parallelize(listtest)
    result = listRdd.sortByKey()
    print(result.collect())
    sc.stop()
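collect() returns the pairs sorted by score in ascending order: [(50, 'tom'), (65, 'ieo'), (80, 'jack'), (100, 'marry')].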
join:

def join():
    # Join students and scores on the student id.
    sc = SparkContext("spark://node0:7077", "join")
    studentlist = [(1, "leo"), (2, "jack"), (3, "tom")]
    scorelist = [(1, 100), (2, 90), (3, 60)]
    students = sc.parallelize(studentlist)
    scores = sc.parallelize(scorelist)
    result = students.join(scores)
    print(result.collect())
    sc.stop()
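Each student id is paired with its name and score: (1, ('leo', 100)), (2, ('jack', 90)), (3, ('tom', 60)); key order may vary.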
cogroup:

def cogroup():
    # Group the values from both RDDs by key; each key maps to two iterables.
    sc = SparkContext("spark://node0:7077", "cogroup")
    studentlist = [(1, "leo"), (2, "jack"), (3, "tom")]
    scorelist = [(1, 100), (2, 90), (3, 60), (1, 70), (2, 80), (3, 50)]
    students = sc.parallelize(studentlist)
    scores = sc.parallelize(scorelist)
    result = students.cogroup(scores)
    results = result.map(lambda x: (x[0], list(x[1][0]), list(x[1][1])))
    print(results.collect())
    sc.stop()
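For every id, cogroup collects all names and all scores, e.g. (1, ['leo'], [100, 70]), (2, ['jack'], [90, 80]), (3, ['tom'], [60, 50]); key order, and the order inside each value list, may vary.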
Full file:

import os
import sys

os.environ['SPARK_HOME'] = '/opt/spark'
sys.path.append("/opt/spark/python")

from pyspark import SparkContext
from pyspark import SparkConf


def map():
    sc = SparkContext("spark://node0:7077", "map")
    nums = [1, 2, 3, 4, 5]
    listRdd = sc.parallelize(nums)
    listmap = listRdd.map(lambda s: s * 2)
    print(listmap.collect())
    sc.stop()


def filter():
    sc = SparkContext("spark://node0:7077", "filter")
    nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    listRdd = sc.parallelize(nums)
    result = listRdd.filter(lambda x: x % 2 == 0)
    print(result.collect())
    sc.stop()


def flatMap():
    sc = SparkContext("spark://node0:7077", "flatMap")
    line = ["hello you", "hello me", "hello world"]
    lineRdd = sc.parallelize(line)
    result = lineRdd.flatMap(lambda s: s.split(" "))
    print(result.collect())
    sc.stop()


def groupbyKey():
    sc = SparkContext("spark://node0:7077", "groupbyKey")
    listtest = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 65)]
    listRdd = sc.parallelize(listtest)
    result = listRdd.groupByKey()
    print(result.map(lambda x: (x[0], list(x[1]))).collect())
    sc.stop()


def reduceByKey():
    sc = SparkContext("spark://node0:7077", "reduceByKey")
    listtest = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 65)]
    listRdd = sc.parallelize(listtest)
    result = listRdd.reduceByKey(lambda x, y: x + y)
    print(result.collect())
    sc.stop()


def sortByKey():
    sc = SparkContext("spark://node0:7077", "sortByKey")
    listtest = [(65, "ieo"), (50, "tom"), (100, "marry"), (80, "jack")]
    listRdd = sc.parallelize(listtest)
    result = listRdd.sortByKey()
    print(result.collect())
    sc.stop()


def join():
    sc = SparkContext("spark://node0:7077", "join")
    studentlist = [(1, "leo"), (2, "jack"), (3, "tom")]
    scorelist = [(1, 100), (2, 90), (3, 60)]
    students = sc.parallelize(studentlist)
    scores = sc.parallelize(scorelist)
    result = students.join(scores)
    print(result.collect())
    sc.stop()


def cogroup():
    sc = SparkContext("spark://node0:7077", "cogroup")
    studentlist = [(1, "leo"), (2, "jack"), (3, "tom")]
    scorelist = [(1, 100), (2, 90), (3, 60), (1, 70), (2, 80), (3, 50)]
    students = sc.parallelize(studentlist)
    scores = sc.parallelize(scorelist)
    result = students.cogroup(scores)
    results = result.map(lambda x: (x[0], list(x[1][0]), list(x[1][1])))
    print(results.collect())
    sc.stop()


if __name__ == '__main__':
    # map()
    # filter()
    # flatMap()
    # groupbyKey()
    # reduceByKey()
    # sortByKey()
    # join()
    cogroup()
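The full file imports SparkConf but never uses it. As a minimal sketch (not part of the original script), the same context could also be built through a SparkConf object instead of passing the master URL and app name positionally:

from pyspark import SparkConf, SparkContext

# Build the context from an explicit SparkConf; the master URL is the one used
# throughout this article, the app name here is arbitrary.
conf = SparkConf().setMaster("spark://node0:7077").setAppName("operators")
sc = SparkContext(conf=conf)

Either way, the script is driven by the __main__ block at the bottom: uncomment the function you want to try and run the saved file, for example with /opt/spark/bin/spark-submit.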











