map:
def map():  # note: this shadows Python's built-in map()
    sc = SparkContext("spark://node0:7077", "map")
    nums = [1, 2, 3, 4, 5]  # avoid naming this `list`, which shadows the built-in
    listRdd = sc.parallelize(nums)
    listmap = listRdd.map(lambda s: s * 2)  # double every element
    print(listmap.collect())  # [2, 4, 6, 8, 10]
    sc.stop()
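map() applies a function to every element and returns a new RDD of the same length; element order is preserved, which is why collect() returns [2, 4, 6, 8, 10]. A lambda is not required, since any named function works too. A minimal sketch, reusing sc and listRdd from the function above (before sc.stop() is called); the triple() helper is hypothetical, added here for illustration:

def triple(x):
    # named functions can be passed to map() just like lambdas
    return x * 3

print(listRdd.map(triple).collect())  # [3, 6, 9, 12, 15]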
filter:
def filter():  # note: this shadows Python's built-in filter()
    sc = SparkContext("spark://node0:7077", "filter")
    nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # avoid naming this `list`
    listRdd = sc.parallelize(nums)
    result = listRdd.filter(lambda x: x % 2 == 0)  # keep only the even numbers
    print(result.collect())  # [2, 4, 6, 8, 10]
    sc.stop()
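filter() keeps the elements for which the predicate returns True and drops the rest, so flipping the predicate selects the odd numbers instead. A quick sketch, reusing sc and listRdd from the function above (before sc.stop() is called):

odds = listRdd.filter(lambda x: x % 2 != 0)
print(odds.collect())  # [1, 3, 5, 7, 9]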
flatMap:
def flatMap():
    sc = SparkContext("spark://node0:7077", "flatMap")
    lines = ["hello you", "hello me", "hello world"]
    lineRdd = sc.parallelize(lines)
    result = lineRdd.flatMap(lambda s: s.split(" "))  # split each line, then flatten
    print(result.collect())  # ['hello', 'you', 'hello', 'me', 'hello', 'world']
    sc.stop()
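The flattening step is what separates flatMap() from map(): the same split under map() returns one list per line, while flatMap() merges everything into a single RDD of words. A quick comparison, reusing sc and lineRdd from the function above (before sc.stop() is called):

nested = lineRdd.map(lambda s: s.split(" "))
print(nested.collect())  # [['hello', 'you'], ['hello', 'me'], ['hello', 'world']]
flat = lineRdd.flatMap(lambda s: s.split(" "))
print(flat.collect())    # ['hello', 'you', 'hello', 'me', 'hello', 'world']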
groupByKey:
def groupbyKey():
    sc = SparkContext("spark://node0:7077", "groupbyKey")
    listtest = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 65)]
    listRdd = sc.parallelize(listtest)
    result = listRdd.groupByKey()  # yields (key, ResultIterable) pairs
    print(result.map(lambda x: (x[0], list(x[1]))).collect())
    sc.stop()
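groupByKey() yields (key, ResultIterable) pairs, which is why the example converts each iterable with list() before printing. mapValues() expresses the same conversion a little more directly; a sketch, reusing sc and listRdd from the function above (before sc.stop() is called):

grouped = listRdd.groupByKey().mapValues(list)
print(grouped.collect())  # e.g. [('class1', [80, 90]), ('class2', [75, 65])], order may vary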
reduceByKey:
def reduceByKey():
    sc = SparkContext("spark://node0:7077", "reduceByKey")
    listtest = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 65)]
    listRdd = sc.parallelize(listtest)
    result = listRdd.reduceByKey(lambda x, y: x + y)  # sum the scores per class
    print(result.collect())  # e.g. [('class1', 170), ('class2', 140)]
    sc.stop()
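For a plain sum, operator.add can stand in for the lambda. Unlike groupByKey(), reduceByKey() combines values within each partition before shuffling, so it moves less data for aggregations like this one. A sketch, reusing sc and listRdd from the function above (before sc.stop() is called):

from operator import add
totals = listRdd.reduceByKey(add)
print(totals.collect())  # e.g. [('class1', 170), ('class2', 140)]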
sortByKey:
def sortByKey():
    sc = SparkContext("spark://node0:7077", "sortByKey")
    listtest = [(65, "leo"), (50, "tom"), (100, "marry"), (80, "jack")]
    listRdd = sc.parallelize(listtest)
    result = listRdd.sortByKey()  # ascending by key (the score)
    print(result.collect())  # [(50, 'tom'), (65, 'leo'), (80, 'jack'), (100, 'marry')]
    sc.stop()
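sortByKey() sorts ascending by default; passing ascending=False reverses the order. Reusing sc and listRdd from the function above (before sc.stop() is called):

descending = listRdd.sortByKey(ascending=False)
print(descending.collect())  # [(100, 'marry'), (80, 'jack'), (65, 'leo'), (50, 'tom')]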
join:
def join():
    sc = SparkContext("spark://node0:7077", "join")
    studentlist = [(1, "leo"), (2, "jack"), (3, "tom")]
    scorelist = [(1, 100), (2, 90), (3, 60)]
    students = sc.parallelize(studentlist)
    scores = sc.parallelize(scorelist)
    result = students.join(scores)  # inner join on the student id
    print(result.collect())  # e.g. [(1, ('leo', 100)), (2, ('jack', 90)), (3, ('tom', 60))]
    sc.stop()
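join() is an inner join, so a student id with no matching score would silently disappear. leftOuterJoin() keeps every student and fills the missing side with None. A sketch reusing sc, students, and scores from the function above (before sc.stop() is called); the extra student is hypothetical, added for illustration:

extra = students.union(sc.parallelize([(4, "lucy")]))
left = extra.leftOuterJoin(scores)
print(left.collect())  # includes (4, ('lucy', None)); order may vary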
cogroup:
def cogroup():
    sc = SparkContext("spark://node0:7077", "cogroup")
    studentlist = [(1, "leo"), (2, "jack"), (3, "tom")]
    scorelist = [(1, 100), (2, 90), (3, 60), (1, 70), (2, 80), (3, 50)]
    students = sc.parallelize(studentlist)
    scores = sc.parallelize(scorelist)
    result = students.cogroup(scores)  # (key, (names iterable, scores iterable))
    results = result.map(lambda x: (x[0], list(x[1][0]), list(x[1][1])))
    print(results.collect())  # e.g. [(1, ['leo'], [100, 70]), ...], order may vary
    sc.stop()
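These transformations compose naturally. As a closing sketch, the classic word count chains flatMap, map, and reduceByKey (assuming a fresh SparkContext sc on the same master):

lines = sc.parallelize(["hello you", "hello me", "hello world"])
counts = (lines.flatMap(lambda s: s.split(" "))   # one record per word
               .map(lambda w: (w, 1))             # pair each word with a count of 1
               .reduceByKey(lambda x, y: x + y))  # sum the counts per word
print(counts.collect())  # e.g. [('hello', 3), ('you', 1), ('me', 1), ('world', 1)]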
Full file:
import os
import sys

os.environ['SPARK_HOME'] = '/opt/spark'
sys.path.append("/opt/spark/python")

from pyspark import SparkContext
from pyspark import SparkConf


def map():  # note: this shadows Python's built-in map()
    sc = SparkContext("spark://node0:7077", "map")
    nums = [1, 2, 3, 4, 5]  # avoid naming this `list`, which shadows the built-in
    listRdd = sc.parallelize(nums)
    listmap = listRdd.map(lambda s: s * 2)  # double every element
    print(listmap.collect())  # [2, 4, 6, 8, 10]
    sc.stop()


def filter():  # note: this shadows Python's built-in filter()
    sc = SparkContext("spark://node0:7077", "filter")
    nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # avoid naming this `list`
    listRdd = sc.parallelize(nums)
    result = listRdd.filter(lambda x: x % 2 == 0)  # keep only the even numbers
    print(result.collect())  # [2, 4, 6, 8, 10]
    sc.stop()


def flatMap():
    sc = SparkContext("spark://node0:7077", "flatMap")
    lines = ["hello you", "hello me", "hello world"]
    lineRdd = sc.parallelize(lines)
    result = lineRdd.flatMap(lambda s: s.split(" "))  # split each line, then flatten
    print(result.collect())  # ['hello', 'you', 'hello', 'me', 'hello', 'world']
    sc.stop()


def groupbyKey():
    sc = SparkContext("spark://node0:7077", "groupbyKey")
    listtest = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 65)]
    listRdd = sc.parallelize(listtest)
    result = listRdd.groupByKey()  # yields (key, ResultIterable) pairs
    print(result.map(lambda x: (x[0], list(x[1]))).collect())
    sc.stop()


def reduceByKey():
    sc = SparkContext("spark://node0:7077", "reduceByKey")
    listtest = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 65)]
    listRdd = sc.parallelize(listtest)
    result = listRdd.reduceByKey(lambda x, y: x + y)  # sum the scores per class
    print(result.collect())  # e.g. [('class1', 170), ('class2', 140)]
    sc.stop()


def sortByKey():
    sc = SparkContext("spark://node0:7077", "sortByKey")
    listtest = [(65, "leo"), (50, "tom"), (100, "marry"), (80, "jack")]
    listRdd = sc.parallelize(listtest)
    result = listRdd.sortByKey()  # ascending by key (the score)
    print(result.collect())  # [(50, 'tom'), (65, 'leo'), (80, 'jack'), (100, 'marry')]
    sc.stop()


def join():
    sc = SparkContext("spark://node0:7077", "join")
    studentlist = [(1, "leo"), (2, "jack"), (3, "tom")]
    scorelist = [(1, 100), (2, 90), (3, 60)]
    students = sc.parallelize(studentlist)
    scores = sc.parallelize(scorelist)
    result = students.join(scores)  # inner join on the student id
    print(result.collect())  # e.g. [(1, ('leo', 100)), (2, ('jack', 90)), (3, ('tom', 60))]
    sc.stop()


def cogroup():
    sc = SparkContext("spark://node0:7077", "cogroup")
    studentlist = [(1, "leo"), (2, "jack"), (3, "tom")]
    scorelist = [(1, 100), (2, 90), (3, 60), (1, 70), (2, 80), (3, 50)]
    students = sc.parallelize(studentlist)
    scores = sc.parallelize(scorelist)
    result = students.cogroup(scores)  # (key, (names iterable, scores iterable))
    results = result.map(lambda x: (x[0], list(x[1][0]), list(x[1][1])))
    print(results.collect())  # e.g. [(1, ['leo'], [100, 70]), ...], order may vary
    sc.stop()


if __name__ == '__main__':
    # map()
    # filter()
    # flatMap()
    # groupbyKey()
    # reduceByKey()
    # sortByKey()
    # join()
    cogroup()
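The file imports SparkConf but never uses it. Building the context from a SparkConf is the more common pattern and keeps the master URL and app name in one place; a minimal sketch using the same master as above (the app name "transformations" is made up for illustration):

conf = SparkConf().setMaster("spark://node0:7077").setAppName("transformations")
sc = SparkContext(conf=conf)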