Map:
def map():
    """Demonstrate RDD.map: double every element of a small list.

    NOTE(review): this module-level name shadows the builtin ``map``;
    kept as-is to preserve the tutorial's interface.
    """
    sc = SparkContext("spark://node0:7077", "map")
    numbers = [1, 2, 3, 4, 5]  # avoid shadowing the builtin `list`
    numbers_rdd = sc.parallelize(numbers)
    doubled = numbers_rdd.map(lambda s: s * 2)
    # collect() pulls the transformed elements back to the driver.
    print(doubled.collect())
    sc.stop()
Filter:
def filter():
    """Demonstrate RDD.filter: keep only the even numbers.

    NOTE(review): this module-level name shadows the builtin ``filter``;
    kept as-is to preserve the tutorial's interface.
    """
    sc = SparkContext("spark://node0:7077", "filter")
    numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    numbers_rdd = sc.parallelize(numbers)
    # Bug fix: the source had `x % 2 = 0` (assignment) — must be `==`.
    result = numbers_rdd.filter(lambda x: x % 2 == 0)
    print(result.collect())
    sc.stop()
Flatmap:
def flatmap():
    """Demonstrate RDD.flatMap: split each line into words and flatten."""
    sc = SparkContext("spark://node0:7077", "flatmap")
    lines = ["Hello you", "Hello me", "Hello world"]
    lines_rdd = sc.parallelize(lines)
    # Bug fix: the PySpark method is flatMap (camelCase), and the split
    # delimiter was garbled in the source — a single space splits words.
    result = lines_rdd.flatMap(lambda s: s.split(" "))
    print(result.collect())
    sc.stop()
Groupbykey:
def groupbykey():
    """Demonstrate RDD.groupByKey: group class scores by class name."""
    sc = SparkContext("spark://node0:7077", "groupbykey")
    # NOTE(review): the numeric values were lost in the scraped source;
    # these are representative scores — confirm against the original.
    score_list = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 60)]
    score_rdd = sc.parallelize(score_list)
    # Bug fix: the PySpark method is groupByKey (camelCase).
    grouped = score_rdd.groupByKey()
    # groupByKey yields (key, ResultIterable); materialize for printing.
    print(grouped.map(lambda x: (x[0], list(x[1]))).collect())
    # Bug fix: `sc.stop` without parentheses never actually stopped the context.
    sc.stop()
Reducebykey:
def reducebykey():
    """Demonstrate RDD.reduceByKey: sum the scores of each class."""
    sc = SparkContext("spark://node0:7077", "reducebykey")
    # NOTE(review): the numeric values were lost in the scraped source;
    # these are representative scores — confirm against the original.
    score_list = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 60)]
    score_rdd = sc.parallelize(score_list)
    # Bug fix: the PySpark method is reduceByKey (camelCase).
    result = score_rdd.reduceByKey(lambda x, y: x + y)
    print(result.collect())
    # Bug fix: `sc.stop` without parentheses never actually stopped the context.
    sc.stop()
Sortbykey:
def sortbykey():
    """Demonstrate RDD.sortByKey: sort (score, name) pairs by score."""
    sc = SparkContext("spark://node0:7077", "sortbykey")
    # NOTE(review): the numeric keys were destroyed in the scraped source
    # (rendered as `+`); these are representative — confirm against the original.
    score_list = [(65, "Leo"), (50, "Tom"), (100, "Marry"), (85, "Jack")]
    score_rdd = sc.parallelize(score_list)
    # Bug fix: the PySpark method is sortByKey (camelCase).
    result = score_rdd.sortByKey()
    print(result.collect())
    # Bug fix: `sc.stop` without parentheses never actually stopped the context.
    sc.stop()
Join:
def join():
    """Demonstrate RDD.join: pair each student id with (name, score)."""
    sc = SparkContext("spark://node0:7077", "join")
    student_list = [(1, "Leo"), (2, "Jack"), (3, "Tom")]
    # Bug fix: the source's `[(1), (2), (3)]` are bare ints, not key/value
    # pairs, so join would fail. NOTE(review): score values were lost in
    # the scraped source; these are representative — confirm.
    score_list = [(1, 100), (2, 90), (3, 60)]
    students = sc.parallelize(student_list)
    scores = sc.parallelize(score_list)
    # join produces (id, (name, score)) for ids present in both RDDs.
    result = students.join(scores)
    print(result.collect())
    # Bug fix: `sc.stop` without parentheses never actually stopped the context.
    sc.stop()
Cogroup:
def cogroup():
    """Demonstrate RDD.cogroup: group all names and all scores per student id."""
    sc = SparkContext("spark://node0:7077", "cogroup")
    student_list = [(1, "Leo"), (2, "Jack"), (3, "Tom")]
    # Bug fix: the source's `[(1), (2), ...]` are bare ints, not key/value
    # pairs, so cogroup would fail. NOTE(review): score values were lost in
    # the scraped source; these are representative — confirm.
    score_list = [(1, 70), (2, 80), (3, 50), (1, 100), (2, 90), (3, 60)]
    students = sc.parallelize(student_list)
    scores = sc.parallelize(score_list)
    # cogroup yields (id, (iterable_of_names, iterable_of_scores)).
    result = students.cogroup(scores)
    results = result.map(lambda x: (x[0], list(x[1][0]), list(x[1][1])))
    print(results.collect())
    # Bug fix: `sc.stop` without parentheses never actually stopped the context.
    sc.stop()
Full file:
import os
import sys

# Point at the local Spark installation before importing pyspark.
os.environ['SPARK_HOME'] = '/opt/spark'
sys.path.append("/opt/spark/python")

from pyspark import SparkContext
from pyspark import SparkConf


def map():
    """Demonstrate RDD.map: double every element of a small list.

    NOTE(review): shadows the builtin ``map``; kept to preserve the
    tutorial's interface.
    """
    sc = SparkContext("spark://node0:7077", "map")
    numbers = [1, 2, 3, 4, 5]
    doubled = sc.parallelize(numbers).map(lambda s: s * 2)
    print(doubled.collect())
    sc.stop()


def filter():
    """Demonstrate RDD.filter: keep only the even numbers.

    NOTE(review): shadows the builtin ``filter``; kept to preserve the
    tutorial's interface.
    """
    sc = SparkContext("spark://node0:7077", "filter")
    numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    # Bug fix: the source had `x % 2 = 0` (assignment) — must be `==`.
    result = sc.parallelize(numbers).filter(lambda x: x % 2 == 0)
    print(result.collect())
    sc.stop()


def flatmap():
    """Demonstrate RDD.flatMap: split each line into words and flatten."""
    sc = SparkContext("spark://node0:7077", "flatmap")
    lines = ["Hello you", "Hello me", "Hello world"]
    # Bug fix: the PySpark method is flatMap (camelCase).
    result = sc.parallelize(lines).flatMap(lambda s: s.split(" "))
    print(result.collect())
    sc.stop()


def groupbykey():
    """Demonstrate RDD.groupByKey: group class scores by class name."""
    sc = SparkContext("spark://node0:7077", "groupbykey")
    # NOTE(review): numeric values were lost in the scraped source;
    # these are representative scores — confirm against the original.
    score_list = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 60)]
    grouped = sc.parallelize(score_list).groupByKey()
    # groupByKey yields (key, ResultIterable); materialize for printing.
    print(grouped.map(lambda x: (x[0], list(x[1]))).collect())
    sc.stop()


def reducebykey():
    """Demonstrate RDD.reduceByKey: sum the scores of each class."""
    sc = SparkContext("spark://node0:7077", "reducebykey")
    score_list = [("class1", 80), ("class2", 75), ("class1", 90), ("class2", 60)]
    result = sc.parallelize(score_list).reduceByKey(lambda x, y: x + y)
    print(result.collect())
    sc.stop()


def sortbykey():
    """Demonstrate RDD.sortByKey: sort (score, name) pairs by score."""
    sc = SparkContext("spark://node0:7077", "sortbykey")
    # NOTE(review): the numeric keys were destroyed in the scraped source;
    # these are representative — confirm against the original.
    score_list = [(65, "Leo"), (50, "Tom"), (100, "Marry"), (85, "Jack")]
    result = sc.parallelize(score_list).sortByKey()
    print(result.collect())
    sc.stop()


def join():
    """Demonstrate RDD.join: pair each student id with (name, score)."""
    sc = SparkContext("spark://node0:7077", "join")
    student_list = [(1, "Leo"), (2, "Jack"), (3, "Tom")]
    # Bug fix: scores must be (id, score) pairs for join to work.
    score_list = [(1, 100), (2, 90), (3, 60)]
    students = sc.parallelize(student_list)
    scores = sc.parallelize(score_list)
    result = students.join(scores)
    print(result.collect())
    sc.stop()


def cogroup():
    """Demonstrate RDD.cogroup: group all names and all scores per student id."""
    sc = SparkContext("spark://node0:7077", "cogroup")
    student_list = [(1, "Leo"), (2, "Jack"), (3, "Tom")]
    # Bug fix: scores must be (id, score) pairs for cogroup to work.
    score_list = [(1, 70), (2, 80), (3, 50), (1, 100), (2, 90), (3, 60)]
    students = sc.parallelize(student_list)
    scores = sc.parallelize(score_list)
    result = students.cogroup(scores)
    results = result.map(lambda x: (x[0], list(x[1][0]), list(x[1][1])))
    print(results.collect())
    sc.stop()


if __name__ == '__main__':
    # Uncomment exactly one demo to run it against the cluster.
    # map()
    # filter()
    # flatmap()
    # groupbykey()
    # reducebykey()
    # sortbykey()
    # join()
    cogroup()