SparkContext Source Reading

SparkContext is the entry point to Spark: it connects the application to the cluster, creates RDDs, broadcasts variables, and more.
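Before walking through the source, here is a minimal sketch of how a driver program typically exercises those responsibilities. The object name, master URL, and app name are illustrative placeholders, not taken from the Spark code.

import org.apache.spark.{SparkConf, SparkContext}

object SparkContextDemo {
  def main(args: Array[String]): Unit = {
    // master URL and app name are placeholders for this sketch
    val conf = new SparkConf().setAppName("spark-context-demo").setMaster("local[2]")
    val sc = new SparkContext(conf)        // connects to the (here, local) cluster

    val rdd = sc.parallelize(1 to 100, 4)  // creates an RDD from a driver-side collection
    println(rdd.reduce(_ + _))             // 5050

    sc.stop()                              // releases the cluster resources
  }
}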
The class declaration, construction guards, auxiliary constructors, configuration accessors, and scheduler initialization (excerpted):

class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationClient {

  private val creationSite: CallSite = Utils.getCallSite()

  // If a second SparkContext is created in the same JVM, log a warning instead of
  // throwing an exception, so the driver does not exit.
  private val allowMultipleContexts: Boolean =
    config.getBoolean("spark.driver.allowMultipleContexts", false)

  // Prevent two SparkContexts from being constructed at the same time.
  SparkContext.markPartiallyConstructed(this, allowMultipleContexts)

  private[spark] var preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map()

  val startTime = System.currentTimeMillis()

  // The no-arg constructor reads settings from system properties, which spark-submit
  // loads when the job is submitted.
  def this() = this(new SparkConf())

  def this(master: String, appName: String, conf: SparkConf) =
    this(SparkContext.updatedConf(conf, master, appName))

  // preferredNodeLocationData was used to look up nodes and launch the corresponding containers.
  def this(
      master: String,
      appName: String,
      sparkHome: String = null,
      jars: Seq[String] = Nil,
      environment: Map[String, String] = Map(),
      preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map()) = {
    this(SparkContext.updatedConf(new SparkConf(), master, appName, sparkHome, jars, environment))
    if (preferredNodeLocationData.nonEmpty) {
      logWarning("Passing in preferred locations has no effect at all, see SPARK-8949")
    }
    this.preferredNodeLocationData = preferredNodeLocationData
  }

  // Auxiliary constructors
  private[spark] def this(master: String, appName: String) =
    this(master, appName, null, Nil, Map(), Map())

  private[spark] def this(master: String, appName: String, sparkHome: String) =
    this(master, appName, sparkHome, Nil, Map(), Map())

  private[spark] def this(master: String, appName: String, sparkHome: String, jars: Seq[String]) =
    this(master, appName, sparkHome, jars, Map(), Map())

  private[spark] def conf: SparkConf = _conf

  // Return a clone of the conf, so it cannot be modified at runtime.
  def getConf: SparkConf = conf.clone()

  def jars: Seq[String] = _jars
  def files: Seq[String] = _files
  def master: String = _conf.get("spark.master")
  def appName: String = _conf.get("spark.app.name")

  private[spark] def isEventLogEnabled: Boolean = _conf.getBoolean("spark.eventLog.enabled", false)
  private[spark] def eventLogDir: Option[URI] = _eventLogDir
  private[spark] def eventLogCodec: Option[String] = _eventLogCodec

  // Create the scheduler
  val (sched, ts) = SparkContext.createTaskScheduler(this, master)
  _schedulerBackend = sched
  _taskScheduler = ts
  _dagScheduler = new DAGScheduler(this)
  _heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet)

  // Start the TaskScheduler
  _taskScheduler.start()
  _applicationId = _taskScheduler.applicationId()
  _applicationAttemptId = taskScheduler.applicationAttemptId()
  _conf.set("spark.app.id", _applicationId)
  _env.blockManager.initialize(_applicationId)
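To illustrate the accessors above, here is a small sketch; the master URL and app name are placeholders, and the point is only that getConf hands back a clone, so mutating it does not affect the running context.

import org.apache.spark.{SparkConf, SparkContext}

object ConfAccessorsDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("conf-demo")

    val sc = new SparkContext(conf)

    println(sc.master)   // "local[*]", read back from spark.master
    println(sc.appName)  // "conf-demo", read back from spark.app.name

    // getConf returns a clone, so changing it has no effect on the running context
    val copy = sc.getConf
    copy.set("spark.app.name", "renamed")
    println(sc.appName)  // still "conf-demo"

    sc.stop()
  }
}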
Next, the RDD-creation and broadcast methods (excerpted):

  // Create a new RDD of Longs from start to end (exclusive), increased by step
  def range(
      start: Long,
      end: Long,
      step: Long = 1,
      numSlices: Int = defaultParallelism): RDD[Long] = withScope {
    assertNotStopped()
    // when step is 0, range would run infinitely
    require(step != 0, "step cannot be 0")
    val numElements: BigInt = {
      val safeStart = BigInt(start)
      val safeEnd = BigInt(end)
      if ((safeEnd - safeStart) % step == 0 || (safeEnd > safeStart) ^ (step > 0)) {
        (safeEnd - safeStart) / step
      } else {
        (safeEnd - safeStart) / step + 1
      }
    }
    parallelize(0 until numSlices, numSlices).mapPartitionsWithIndex((i, _) => {
      val partitionStart = (i * numElements) / numSlices * step + start
      val partitionEnd = (((i + 1) * numElements) / numSlices) * step + start
      def getSafeMargin(bi: BigInt): Long =
        if (bi.isValidLong) {
          bi.toLong
        } else if (bi > 0) {
          Long.MaxValue
        } else {
          Long.MinValue
        }
      val safePartitionStart = getSafeMargin(partitionStart)
      val safePartitionEnd = getSafeMargin(partitionEnd)

      new Iterator[Long] {
        private[this] var number: Long = safePartitionStart
        private[this] var overflow: Boolean = false

        override def hasNext =
          if (!overflow) {
            if (step > 0) {
              number < safePartitionEnd
            } else {
              number > safePartitionEnd
            }
          } else false

        override def next() = {
          val ret = number
          number += step
          if (number < ret ^ step < 0) {
            // a step in the wrong direction means the Long counter overflowed
            overflow = true
          }
          ret
        }
      }
    })
  }

  // Create an RDD from a local Scala collection
  def makeRDD[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = withScope {
    parallelize(seq, numSlices)
  }

  // Read a text file from the local file system or HDFS and return an RDD of Strings
  def textFile(
      path: String,
      minPartitions: Int = defaultMinPartitions): RDD[String] = withScope {
    assertNotStopped()
    hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
      minPartitions).map(pair => pair._2.toString)
  }

  // Load a binary file of fixed-length records
  @Experimental
  def binaryRecords(
      path: String,
      recordLength: Int,
      conf: Configuration = hadoopConfiguration): RDD[Array[Byte]] = withScope {
    assertNotStopped()
    conf.setInt(FixedLengthBinaryInputFormat.RECORD_LENGTH_PROPERTY, recordLength)
    val br = newAPIHadoopFile[LongWritable, BytesWritable, FixedLengthBinaryInputFormat](path,
      classOf[FixedLengthBinaryInputFormat],
      classOf[LongWritable],
      classOf[BytesWritable],
      conf = conf)
    val data = br.map { case (k, v) =>
      val bytes = v.getBytes
      assert(bytes.length == recordLength, "Byte array does not have correct length")
      bytes
    }
    data
  }

  // Get an RDD of key-value pairs for the given Hadoop SequenceFile
  def sequenceFile[K, V](
      path: String,
      keyClass: Class[K],
      valueClass: Class[V],
      minPartitions: Int): RDD[(K, V)] = withScope {
    assertNotStopped()
    val inputFormatClass = classOf[SequenceFileInputFormat[K, V]]
    hadoopFile(path, inputFormatClass, keyClass, valueClass, minPartitions)
  }

  // (around line 1300) Broadcast a read-only variable to every node of the cluster
  def broadcast[T: ClassTag](value: T): Broadcast[T] = {
    assertNotStopped()
    if (classOf[RDD[_]].isAssignableFrom(classTag[T].runtimeClass)) {
      logWarning("Can not directly broadcast RDDs; instead, call collect() and "
        + "broadcast the result (see SPARK-5063)")
    }
    val bc = env.broadcastManager.newBroadcast[T](value, isLocal)
    val callSite = getCallSite
    logInfo("Created broadcast " + bc.id + " from " + callSite.shortForm)
    cleaner.foreach(_.registerBroadcastForCleanup(bc))
    bc
  }
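Finally, a usage sketch of the methods above from the application side. The master URL, app name, and the commented-out HDFS path are hypothetical placeholders; only range, makeRDD, textFile, and broadcast are taken from the API discussed here.

import org.apache.spark.{SparkConf, SparkContext}

object RddCreationDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("rdd-creation-demo"))

    // range: 0, 5, 10, ..., 95 spread over 4 partitions
    val longs = sc.range(0L, 100L, step = 5, numSlices = 4)
    println(longs.count())  // 20

    // textFile: the path below is a placeholder, not taken from the source
    // val lines = sc.textFile("hdfs:///data/input.txt", minPartitions = 8)

    // broadcast a small lookup table instead of capturing it in every task closure
    val countryNames = sc.broadcast(Map("cn" -> "China", "us" -> "United States"))
    val resolved = sc.makeRDD(Seq("cn", "us", "cn"))
      .map(code => countryNames.value.getOrElse(code, "unknown"))
    println(resolved.collect().mkString(", "))

    sc.stop()
  }
}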