# -*- coding: utf-8 -*-
#!/bin/env python3
# filename: readfromkafkastreaminggetlocation.py
#
# Spark Streaming job: consume log payloads from Kafka, enrich each record's
# IP field with a location lookup (via the third-party `IP` library), and
# append the enriched records to a dated local file.
#
# NOTE(review): `IP` and `pyspark.streaming.kafka` must be importable on the
# driver and on every executor.
import datetime

import IP
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


class KafkaMessageParse:
    """Helpers to unpack Kafka (key, value) tuples and enrich messages."""

    def extractFromKafka(self, kafkainfo):
        """Return the value part of a Kafka (key, value) 2-tuple.

        Returns None (implicitly) for anything that is not a 2-tuple.
        """
        if type(kafkainfo) is tuple and len(kafkainfo) == 2:
            return kafkainfo[1]

    def lineFromLines(self, lines):
        """Split a non-empty payload string into individual lines."""
        if lines is not None and len(lines) > 0:
            return lines.strip().split("\n")

    def messageFromLine(self, line):
        """Return the 'message' value of a parsed dict, or None if absent."""
        if line is not None and "message" in line.keys():
            return line.get("message")

    def ip2location(self, ip):
        """Look up `ip` with IP.find() and return [ip, country, province, city].

        The lookup result is expected to be tab-separated; when it has fewer
        than two fields (or the lookup raises), the placeholder strings
        'country'/'province'/'city' are kept — enrichment is best-effort.
        """
        result = []
        country = 'country'
        province = 'province'
        city = 'city'
        ipinfo = IP.find(ip.strip())
        try:
            location = ipinfo.split("\t")
            if len(location) == 3:
                country = location[0]
                province = location[1]
                city = location[2]
            elif len(location) == 2:
                country = location[0]
                province = location[1]
            else:
                pass
        except Exception:
            # Deliberate best-effort: keep the placeholder values on failure.
            pass
        result.append(ip)
        result.append(country)
        result.append(province)
        result.append(city)
        return result

    def vlistfromkv(self, strori, sep1, sep2):
        """Split `strori` on `sep1` and return the value part (text after
        `sep2`) of every field.

        NOTE(review): raises IndexError if a field contains no `sep2`;
        callers are expected to pre-validate the field count.
        """
        resultlist = []
        fields = strori.split(sep1)
        for field in fields:
            kv = field.split(sep2)
            resultlist.append(kv[1])
        return resultlist

    def extractFromMessage(self, message):
        """Re-assemble an 8-field \\x01-separated message, replacing the IP
        field with the four ip2location() fields and keeping `source` last.

        Returns None when the message is missing or not exactly 8 fields.
        (Bug fix: the original referenced `result` before assignment on the
        not-8-fields path, raising UnboundLocalError.)
        """
        result = None
        if message is not None and len(message) > 1:
            if len(message.split("\x01")) == 8:
                resultlist = self.vlistfromkv(message, "\x01", "\x02")
                source = resultlist.pop()   # last field: source
                ip = resultlist.pop()       # second-to-last field: ip
                resultlist.extend(self.ip2location(ip))
                resultlist.append(source)
                result = "\x01".join(resultlist)
        return result


def tpprint(val, num=10000):
    """Print the first `num` elements of each RDD generated in this DStream
    and append them to a dated file under /data/speech/.

    @param num: the number of elements from the front that will be printed
                (and written to the file).
    """
    def takeAndPrint(time, rdd):
        taken = rdd.take(num + 1)
        print("########################")
        print("Time: %s" % time)
        print("########################")
        dateformat = '%Y%m%d'
        today = datetime.datetime.now().strftime(dateformat)
        # Context manager ensures the file is closed even if a write fails
        # (the original used a bare open()/close() pair).
        with open("/data/speech/speech." + today, "a") as myfile:
            for record in taken[:num]:
                print(record)
                myfile.write(str(record) + "\n")
        if len(taken) > num:
            print("...")
        print("")

    val.foreachRDD(takeAndPrint)


if __name__ == '__main__':
    zkQuorum = 'datacollect-1:2181,datacollect-2:2181,datacollect-3:2181'
    topic = {'speech-1': 1, 'speech-2': 1, 'speech-3': 1,
             'speech-4': 1, 'speech-5': 1}
    groupid = "rokid-speech-get-location"
    master = "local[*]"
    appName = "SparkStreamingRokid"
    timecell = 5  # batch interval in seconds

    sc = SparkContext(master=master, appName=appName)
    ssc = StreamingContext(sc, timecell)

    kvs = KafkaUtils.createStream(ssc, zkQuorum, groupid, topic)
    kmp = KafkaMessageParse()
    lines = kvs.map(lambda x: kmp.extractFromKafka(x))
    lines1 = lines.flatMap(lambda x: kmp.lineFromLines(x))
    # SECURITY NOTE(review): eval() on data read straight from Kafka can
    # execute arbitrary code. If the payload is a literal dict, prefer
    # ast.literal_eval or json.loads. Kept as-is to preserve behavior.
    valuedict = lines1.map(lambda x: eval(x))
    message = valuedict.map(lambda x: kmp.messageFromLine(x))
    rdd2 = message.map(lambda x: kmp.extractFromMessage(x))
    tpprint(rdd2)
    ssc.start()
    ssc.awaitTermination()
This is done mostly by overriding the pprint() function.
Reference: https://stackoverflow.com/questions/37864526/append-spark-dstream-to-a-single-file-in-python
Spark Streaming (Python): read Kafka data and write the results to a single specified local file.