1. Description
A DStream can always be converted into the RDDs of its individual batches, so when the per-batch processing logic becomes more complex you can consider handing it over to Spark SQL.
2. Integration method
Streaming and Core integration:
the transform or foreachRDD methods
Core and SQL integration:
RDD <==> DataFrame conversion (a minimal sketch of the foreachRDD variant follows)
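For illustration only, a minimal sketch of the foreachRDD variant. This is a hedged example, not part of the original program: the DStream name lines is hypothetical, and SQLContextSingleton refers to the helper object defined in the full program in section 3; the Spark 1.x SQLContext API is assumed.

// Minimal sketch: foreachRDD + RDD <==> DataFrame conversion (Spark 1.x API assumed).
// `lines` stands for any DStream[String]; SQLContextSingleton is defined in section 3 below.
lines.foreachRDD(rdd => {
  val sqlContext = SQLContextSingleton.getSqlContext(rdd.sparkContext)
  import sqlContext.implicits._
  // RDD ==> DataFrame, registered as a temporary table
  rdd.flatMap(_.split(" ")).map((_, 1)).toDF("word", "c").registerTempTable("tb_word")
  // foreachRDD is an output operation, so act on the SQL result directly
  sqlContext.sql("select word, count(c) as vc from tb_word group by word").show()
})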
3. Procedures
package com.sql.it

import org.apache.spark.sql.SQLContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object StreamingSQL {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("StreamingWindowOfKafka22")
      .setMaster("local[*]")
    val sc = SparkContext.getOrCreate(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    // When an API such as updateStateByKey is used, a checkpoint dir must be given;
    // the folder the path points to must not already exist.
    ssc.checkpoint("hdfs://linux-hadoop01.ibeifeng.com:8020/beifeng/spark/streaming/9421151351")

    val kafkaParams = Map(
      "group.id" -> "streaming-kafka-78912151",
      "zookeeper.connect" -> "linux-hadoop01.ibeifeng.com:2181/kafka",
      "auto.offset.reset" -> "smallest"
    )
    // The value in topics is the number of threads reading the data, so it must be >= 1.
    val topics = Map("beifeng" -> 4)
    val dstream = KafkaUtils.createStream[String, String, kafka.serializer.StringDecoder, kafka.serializer.StringDecoder](
      ssc,                             // the SparkStreaming context
      kafkaParams,                     // connection parameters for Kafka ===> connects via the Kafka high-level consumer API
      topics,                          // the topic names and the number of threads reading each of them
      StorageLevel.MEMORY_AND_DISK_2   // storage level for the data after the receiver gets it from Kafka
    ).map(_._2)

    /**
     * transform: converts the DStream operation into an RDD operation; the function passed in
     * only needs to return a new RDD.
     */
    dstream.transform(rdd => {
      // use SQL to compute the word count
      val sqlContext = SQLContextSingleton.getSqlContext(rdd.sparkContext)
      import sqlContext.implicits._
      val procedRDD = rdd.filter(_.nonEmpty).flatMap(_.split(" ").map((_, 1)))
      procedRDD.toDF("word", "c").registerTempTable("tb_word")
      val resultRDD = sqlContext.sql("select word, count(c) as vc from tb_word group by word").map(row => {
        val word = row.getAs[String]("word")
        val count = row.getAs[Long]("vc")
        (word, count)
      })

      resultRDD
    }).print()

    // start processing
    ssc.start()
    ssc.awaitTermination() // block and wait until the streaming job is stopped or interrupted
  }
}

object SQLContextSingleton {
  @transient private var instance: SQLContext = _

  def getSqlContext(sc: SparkContext): SQLContext = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = new SQLContext(sc)
        }
      }
    }
    instance
  }
}
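A note on the design: the lazily created SQLContextSingleton follows the pattern recommended in the Spark Streaming programming guide, so the SQL context is built from the active SparkContext on demand and can be re-created after a driver restart instead of being captured once in the closure. On Spark 2.x and later the same idea is expressed with SparkSession, whose builder already caches the active session per JVM; the sketch below is only an assumed modernization of the transform body, not part of the original program (createOrReplaceTempView replaces registerTempTable there).

// Hedged sketch only (assumption: Spark 2.x+ is available); not part of the original program.
// SparkSession.builder().getOrCreate() caches the active session, so no hand-written singleton is needed.
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

def wordCount(rdd: RDD[String]): RDD[(String, Long)] = {
  val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()
  import spark.implicits._
  rdd.filter(_.nonEmpty).flatMap(_.split(" ")).map((_, 1))
    .toDF("word", "c").createOrReplaceTempView("tb_word")
  spark.sql("select word, count(c) as vc from tb_word group by word")
    .map(row => (row.getAs[String]("word"), row.getAs[Long]("vc"))) // Dataset[(String, Long)]
    .rdd                                                            // back to an RDD for transform
}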
4. Effects