Requirement: a custom SparkContext extension, `textfiles`, that supports reading text files from multiple input directories.
Extension
class SparkContext(pyspark.SparkContext):
    """pyspark.SparkContext subclass adding ``textfiles``.

    ``textfiles`` reads the text files under *multiple* input directories
    (recursively) into a single RDD of lines, which the stock
    ``SparkContext.textFile`` does not support directly.
    """

    def __init__(self, master=None, appName=None, sparkHome=None,
                 pyFiles=None, environment=None, batchSize=0,
                 serializer=PickleSerializer(), conf=None, gateway=None,
                 jsc=None):
        # Pure pass-through to the base constructor: this subclass adds
        # behavior only via textfiles(), no extra state.
        # NOTE(review): signature mirrors pyspark.SparkContext's — confirm
        # against the pyspark version in use.
        pyspark.SparkContext.__init__(
            self, master=master, appName=appName, sparkHome=sparkHome,
            pyFiles=pyFiles, environment=environment, batchSize=batchSize,
            serializer=serializer, conf=conf, gateway=gateway, jsc=jsc)

    def textfiles(self, dirs):
        """Return an RDD of lines from all text files under ``dirs``.

        Parameters
        ----------
        dirs : iterable of str
            Input directory URIs (e.g. HDFS paths). Subdirectories are
            scanned recursively.

        Returns
        -------
        RDD of str
            One element per line of input text.
        """
        hadoopConf = {
            # Hadoop accepts a comma-separated list of input directories.
            "mapreduce.input.fileinputformat.inputdir": ",".join(dirs),
            # Descend into nested subdirectories as well.
            "mapreduce.input.fileinputformat.input.dir.recursive": "true",
        }
        pairs = self.hadoopRDD(
            inputFormatClass="org.apache.hadoop.mapred.TextInputFormat",
            keyClass="org.apache.hadoop.io.LongWritable",
            valueClass="org.apache.hadoop.io.Text",
            conf=hadoopConf)
        # TextInputFormat keys are byte offsets (LongWritable); callers only
        # want the line text, so drop the key.
        return pairs.map(lambda kv: kv[1])
Example
# Example: read text files from two HDFS directories via the extended
# SparkContext and print every collected line.
from pyspark import SparkConf
from dip.spark import SparkContext

conf = SparkConf().setAppName("spark_textfiles_test")
sc = SparkContext(conf=conf)

dirs = [
    "hdfs://dip.cdh5.dev:8020/user/yurun/dir1",
    "hdfs://dip.cdh5.dev:8020/user/yurun/dir2",
]


def print_lines(lines):
    """Print each collected line; a no-op when the result is empty."""
    if lines:
        for line in lines:
            print(line)


# collect() pulls the whole RDD to the driver — fine for a small test,
# unsuitable for large inputs.
lines = sc.textfiles(dirs).collect()
print_lines(lines)
sc.stop()
Summary: a custom SparkContext extension, `textfiles`, supporting reading text files from multiple input directories.