I. Writing the Item
import scrapy

class GzweatherItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # title = scrapy.Field()
    date = scrapy.Field()
    maxtemp = scrapy.Field()
    mintemp = scrapy.Field()
    weather = scrapy.Field()
    wind = scrapy.Field()
    power = scrapy.Field()
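Once defined, the item behaves like a dict: fields are assigned and read with key access, and only declared fields are allowed. A quick illustration (the values here are hypothetical, not from the crawl):

from gzweather.items import GzweatherItem

item = GzweatherItem()
item['date'] = '2011-01-01'   # hypothetical value
item['maxtemp'] = '18'        # hypothetical value
print(item['maxtemp'])        # dict-style access
print(dict(item))             # convert to a plain dict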
II. Writing the Pipeline
from scrapy.exceptions import DropItem

class GzweatherPipeline(object):
    def process_item(self, item, spider):
        if True:  # placeholder: replace with a real validation check
            return item
        else:
            raise DropItem('reason')
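For the pipeline to actually run, it must also be registered in the project settings. A minimal sketch, assuming the project package is named gzweather (the number sets the execution order, 0-1000, lower runs first):

# settings.py
ITEM_PIPELINES = {
    'gzweather.pipelines.GzweatherPipeline': 300,
}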
III. Writing the Spider
import scrapy
from gzweather.items import GzweatherItem

class TtffSpider(scrapy.Spider):
    # name uniquely identifies the Spider; it is required,
    # and several Spider instances can be created from one class
    name = 'tianqi'
    # allowed_domains lists the domains the spider may crawl
    allowed_domains = ['lishi.tianqi.com']
    # start_urls is the list of URLs crawling starts from
    # when no specific URL is given
    start_urls = ['http://lishi.tianqi.com/guangzhou/201101.html']
    # custom_settings is a dict; when the spider starts,
    # it overrides the project-level settings
    custom_settings = {}

    # Other attributes and methods provided by scrapy.Spider:
    # - crawler: set by the class method from_crawler() after the class is
    #   initialized, and links the spider to the Crawler object; Crawler
    #   exposes the project's components (extensions, middlewares, signal
    #   manager, etc.) through a single entry point
    # - settings: a Settings instance
    # - logger: created with the spider's name; used to send log messages
    # - from_crawler(): sets the crawler and settings attributes
    # - start_requests(): returns an iterable with the first Requests to
    #   crawl; called only once by Scrapy, so it can be a generator
    # - make_requests_from_url(): returns a Request object for a URL
    # - parse(): the default callback, responsible for handling the response
    #   and returning scraped data and/or follow-up URLs; every other request
    #   callback has the same requirements

    def parse(self, response):
        # The callback must return an iterable of Requests, dicts, or Items.
        self.logger.info('A response from %s just arrived!', response.url)
        # In this project the response is an HtmlResponse;
        # instantiate a Selector from it
        sel = scrapy.Selector(response)
        title = sel.xpath('//title/text()').extract_first()
        print('print output *******')
        print(title)
        uls = sel.xpath('//div[@class="tqtongji2"]/ul')
        # print('uls:', uls.extract())
        for index, ul in enumerate(uls):
            if index == 0:  # skip the header row
                continue
            gzitem = GzweatherItem()
            args = ul.xpath('li/text()').extract()
            if len(args) == 5:
                gzitem['date'] = ul.xpath('li/a/text()').extract()[0]
                gzitem['maxtemp'] = args[0]
                gzitem['mintemp'] = args[1]
                gzitem['weather'] = args[2]
                gzitem['wind'] = args[3]
                gzitem['power'] = args[4]
                yield gzitem
            elif len(args) == 6:
                gzitem['date'] = args[0]
                gzitem['maxtemp'] = args[1]
                gzitem['mintemp'] = args[2]
                gzitem['weather'] = args[3]
                gzitem['wind'] = args[4]
                gzitem['power'] = args[5]
                yield gzitem
        # for h3 in response.xpath('//h3').extract():
        #     yield GzweatherItem(date=h3)
        print('#####')
        print('#####')
        # follow the "next month" link
        for url in sel.xpath('//div[contains(@id, "tool_site")]'
                             '/div[1]/span[1]/a[last()]/@href').extract():
            print(url)
            yield scrapy.Request(url, self.parse)
        # alternative selectors tried during development:
        # //span[@class="tqxiangqing"]/a[2]  and  //a/@href

    # log(message[, level, component]): records a log message via
    # scrapy.log.msg(), automatically carrying the spider's name attribute;
    # it wraps the logger-based methods and is kept for backwards compatibility
    # closed(reason): called when the spider shuts down; a shortcut for
    # signals.connect() listening for the spider_closed signal

# from scrapy.cmdline import execute
# execute()  # run the crawl from within this file
IV. Fetching the Data
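With the item, pipeline, and spider in place, the crawl is run from the project directory and exported to CSV. Presumably this is how the gzweather.csv file used in the next section was produced (the -o flag appends scraped items to the given file):

scrapy crawl tianqi -o gzweather.csv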
V. Data Processing
# encoding: utf-8
import pandas as pd

filename = r'E:\Users\3404\gzweather.csv'
outpath = r'E:\Users\3404\newgzweather.csv'

if __name__ == '__main__':
    df = pd.read_csv(filename, header=None)
    print('look first')
    print(df.head())
    df.columns = df.loc[0]        # use the first row as the header
    df = df.drop(0)
    df.index = df['date'].values  # index the rows by date
    df = df.sort_index()
    print('sort adjusted')
    print(df.head())
    df = df.drop_duplicates()     # crawled months may overlap
    df = df.drop('date', axis=1)  # the date now lives in the index
    df = df.dropna(how='any')
    print('last')
    print(df.head())
    df.to_csv(outpath)
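As a quick sanity check (a sketch, not part of the original script), the cleaned file can be read back with the date column as the index:

import pandas as pd

df = pd.read_csv(r'E:\Users\3404\newgzweather.csv', index_col=0)
print(df.head())
print(df.index.min(), '->', df.index.max())  # covered date range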
VI. Plotting the Results
# encoding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

inputpath = r'E:\Users\3404\Desktop\newgzweather.csv'

def converttoint(series):
    # temperatures were stored as strings; convert to a list of ints
    datalist = [int(x) for x in series.tolist()]
    return datalist

if __name__ == '__main__':
    # read the cleaned file; the first column (the date) becomes the index
    df = pd.read_csv(inputpath, index_col=0, header=0, encoding='iso-8859-1',
                     names=['maxtemp', 'power', 'mintemp', 'weather', 'wind'])
    series11 = df.loc['2011-12-01':'2011-12-31', 'mintemp']
    series12 = df.loc['2012-12-01':'2012-12-31', 'mintemp']
    series13 = df.loc['2013-12-01':'2013-12-31', 'mintemp']
    series14 = df.loc['2014-12-01':'2014-12-31', 'mintemp']
    series15 = df.loc['2015-12-01':'2015-12-31', 'mintemp']
    series11h = df.loc['2011-12-01':'2011-12-31', 'maxtemp']
    series12h = df.loc['2012-12-01':'2012-12-31', 'maxtemp']
    series13h = df.loc['2013-12-01':'2013-12-31', 'maxtemp']
    series14h = df.loc['2014-12-01':'2014-12-31', 'maxtemp']
    series15h = df.loc['2015-12-01':'2015-12-31', 'maxtemp']

    # line chart: December minimum temperatures, year by year
    fig = plt.figure()
    list11 = converttoint(series11)
    list12 = converttoint(series12)
    list13 = converttoint(series13)
    list14 = converttoint(series14)
    list15 = converttoint(series15)
    plt.plot(range(31), list11, label='2011')
    plt.plot(range(31), list12, label='2012')
    plt.plot(range(31), list13, label='2013')
    plt.plot(range(31), list14, label='2014')
    plt.plot(range(31), list15, label='2015')
    plt.xlabel('12-01 to 12-31')
    plt.ylabel('temperature')
    plt.title('temperature variation in past 5 years')
    plt.legend(loc='best')
    plt.show()
    # alternative: plot each series directly, e.g.
    # series11.plot(style='b'); fig.autofmt_xdate(); plt.show()

    # stacked bar chart: mean December min temperature, plus the mean
    # gap up to the max temperature, with standard deviations as error bars
    m11 = np.array(list11).mean()
    m12 = np.array(list12).mean()
    m13 = np.array(list13).mean()
    m14 = np.array(list14).mean()
    m15 = np.array(list15).mean()
    meantemps = [m11, m12, m13, m14, m15]
    m11h = np.array(converttoint(series11h)).mean() - m11
    m12h = np.array(converttoint(series12h)).mean() - m12
    m13h = np.array(converttoint(series13h)).mean() - m13
    m14h = np.array(converttoint(series14h)).mean() - m14
    m15h = np.array(converttoint(series15h)).mean() - m15
    meantemphs = [m11h, m12h, m13h, m14h, m15h]
    std11 = np.array(list11).std()
    std12 = np.array(list12).std()
    std13 = np.array(list13).std()
    std14 = np.array(list14).std()
    std15 = np.array(list15).std()
    stdtemps = [std11, std12, std13, std14, std15]
    std11h = np.array(converttoint(series11h)).std()
    std12h = np.array(converttoint(series12h)).std()
    std13h = np.array(converttoint(series13h)).std()
    std14h = np.array(converttoint(series14h)).std()
    std15h = np.array(converttoint(series15h)).std()
    stdtemphs = [std11h, std12h, std13h, std14h, std15h]
    ind = np.arange(5)
    width = 0.35
    p1 = plt.bar(ind, meantemps, width, color='r', yerr=stdtemps)
    p2 = plt.bar(ind, meantemphs, width, color='y',
                 bottom=meantemps, yerr=stdtemphs)
    plt.ylabel('temperature')
    plt.title('mean min and max temperature in past 5 years')
    plt.xticks(ind + width / 2., ('2011', '2012', '2013', '2014', '2015'))
    plt.legend((p1[0], p2[0]), ('min temperature', 'delta temperature'))
    plt.show()