Capturing and displaying weather data with Scrapy


I. Writing the item

import scrapy


class GzweatherItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # title = scrapy.Field()
    date = scrapy.Field()
    maxtemp = scrapy.Field()
    mintemp = scrapy.Field()
    weather = scrapy.Field()
    wind = scrapy.Field()
    power = scrapy.Field()
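
An item behaves like a dict with a fixed set of keys: only declared fields may be assigned. A quick usage sketch (not part of the original post):

item = GzweatherItem()
item['date'] = '2011-01-01'      # declared field: OK
print(dict(item))                # {'date': '2011-01-01'}
# item['city'] = 'Guangzhou'     # undeclared field: raises KeyError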
    

II. Writing the pipeline

from scrapy.exceptions import DropItem


class GzweatherPipeline(object):

    def process_item(self, item, spider):
        # replace this placeholder condition with a real validation check
        if True:
            return item
        else:
            raise DropItem('reason')
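
For the pipeline to run it has to be enabled in the project's settings.py; a minimal sketch, assuming the project module is named gzweather:

# settings.py (sketch; the module path depends on your project name)
ITEM_PIPELINES = {
    'gzweather.pipelines.GzweatherPipeline': 300,  # lower number = earlier in the chain
}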


III. Writing the spider

import scrapy

from gzweather.items import GzweatherItem


class TtffSpider(scrapy.Spider):
    # name is the string that identifies the Spider; it must be unique and is
    # required; multiple Spider instances can be created from the class
    name = 'tianqi'
    # allowed_domains is the list of domains the spider may crawl
    allowed_domains = ['lishi.tianqi.com']
    # start_urls is the list of URLs the spider starts crawling from
    # when no specific URLs are given
    start_urls = ['http://lishi.tianqi.com/guangzhou/201101.html']
    # custom_settings is a dict; when the spider starts, it overrides the
    # project-level settings
    custom_settings = {}

    # Other attributes and methods provided by scrapy.Spider:
    # - crawler: set by the class method from_crawler() after the class is
    #   initialized, and linked to the Crawler object of this Spider
    #   instance; Crawler encapsulates many components of the project and
    #   acts as a single entry point (extensions, middlewares, the signal
    #   manager, etc.)
    # - settings: a Settings instance
    # - logger: created with the spider's name; used to emit log messages
    # - from_crawler(): sets the crawler and settings attributes
    # - start_requests(): returns an iterable containing the spider's first
    #   Requests to crawl; called by Scrapy only once, so it can be
    #   implemented as a generator
    # - make_requests_from_url(): returns a Request object for crawling
    # - parse(): the default callback; it handles the response and returns
    #   scraped data and/or follow-up URLs; every other Request callback has
    #   the same requirements

    def parse(self, response):
        # the callback must return an iterable of Requests, dicts, or Items
        self.logger.info('A response from %s just arrived!', response.url)
        # instantiate a Selector over the response
        sel = scrapy.Selector(response)
        title = sel.xpath('//title/text()').extract_first()
        print('print output *******')
        print(title)
        uls = sel.xpath('//div[@class="tqtongji2"]/ul')
        for index, ul in enumerate(uls):
            gzitem = GzweatherItem()
            if index == 0:
                continue  # the first ul is the table header
            args = ul.xpath('li/text()').extract()
            if len(args) == 5:
                # on these rows the date sits inside a link
                gzitem['date'] = ul.xpath('li/a/text()').extract()[0]
                gzitem['maxtemp'] = args[0]
                gzitem['mintemp'] = args[1]
                gzitem['weather'] = args[2]
                gzitem['wind'] = args[3]
                gzitem['power'] = args[4]
                yield gzitem
            elif len(args) == 6:
                gzitem['date'] = args[0]
                gzitem['maxtemp'] = args[1]
                gzitem['mintemp'] = args[2]
                gzitem['weather'] = args[3]
                gzitem['wind'] = args[4]
                gzitem['power'] = args[5]
                yield gzitem
        print('#####')
        # follow the link to the next month
        for url in sel.xpath('//div[contains(@id, "tool_site")]'
                             '/div[1]/span[1]/a[last()]/@href').extract():
            print(url)
            yield scrapy.Request(url, self.parse)

    # log(message[, level, component]): records a log message through
    #   scrapy.log.msg(), automatically carrying the spider's name attribute;
    #   it wraps sending log messages through logger and is kept for
    #   backwards compatibility
    # closed(reason): called when the spider shuts down; a shortcut for
    #   signals.connect() listening for the spider_closed signal

# to run from within the file:
# from scrapy.cmdline import execute
# execute()
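
The row-parsing logic above is easy to check offline; here is a minimal sketch (not from the original post) that runs the same XPaths against a hand-written HTML fragment in the assumed page layout:

import scrapy

html = '''<div class="tqtongji2">
  <ul><li>date</li><li>max</li><li>min</li><li>weather</li><li>wind</li><li>power</li></ul>
  <ul><li><a href="#">2011-01-01</a></li><li>18</li><li>9</li>
      <li>sunny</li><li>north wind</li><li>3</li></ul>
</div>'''

sel = scrapy.Selector(text=html)
for index, ul in enumerate(sel.xpath('//div[@class="tqtongji2"]/ul')):
    if index == 0:
        continue  # skip the header row
    print(ul.xpath('li/a/text()').extract()[0])  # 2011-01-01
    print(ul.xpath('li/text()').extract())       # ['18', '9', 'sunny', 'north wind', '3']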

IV. Getting the data
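
The body of this section is missing in the source. Presumably the spider was run with Scrapy's built-in CSV feed export to produce the gzweather.csv file used below, with a command along these lines (an assumption, not from the original):

scrapy crawl tianqi -o gzweather.csv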


V. Processing the data

# encoding: utf-8
import pandas as pd

filename = r'E:\Users\3404\gzweather.csv'
outpath = r'E:\Users\3404\newgzweather.csv'


if __name__ == '__main__':
    df = pd.read_csv(filename, header=None)
    print('first look')
    print(df.head())
    # promote the first row to column names, then drop it
    df.columns = df.loc[0]
    df = df.drop(0)
    # index by date and sort chronologically
    df.index = df['date'].values
    df = df.sort_index()
    print('after sorting')
    print(df.head())
    df = df.drop_duplicates()
    df = df.drop('date', axis=1)
    df = df.dropna(how='any')
    print('final result')
    print(df.head())
    df.to_csv(outpath)
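
Since the feed-exported CSV already carries a header row, the same cleanup can be done more directly by letting pandas parse it; a sketch of an equivalent variant (not the author's code):

import pandas as pd

df = pd.read_csv(r'E:\Users\3404\gzweather.csv', header=0, index_col='date')
df = df.sort_index().drop_duplicates().dropna(how='any')
df.to_csv(r'E:\Users\3404\newgzweather.csv')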

VI. Plotting the data

# encoding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

inputpath = r'E:\Users\3404\Desktop\newgzweather.csv'


def converttoint(series):
    # turn a Series of temperature strings into a list of ints
    datalist = [int(x) for x in series.tolist()]
    return datalist


if __name__ == '__main__':
    df = pd.read_csv(inputpath, header=None, encoding='ISO-8859-1',
                     names=['maxtemp', 'power', 'mintemp', 'weather', 'wind'])
    # December minimum temperatures, one series per year
    series11 = df.loc['2011-12-01':'2011-12-31', 'mintemp']
    series12 = df.loc['2012-12-01':'2012-12-31', 'mintemp']
    series13 = df.loc['2013-12-01':'2013-12-31', 'mintemp']
    series14 = df.loc['2014-12-01':'2014-12-31', 'mintemp']
    series15 = df.loc['2015-12-01':'2015-12-31', 'mintemp']
    # December maximum temperatures, one series per year
    series11h = df.loc['2011-12-01':'2011-12-31', 'maxtemp']
    series12h = df.loc['2012-12-01':'2012-12-31', 'maxtemp']
    series13h = df.loc['2013-12-01':'2013-12-31', 'maxtemp']
    series14h = df.loc['2014-12-01':'2014-12-31', 'maxtemp']
    series15h = df.loc['2015-12-01':'2015-12-31', 'maxtemp']

    # line chart: daily minimums across December for each year
    fig = plt.figure()
    list11 = converttoint(series11)
    list12 = converttoint(series12)
    list13 = converttoint(series13)
    list14 = converttoint(series14)
    list15 = converttoint(series15)
    plt.plot(range(31), list11, label='2011')
    plt.plot(range(31), list12, label='2012')
    plt.plot(range(31), list13, label='2013')
    plt.plot(range(31), list14, label='2014')
    plt.plot(range(31), list15, label='2015')
    plt.xlabel('12-01 to 12-31')
    plt.ylabel('temperature')
    plt.title('temperature variation in past 5 years')
    plt.legend(loc='best')
    plt.show()
    # alternatively, plot each series directly:
    # series11.plot(style='b')
    # fig.autofmt_xdate()
    # plt.show()

    # bar chart: per-year mean minimum, with the mean max stacked as a delta
    m11 = np.array(list11).mean()
    m12 = np.array(list12).mean()
    m13 = np.array(list13).mean()
    m14 = np.array(list14).mean()
    m15 = np.array(list15).mean()
    meantemps = [m11, m12, m13, m14, m15]
    # mean max minus mean min, i.e. the part stacked on top of the minimum
    m11h = np.array(converttoint(series11h)).mean() - m11
    m12h = np.array(converttoint(series12h)).mean() - m12
    m13h = np.array(converttoint(series13h)).mean() - m13
    m14h = np.array(converttoint(series14h)).mean() - m14
    m15h = np.array(converttoint(series15h)).mean() - m15
    meantemphs = [m11h, m12h, m13h, m14h, m15h]
    std11 = np.array(list11).std()
    std12 = np.array(list12).std()
    std13 = np.array(list13).std()
    std14 = np.array(list14).std()
    std15 = np.array(list15).std()
    stdtemps = [std11, std12, std13, std14, std15]
    std11h = np.array(converttoint(series11h)).std()
    std12h = np.array(converttoint(series12h)).std()
    std13h = np.array(converttoint(series13h)).std()
    std14h = np.array(converttoint(series14h)).std()
    std15h = np.array(converttoint(series15h)).std()
    stdtemphs = [std11h, std12h, std13h, std14h, std15h]
    ind = np.arange(5)
    width = 0.35
    p1 = plt.bar(ind, meantemps, width, color='r', yerr=stdtemps)
    p2 = plt.bar(ind, meantemphs, width, color='y',
                 bottom=meantemps, yerr=stdtemphs)
    plt.ylabel('temperature')
    plt.title('mean of min temperature and mean of max temperature in past 5 years')
    plt.xticks(ind + width / 2., ('2011', '2012', '2013', '2014', '2015'))
    plt.legend((p1[0], p2[0]), ('min temperature', 'delta temperature'))
    plt.show()
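
The second bar series above is stacked on top of the first via the bottom= argument, which is why the delta (mean max minus mean min) is plotted rather than the raw maximum. A tiny self-contained sketch of the same trick with made-up numbers:

import numpy as np
import matplotlib.pyplot as plt

ind = np.arange(3)
mins = [9, 10, 8]       # hypothetical mean minimum temperatures
deltas = [8, 7, 9]      # hypothetical (mean max - mean min)
p1 = plt.bar(ind, mins, 0.35, color='r')
p2 = plt.bar(ind, deltas, 0.35, color='y', bottom=mins)  # stack on top of mins
plt.xticks(ind, ('2011', '2012', '2013'))
plt.legend((p1[0], p2[0]), ('min', 'delta'))
plt.show()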


