Crawling the Sina weather site with XPath
References:
http://cuiqingcai.com/1052.html
http://cuiqingcai.com/2621.html
http://www.cnblogs.com/jixin/p/5131040.html
Complete code:
# -*- coding: utf-8 -*-
# Print the Sina weather forecast for a city given by its pinyin name.
#
# The script downloads http://weather.sina.com.cn/<city>, pulls the forecast
# fields out of the page with XPath and prints one line per forecast day.
import contextlib

try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3: same Request/urlopen API

try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3

user_agent = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 "
              "SE 2.X MetaSr 1.0")
headers = {'user-agent': user_agent}
url = 'http://weather.sina.com.cn/'

# The page is expected to carry 10 forecast days; PM2.5 and air quality
# only have 7 pieces of data.
FORECAST_DAYS = 10
AIR_QUALITY_DAYS = 7


def _text(node):
    """Return ``node.text``, or an empty string when the node has no text."""
    return node.text or u''


def change_list(lis):
    """Merge the two half-day entries of each forecast day into one string.

    ``lis`` is either a list of plain strings (the icon ``alt`` values) or a
    list of elements exposing ``.text`` (the time ``span`` nodes).  Strings
    are joined with ``u' '``, element texts with ``u' to '``.

    Sometimes there are only 19 icons and times for the current day instead
    of 20; in that case (any odd length) the first entry is kept on its own
    and the remaining entries are paired.  An empty input gives ``[]``.

    Returns a new list of strings; ``lis`` is not modified.
    """
    if not lis:
        return []

    # Decide by the items themselves rather than by comparing against
    # module-level lists, so the helper does not depend on call order.
    if hasattr(lis[0], 'text'):
        values = [_text(node) for node in lis]
        joiner = u' to '
    else:
        values = list(lis)
        # NOTE(review): the separator between the two icon descriptions is a
        # single space in the source; the original article may have used a
        # connecting word here -- confirm.
        joiner = u' '

    unpaired = len(values) % 2  # 1 when the first entry has no partner
    merged = values[:unpaired]
    for i in range(unpaired, len(values), 2):
        merged.append(values[i] + joiner + values[i + 1])
    return merged


def fetch_forecast_page(city):
    """Download the forecast page for ``city`` and return the parsed tree."""
    # Imported here so that change_list stays importable without lxml.
    from lxml import etree

    req = urllib2.Request(url + city, headers=headers)
    # closing() releases the connection even if read() raises.
    with contextlib.closing(urllib2.urlopen(req)) as response:
        text = response.read()
    return etree.HTML(text)


def main():
    """Ask for a city, fetch its forecast and print one line per day."""
    user_input = read_line(
        "Enter the Pinyin of the city in which you want to query the "
        "weather, such as beijing\n").strip()
    html = fetch_forecast_page(user_input)

    note1 = html.xpath('//*[@class="wt_tt0_note"]')
    note2 = html.xpath('//*[@class="wt_tt0_note"]/..')
    dates = html.xpath('//*[@class="wt_fc_c0_i_date"]')
    days = html.xpath('//*[@class="wt_fc_c0_i_date"]/following-sibling::*[1]')
    icons = change_list(
        html.xpath('//*[@class="wt_fc_c0_i_icons clearfix"]/img/@alt'))
    times = change_list(html.xpath('//*[@class="wt_fc_c0_i_times"]/span'))
    temps = html.xpath('//*[@class="wt_fc_c0_i_temp"]')
    tips = html.xpath('//*[@class="wt_fc_c0_i_tip"]')
    ls = html.xpath('//*[@class="l"]')
    rs = html.xpath('//*[@class="r"]')

    if note1 and note2:
        print(u' '.join([_text(note1[0]), _text(note2[0])]))

    # Never index past the shortest list, so a page with fewer entries than
    # expected prints what it has instead of raising IndexError.
    rows = min(FORECAST_DAYS, len(dates), len(days), len(times),
               len(icons), len(temps), len(tips))
    air_rows = min(AIR_QUALITY_DAYS, len(ls), len(rs))

    for i in range(rows):
        fields = [_text(dates[i]), _text(days[i]), times[i], icons[i],
                  _text(temps[i]), _text(tips[i])]
        if i < air_rows:
            fields.append(u'pm2.5: ' + _text(ls[i]))
            fields.append(u'air quality: ' + _text(rs[i]))
        # A single pre-joined argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(u' '.join(fields))


if __name__ == '__main__':
    main()