Python 這個語言用著真是舒服:文法簡單,代碼非常簡短。我用了一個中午就看完了一本入門的書,於是在網上找了點資料,模仿著寫了個網路爬蟲。以前用 C 寫過類似的程式,代碼量估計是這個的 10 倍。順便也瞭解了一下 Regex。
關於 Regex,我截了幾個圖;看了這幾個圖,大概就能看懂簡單的 Regex 了。
#filename:downloadpage.py
# -*- coding: utf-8 -*-
"""Small breadth-first crawler for http://news.sina.com.cn article pages.

Run as a script, it asks how many pages to download and saves them as
``0.html``, ``1.html``, ... in the output directory.  The code targets
Python 2 (as originally written) and also runs unchanged on Python 3.
"""
import os
import re
import shutil
from collections import deque
from contextlib import closing

try:  # Python 2, which this script was originally written for
    from urllib2 import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen

# Directory the original script wrote to (drive F: on Windows).
DEFAULT_OUTPUT_DIR = "f:\\"

# Article links on the news site.  The dots are escaped, and the path may
# only contain printable ASCII other than quotes and angle brackets, so a
# match can no longer run past the end of an href attribute into the rest
# of the tag.  The pattern is bytes because pages are scanned undecoded.
URL_PATTERN = re.compile(
    br"http://news\.sina\.com\.cn/[^\x00-\x20\"'<>\x7f-\xff]+?\.shtml")


def _to_native_str(raw):
    """Return an ASCII byte string as the interpreter's native ``str``."""
    if isinstance(raw, str):  # Python 2: bytes and str are the same type
        return raw
    return raw.decode("ascii")  # URL_PATTERN only ever matches ASCII bytes


# Download a web page
def downURL(url, filename, outdir=DEFAULT_OUTPUT_DIR):
    """Download ``url`` and save it as ``filename`` inside ``outdir``.

    Args:
        url: Address to fetch.
        filename: Name of the file to create in ``outdir``.
        outdir: Target directory; defaults to the original ``f:\\``.

    Returns:
        1 when the page was saved, 0 when ``url`` could not be opened.

    Raises:
        IOError/OSError: if the output file cannot be created or written.
    """
    try:
        response = urlopen(url)
    except Exception:
        # Best effort: a page that cannot be opened is reported and skipped.
        # Unlike a bare ``except``, Ctrl-C still interrupts the crawl.
        print('download exception')
        return 0
    # Both handles are closed even if the copy fails half-way through.
    with closing(response):
        with open(os.path.join(outdir, filename), "wb") as out:
            shutil.copyfileobj(response, out)
    return 1


# Extract URLs with a regex
def getURL(url):
    """Return every article link found in the page at ``url``.

    Links are returned in page order and may contain duplicates.  An
    unreachable page, or a page with no links (including an empty one),
    yields an empty list.
    """
    try:
        response = urlopen(url)
    except Exception:
        print('get url exception')
        return []
    with closing(response):
        page = response.read()
    return [_to_native_str(match) for match in URL_PATTERN.findall(page)]


# Breadth-first search
def spider(startURL, times, outdir=DEFAULT_OUTPUT_DIR):
    """Crawl breadth-first from ``startURL`` and save up to ``times`` pages.

    Pages are stored as ``0.html``, ``1.html``, ... in ``outdir``.  A URL is
    queued at most once, and a failed download does not use up the quota.

    Returns:
        Always 1.
    """
    pending = deque([startURL])
    seen = set([startURL])  # every URL ever queued, so none is fetched twice
    saved = 0
    while pending and saved < times:
        url = pending.popleft()
        print("%s %s" % (url, saved))
        if not downURL(url, str(saved) + '.html', outdir):
            continue
        saved += 1
        # Only harvest more links while pages are still needed and the
        # queue is short; this keeps the frontier from growing unbounded.
        if saved < times and len(pending) < times:
            for link in getURL(url):
                if link not in seen:
                    seen.add(link)
                    pending.append(link)
    return 1


def main():
    """Prompt for the number of pages and start crawling the news site."""
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    count = int(read_line('輸入需要下載的網頁數量:'))
    spider("http://news.sina.com.cn", count)


if __name__ == "__main__":
    main()