標籤:
最近發現 吾志 上使用者的頭像都很個性,另外,對於沒有把日記設為私密的使用者,當天的日記是公開的,誰都可以查看。
所以,如果每天把所有可查看的日記爬一遍,那麼~~ 哈哈
我以前對爬蟲只是瞭解一點點,沒有真的玩過。既然今晚興致來了,那就隨便學一下咯~
參考 http://cuiqingcai.com/1052.html
# coding=utf-8
"""Download every .jpg image referenced on https://wuzhi.me/last.

The page is fetched once, image URLs are pulled out with a regular
expression, and each image is saved in the current directory as
``0.jpg``, ``1.jpg``, ...

The module runs on both Python 2 (``urllib2``) and Python 3
(``urllib.request``); the Python 3 modules are imported under the Python 2
names so the rest of the code is identical on both.
"""
import os
import re

try:  # Python 2 module names, as used by the original listing
    import cookielib  # noqa: F401 -- imported by the original listing, unused
    import urllib2
    from urllib import urlencode
except ImportError:  # Python 3: the same APIs under their new names
    import http.cookiejar as cookielib  # noqa: F401
    import urllib.request as urllib2
    from urllib.parse import urlencode


# Network timeout, in seconds, applied to every request.
TIMEOUT = 10

# Matches the URL inside  src="....jpg" alt=  attributes.
IMG_SRC_RE = re.compile(r'src="(.+?\.jpg)" alt=')

user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/41.0.2272.118 Safari/537.36')
headers = {'User-Agent': user_agent}
values = {}
# An empty form must become None: any non-None body (even an empty string)
# makes urllib issue a POST instead of a GET.
data = urlencode(values).encode('ascii') if values else None


def _to_text(raw):
    """Return *raw* as text; only Python 3 ``bytes`` are decoded.

    On Python 2 ``bytes`` is ``str``, so the value is returned unchanged.
    NOTE(review): UTF-8 is assumed for the page encoding -- confirm.
    """
    if isinstance(raw, bytes) and not isinstance(raw, str):
        return raw.decode('utf-8', 'replace')
    return raw


def mkdir(path):
    """Create directory *path* (with parents) if missing.

    Returns the normalised path that was actually used.
    """
    # Strip whitespace on both sides.
    path = path.strip()
    # Strip trailing backslashes.
    path = path.rstrip("\\")

    if not os.path.exists(path):
        os.makedirs(path)

    return path


def save_file(path, file_name, data):
    """Write the bytes *data* to *file_name* inside directory *path*.

    Does nothing when *data* is None (i.e. the download failed).
    """
    if data is None:
        return

    # Use the normalised directory returned by mkdir(); writing to the raw
    # argument would target a different path whenever mkdir() stripped it.
    directory = mkdir(path)
    # The with-block closes the file even if the write raises.
    with open(os.path.join(directory, file_name), "wb") as f:
        f.write(data)


def getHtml(url):
    """Fetch *url* with the browser-like ``headers`` and return its body."""
    req = urllib2.Request(url, data, headers)
    page = urllib2.urlopen(req, timeout=TIMEOUT)
    try:
        html = page.read()
    finally:
        page.close()
    return _to_text(html)


def get_file(url):
    """Download *url* and return its raw bytes, or None on any error.

    Failures are reported on stdout and swallowed so that one bad image
    does not abort the whole crawl.
    """
    try:
        # A private opener is enough; nothing is installed globally.
        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        operate = opener.open(urllib2.Request(url), timeout=TIMEOUT)
        try:
            return operate.read()
        finally:
            operate.close()
    except Exception as e:  # not BaseException: Ctrl-C must still work
        print('%s fuck' % e)
        return None


def getImg(html):
    """Save every .jpg found in *html* to the current directory.

    Files are named after the position of the image on the page; an image
    that fails to download leaves a gap in the numbering.

    Returns the number of image URLs found in *html*.
    """
    imglist = IMG_SRC_RE.findall(_to_text(html))

    for x, imgurl in enumerate(imglist):
        # urllib.urlretrieve(imgurl, '%s.jpg' % x) would also work, but
        # get_file() sends a User-Agent header and tolerates failures.
        save_file('.', '%s.jpg' % x, get_file(imgurl))

    return len(imglist)


def main():
    """Crawl the public "latest diaries" page and print the image count."""
    html = getHtml("https://wuzhi.me/last")
    print(getImg(html))


if __name__ == "__main__":
    main()
十分簡陋,哈哈~
python 爬圖 helloworld