#!/usr/bin/env python
# -*- coding: utf-8 -*-
# A Python 2 crawler that downloads wallpapers from desk.zol.com.cn,
# using only the re and requests modules.

import os
import re
import requests


def get_urls(url, regex):
    '''Fetch a page and return the full URLs of all links found inside
    the block of HTML matched by regex.'''
    urls = []
    base_url = 'http://desk.zol.com.cn'
    content = requests.get(url).content
    # re.S lets '.' match newlines, so the pattern can span the whole block.
    area = re.search(regex, content, re.S).group(0)
    tails = re.findall(r'href="(.*?)"', area)
    for tail in tails:
        urls.append(base_url + tail)
    return urls


def download_picture(url, count):
    '''Download the full-size picture on a detail page as pic/<count>.jpg.'''
    target_dir = 'pic'
    if os.path.exists(target_dir):
        if not os.path.isdir(target_dir):
            # A plain file is squatting on the name: remove it, then
            # create the directory (the original forgot this mkdir).
            os.remove(target_dir)
            os.mkdir(target_dir)
    else:
        os.mkdir(target_dir)
    content = requests.get(url).content
    # The original pattern here was lost in the source; the big-image tag on
    # the detail page is assumed to look like <img id="bigImg" src="...">.
    picture_url = re.search(r'<img id="bigImg" src="(.*?)"', content).group(1)
    with open(os.path.join(target_dir, str(count) + '.jpg'), 'wb') as f:
        f.write(requests.get(picture_url).content)


def spider(url, count):
    '''Walk one list page: visit every album and download every picture.'''
    # The opening part of regex1 was lost in the source; it is assumed to
    # match the thumbnail-list container on the list page.
    regex1 = r'<ul class="pic-list2 clearfix">.*?</ul>'
    regex2 = r'<ul id="showImg".*?</ul>'
    urls = get_urls(url, regex1)
    for each_url in urls:
        picture_urls = get_urls(each_url, regex2)
        for each_picture_url in picture_urls:
            download_picture(each_picture_url, count)
            print 'downloading picture ' + str(count)
            count += 1
    return count


def get_next_page_url(url):
    '''Return the full URL of the next list page.'''
    base_url = 'http://desk.zol.com.cn'
    content = requests.get(url).content
    tail = re.search(r'<a id="pageNext" href="(.*?)"', content).group(1)
    return base_url + tail


if __name__ == '__main__':
    url = 'http://desk.zol.com.cn/meinv/'
    count = 1
    count = spider(url, count)
    while True:
        key = raw_input('Input y/Y to continue downloading the next page, '
                        'or input anything else to exit. ')
        if re.match(r'y', key, re.I):
            url = get_next_page_url(url)
            count = spider(url, count)
        else:
            exit()
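The core scraping step is a two-stage match: re.search with re.S first narrows the page down to one container block, then re.findall pulls the links out of just that block. Here is a minimal, self-contained sketch of that pattern as get_urls uses it; the HTML fragment and the "thumbs" class name are invented for illustration, not taken from zol.com.cn:

import re

# Invented HTML fragment standing in for a downloaded list page.
html = '''<div>
<ul class="thumbs">
  <li><a href="/bizhi/1.html">one</a></li>
  <li><a href="/bizhi/2.html">two</a></li>
</ul>
</div>'''

# re.S makes '.' match newlines, so '.*?' can span the whole <ul> block.
area = re.search(r'<ul class="thumbs">.*?</ul>', html, re.S).group(0)
# Extract every href from that block only, not from the whole page.
tails = re.findall(r'href="(.*?)"', area)
print tails  # ['/bizhi/1.html', '/bizhi/2.html']

Without re.S, '.' would stop at the first newline and the search would miss a container that spans multiple lines, which is why get_urls always passes that flag.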
This article is from the "Last Night Stars" blog; please keep this source when reposting: http://yestreenstars.blog.51cto.com/1836303/1664227
A crawler example, mainly using the re and requests modules