"""Download the images referenced from a budejie.com picture listing page.

Each listing page is fetched, the detail-page links inside the image
containers are collected, and the first image of every detail page is saved
to a local directory under a file name derived from the image title.
"""
import datetime
import os
import re
from urllib import request

import requests
from lxml import etree

# One shared header set: the original defined it twice with diverging
# spelling (and a trailing space in one header name).
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) "
                  "Gecko/20100101 Firefox/63.0"
}

# Seconds to wait for the server before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 10

# XPath prefix selecting the image containers. lxml's HTML parser exposes
# element and attribute names in lower case, so the selector must be lower
# case as well.
# NOTE(review): the class value is the original literal lower-cased and
# stripped of its stray trailing space -- confirm against the live markup.
_IMG_BOX = "//div[@class='j-r-list-c-img']"

# Characters removed from an image title before it is used as a file name.
# NOTE(review): the original pattern was garbled; the duplicated "?", "." and
# "," suggest ASCII/full-width pairs, so both forms are stripped here. A
# literal "1" in the original class looked like a typo for "!" and was
# dropped -- confirm.
_UNSAFE_TITLE_CHARS = re.compile(r"[?？.。,，!！~\[\]/]")

# Default download directory (path kept exactly as in the original script).
_IMAGE_DIR = '/home/Yuyang/pycharmprojects/py3_spider/image/'


def _fetch_html(url):
    """Fetch *url* and return the lxml element tree parsed from its body."""
    text = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT).text
    return etree.HTML(text)


def parse_page(url):
    """Return the detail-page URLs linked from the listing page at *url*.

    Args:
        url: Address of a picture listing page.

    Returns:
        A list of absolute URLs, built by prefixing the site domain to every
        ``href`` found inside an image container. The list is empty when the
        page contains no such links.
    """
    domain = "http://www.budejie.com"
    links = _fetch_html(url).xpath(_IMG_BOX + "//a/@href")
    # A list (rather than a lazy ``map`` object) can be printed, measured and
    # iterated more than once.
    return [domain + link for link in links]


def get_img(url, save_dir=_IMAGE_DIR):
    """Download the first image found on the detail page at *url*.

    The file name is the image's ``title`` attribute with punctuation removed
    plus the extension taken from the image URL. If the page has no usable
    title, the base name of the image URL is used instead.

    Args:
        url: Address of an image detail page.
        save_dir: Directory the image is written to; created if missing.
    """
    html = _fetch_html(url)

    sources = html.xpath(_IMG_BOX + "//img/@src")
    if not sources:
        # Nothing to download; report it rather than crash on ``[0]``.
        print("No image found on {}".format(url))
        return
    img_url = sources[0]

    titles = html.xpath(_IMG_BOX + "//img/@title")
    img_title = _UNSAFE_TITLE_CHARS.sub('', titles[0]).strip() if titles else ''

    stem, suffix = os.path.splitext(os.path.basename(img_url))
    file_name = (img_title or stem) + suffix

    os.makedirs(save_dir, exist_ok=True)
    request.urlretrieve(img_url, os.path.join(save_dir, file_name))
    print("Download image :{}".format(file_name))


# Total download time in whole seconds, accumulated by main().
delta_time = 0


def main():
    """Download every image of the configured listing pages and report timing."""
    global delta_time
    for x in range(5, 6):
        start = datetime.datetime.now()
        # NOTE(review): the original template read "pic {}" with a stray
        # space; a path separator is assumed -- confirm the site's URL scheme.
        base_url = "http://www.budejie.com/pic/{}".format(x)
        print("==================== start to download page {} ====================== ===== ".format(x))
        for img_url in parse_page(base_url):
            get_img(img_url)
        # total_seconds() also covers the days component that .seconds drops.
        delta = int((datetime.datetime.now() - start).total_seconds())
        delta_time += delta
        # The original format string had one placeholder for two arguments,
        # so the elapsed time was never shown.
        print("==================== time consumed on page {}: {} seconds ====== ============ ".format(x, delta))
    print("=================== download complete, total time consumed {} seconds ==================== ".format(delta_time))


if __name__ == '__main__':
    main()
# Best friend picture download (image scraper for budejie.com)