Goal: Douban Books (book.douban.com).
Download the book cover pictures shown on the page.
import urllib.request
import re  # regular expressions

# One match per cover image, as three groups:
#   1. the src attribute:  img src="http....jpg"
#   2. whatever has to be skipped (possibly across lines) to reach the line
#      that holds the alt attribute
#   3. that line's text up to and including  alt="..."
_IMG_PATTERN = re.compile(r'(img src="http.+?\.jpg")([\s\S]*?)(.+?.alt=".+?.")')

# Pulls the alt text out of group 3 of _IMG_PATTERN.
_ALT_PATTERN = re.compile(r'alt="(.+?.)"')

# Destination template; %s is replaced by the picture title.
_SAVE_PATH = ('C:\\users\\74172\\source\\repos\\python\\spidertest1'
              '\\images\\book.douban\\%s.jpg')


def getjpg(date):
    """Find every ``<img>`` in the page that has a .jpg source and an alt text.

    Args:
        date: The decoded HTML of the page.

    Returns:
        A list of 3-tuples ``(src_attribute, skipped_text, alt_part)``, one
        per image, in page order (see ``_IMG_PATTERN`` for the groups).
    """
    return _IMG_PATTERN.findall(date)


def downLoad(jpgurl, stitle, n):
    """Download one picture and save it as ``<stitle>.jpg``.

    A failed download is reported on stdout and does not stop the caller,
    so one bad image does not abort the whole run.

    Args:
        jpgurl: URL of the picture.
        stitle: Title used as the file name (without extension).
        n: Running number of the picture, used only in the log message.
    """
    try:
        urllib.request.urlretrieve(jpgurl, _SAVE_PATH % stitle)
    except Exception as e:  # deliberate best-effort: log and carry on
        print(e)
    finally:
        print('picture%s download operation completed' % n)


def getTitle(date):
    """Return every ``title="?">`` attribute found in the page.

    NOTE(review): the pattern is kept exactly as published; it only matches
    title attributes whose value is a single character. The main script
    takes its titles from the alt text instead - confirm the intent before
    relying on this function.

    Args:
        date: The decoded HTML of the page.

    Returns:
        A list of the matched substrings.
    """
    return re.findall(r'title=".">', date)


def _url_and_title(jpginfo):
    """Split one ``getjpg`` result tuple into ``(url, title)``."""
    src_attribute, _, alt_part = jpginfo
    # Strip the leading 'img src="' and the closing quote.
    url = src_attribute[len('img src="'):-1]
    # Always matches: group 3 of _IMG_PATTERN ends with this same sub-pattern.
    title = _ALT_PATTERN.search(alt_part).group(1)
    return url, title


def main():
    """Fetch the Douban Books front page and download its cover pictures."""
    url = 'https://book.douban.com/'
    with urllib.request.urlopen(url) as res:
        date = res.read().decode('utf-8')
    for n, jpginfo in enumerate(getjpg(date), start=1):
        jpgurl, stitle = _url_and_title(jpginfo)
        print(n, '---url--', jpgurl)
        downLoad(jpgurl, stitle, n)


if __name__ == '__main__':
    main()
Revised version: I made some changes and now also write each title to a TXT file.
import urllib.request
import re  # regular expressions

URL = 'https://book.douban.com/'

# Title list file, named after the host: URL[8:-5] drops the leading
# 'https://' and the trailing '.com/', giving 'book.douban.txt'.
_TITLES_FILE = URL[8:-5] + '.txt'

# Destination template; %s is replaced by the picture title.
_SAVE_PATH = ('C:/users/74172/source/repos/python/spidertest1'
              '/images/book.douban/%s.jpg')

# One match per cover image. "url" is the .jpg address inside src="...";
# "title" is the alt value INCLUDING its surrounding double quotes. The
# middle part skips (possibly across lines) to the line holding the alt
# attribute.
_IMG_PATTERN = re.compile(
    r'img src="(?P<url>http.+?\.jpg)"[\s\S]*?.+?.alt=(?P<title>".+?.")'
)


def getjpg(html):
    """Return the .jpg URL of every image that also carries an alt text.

    Args:
        html: The decoded HTML of the page.

    Returns:
        A list of URL strings in page order; entry ``i`` belongs to entry
        ``i`` of ``getTitle(html)``.
    """
    return [match.group('url') for match in _IMG_PATTERN.finditer(html)]


def downLoad(jpgurl, stitle, n):
    """Download one picture and save it as ``<stitle>.jpg``.

    The completion message is printed whether or not the download worked;
    a download error is not caught here and propagates to the caller.

    Args:
        jpgurl: URL of the picture.
        stitle: Title used as the file name (without extension).
        n: Running number of the picture (kept for the callers; unused).
    """
    try:
        urllib.request.urlretrieve(jpgurl, _SAVE_PATH % stitle)
    finally:
        print('picture---%s----download operation completed' % stitle)


def getTitle(html):
    """Return the alt text of every image that ``getjpg`` reports.

    Args:
        html: The decoded HTML of the page.

    Returns:
        A list of titles in page order, each still wrapped in its double
        quotes (callers strip them with ``[1:-1]``).
    """
    return [match.group('title') for match in _IMG_PATTERN.finditer(html)]


def writetxt(imagetitle, path=_TITLES_FILE):
    """Append one title as a line to the title list file.

    Args:
        imagetitle: The title to record.
        path: File to append to; defaults to ``book.douban.txt`` in the
            current directory.
    """
    # "with" closes the file on every path and cannot trip over an unbound
    # handle when open() itself fails.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(imagetitle + '\n')


def main():
    """Download every cover picture of the page and record its title."""
    with urllib.request.urlopen(URL) as res:
        html = res.read().decode('utf-8')
    urljpgs = getjpg(html)
    imagetitle = getTitle(html)
    for n, (urljpg, quoted_title) in enumerate(zip(urljpgs, imagetitle)):
        stitle = quoted_title[1:-1]  # drop the surrounding double quotes
        print(n, '---url--', urljpg)
        downLoad(urljpg, stitle, n)
        writetxt(stitle)


if __name__ == '__main__':
    main()
Python crawler, part 3: downloading web images