"""Crawl image URLs from duitang.com search results and download them concurrently.

Reconstructed from a garbled one-line paste (the original did not parse).
Flow: fetch search-API pages -> scrape ``"path":"<url>"`` values -> spawn
one download thread per picture, throttled by a bounded semaphore.
"""
import json  # kept from the original import line (currently unused here)
import threading
import urllib.parse

import requests

# At most 10 download threads may run at once: main() acquires before each
# spawn, download_pics() releases when its download finishes.
threading_lock = threading.BoundedSemaphore(value=10)


def get_page(url):
    """Fetch *url* and return the response body decoded as UTF-8 text."""
    response = requests.get(url)
    return response.content.decode('utf-8')


def pages_from_duitang(label):
    """Return the raw JSON text of every search-result page for *label*.

    Walks the duitang search API in steps of 100 results, offsets 0..2900.
    """
    pages = []
    url = ('https://www.duitang.com/napi/blog/list/by_search/'
           '?kw={}&type=feed&start={}&limit=100')
    # URL-encode the keyword (it may contain non-ASCII characters).
    label = urllib.parse.quote(label)
    for index in range(0, 3000, 100):
        page_url = url.format(label, index)
        print(page_url)
        pages.append(get_page(page_url))
    return pages


def findall_page(page, start_pat, end_pat):
    """Return every substring of *page* between *start_pat* and *end_pat*.

    Scans left to right; each search resumes after the previous match, so
    occurrences are returned in document order without overlap.
    """
    found = []
    end = 0
    while page.find(start_pat, end) != -1:
        start = page.find(start_pat, end) + len(start_pat)
        end = page.find(end_pat, start)
        found.append(page[start:end])
    return found


def pic_urls_from_pages(pages):
    """Extract all picture URLs (JSON ``"path"`` values) from *pages*."""
    pic_urls = []
    for page in pages:
        # The API response is JSON; image links appear as "path":"<url>".
        pic_urls.extend(findall_page(page, '"path":"', '"'))
    return pic_urls


def download_pics(url, name):
    """Download one picture to disk and release the thread semaphore.

    Runs in a worker thread started by main().
    """
    r = requests.get(url)
    # NOTE(review): hard-coded Windows path from the original script (its
    # folder name was a garbled translation) — adjust before running.
    path = r'C:\Users\Administrator\Desktop\pics' + '\\' + str(name) + '.jpg'
    try:
        with open(path, 'wb') as f:
            f.write(r.content)
    finally:
        # Release even if the write fails; otherwise a single error would
        # permanently consume a semaphore slot and eventually deadlock main().
        threading_lock.release()


def main(label):
    """Crawl duitang for *label* and download every picture found."""
    pages = pages_from_duitang(label)
    pic_urls = pic_urls_from_pages(pages)
    for name, url in enumerate(pic_urls, start=1):
        threading_lock.acquire()  # blocks while 10 downloads are in flight
        print('downloading the {} picture'.format(name))
        t = threading.Thread(target=download_pics, args=(url, name))
        t.start()


if __name__ == '__main__':
    main('Emoticons Pack')
Python script: crawl and download images from duitang.com (machine-translated title of the original post).