Let's get straight to the code. First we crawl images from Douban. The general idea is: send a request, receive the response data, then store that data. For the underlying principles, see this post first:
https://www.cnblogs.com/sss4/p/7809821.html
import os  # used to create the download folder

import requests  # used to send requests and receive responses
from bs4 import BeautifulSoup  # used to parse the response HTML


def gethtmltext(url):
    """Fetch *url* and return its body as UTF-8 text.

    Returns an empty string on any request failure (connection error,
    timeout, or non-2xx status) so the caller can simply skip the page.
    """
    try:
        r = requests.get(url, timeout=10)  # send the request
        r.raise_for_status()               # raise if the status code signals failure
        r.encoding = 'utf-8'               # force the response encoding
        return r.text
    except requests.RequestException:      # narrow catch: only network/HTTP errors
        return ''


def main(pages):
    """Download celebrity photos from Douban.

    pages: number of listing pages to crawl (30 images per page).
    Images are saved as sequentially numbered .jpg files in a
    subfolder of the current working directory.
    """
    filepath = os.path.join(os.getcwd(), 'douban_images')
    if not os.path.exists(filepath):  # create the folder only if it is missing
        os.makedirs(filepath)
    fnum = 1  # running file number across all pages
    for page in range(pages):
        # Each listing page shows 30 photos; `start` is the photo offset.
        url = ("https://movie.douban.com/celebrity/1048000/photos/?type=C&start="
               + str(page * 30) + '&sortby=like&size=a&subtype=a')
        html = gethtmltext(url)
        soup = BeautifulSoup(html, 'html.parser')  # html.parser is the parser
        # The photo grid lives in <ul class="poster-col3 clearfix">.
        uls = soup.find_all('ul', class_="poster-col3 clearfix")
        for ul in uls:
            for img in ul.find_all('img'):
                imgurl = img['src']  # direct URL of the image
                # .content is the raw binary body of the image response
                imgcontent = requests.get(imgurl, timeout=10).content
                filename = str(fnum) + '.jpg'
                with open(os.path.join(filepath, filename), 'wb') as wf:
                    wf.write(imgcontent)  # write the bytes out verbatim
                fnum += 1


if __name__ == '__main__':
    main(9)
Next, another crawler — this one scrapes article titles (from Jianshu).
import requests
from bs4 import BeautifulSoup

# Scrape article titles and links from the Jianshu front page and save
# them to a text file.

url = "http://www.jianshu.com"
# Set a User-Agent header so the request looks like it comes from a
# browser; otherwise the site blocks the crawler.
headers = {'user-agent': 'SE 2.X METASR 1.0'}
page = requests.get(url=url, headers=headers)
page_info = page.text
page_bf = BeautifulSoup(page_info, 'html.parser')
# print(page_bf.prettify())

# Article titles are <a class="title"> anchors.
titles = page_bf.find_all('a', 'title')
for title in titles:
    print(title.string)
    # hrefs are site-relative, so prepend the host to get a full URL
    print('http://www.jianshu.com' + title.get('href'))

# Persist the same title/URL pairs to disk, one per line.
with open(r"D:\untitled\titles.txt", "w", encoding='utf-8') as file:
    for title in titles:
        file.write(title.string + '\n')
        file.write("http://www.jianshu.com" + title.get('href') + '\n')
This last one downloads a novel (someone else's code).
import sys

import requests
from bs4 import BeautifulSoup


class Downloader(object):
    """Download a novel chapter-by-chapter from biqukan.com."""

    def __init__(self):
        self.server = "http://www.biqukan.com/"       # host, prepended to relative chapter links
        self.target = "http://www.biqukan.com/1_1094"  # table-of-contents page for this novel
        self.name = []   # chapter titles, filled by get_download_url()
        self.urls = []   # absolute chapter URLs, parallel to self.name
        self.nums = 0    # number of chapters

    def get_download_url(self):
        """Fetch the table of contents and populate name/urls/nums."""
        req = requests.get(url=self.target)
        html = req.text
        # Specify the parser explicitly; BeautifulSoup(html) alone emits a
        # warning and may pick a different parser per machine.
        div_bf = BeautifulSoup(html, 'html.parser')
        div = div_bf.find_all('div', class_='listmain')
        a_bf = BeautifulSoup(str(div[0]), 'html.parser')
        a = a_bf.find_all('a')
        # The first 15 anchors are "latest chapters" duplicates; skip them.
        self.nums = len(a[15:])
        for each in a[15:]:
            self.name.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_contents(self, target):
        """Fetch one chapter page and return its plain text."""
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('div', class_='showtxt')
        # The site indents paragraphs with 8 non-breaking spaces; turn
        # each run into a newline.
        texts = texts[0].text.replace('\xa0' * 8, '\n')
        return texts

    def writer(self, name, path, text):
        """Append one chapter (title + body) to the file at *path*."""
        with open(path, "a", encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n')


if __name__ == '__main__':
    dl = Downloader()
    dl.get_download_url()
    print("Start download")
    for i in range(dl.nums):
        dl.writer(dl.name[i], 'yinian_yongheng.txt', dl.get_contents(dl.urls[i]))
        # i/nums is a fraction; multiply by 100 to print a real percentage.
        sys.stdout.write("downloaded: %.3f%%" % float(i / dl.nums * 100) + '\r')
        sys.stdout.flush()
    print('Download complete')
Python 3 crawler quick start: crawling pictures and titles.