This article uses BeautifulSoup to crawl the Top 250 movie information from Douban. The software environment is anaconda3 + python3.5.
The complete code is as follows:
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 26 21:37:18 2017

Crawl the Douban ("watercress") Top 250 movie pages and record each
film's serial number, title(s), score, rating count and short review
in a UTF-8 text file for later analysis.

@author: Zch
"""
import codecs
import re
import urllib.request

# Third-party dependency of the original script (pip install beautifulsoup4).
from bs4 import BeautifulSoup


def crawl(url):
    """Fetch one Top-250 listing page and write every movie's info.

    NOTE(review): writes through the module-level ``infofile`` handle
    opened in the ``__main__`` block, so this function can only run
    after that handle exists.  ``url`` is one paginated listing URL.
    """
    page = urllib.request.urlopen(url)
    contents = page.read()
    soup = BeautifulSoup(contents, "html.parser")
    print(u'watercress movie 250: serial number\tmovie name\tscore\trated number')
    infofile.write(u"watercress movie 250: serial number\tmovie name\tscore\trated number\r\n")
    print(u'crawl info as follows:\n')
    # Each movie entry on the page is a div with class "item".
    for tag in soup.find_all(attrs={'class': 'item'}):
        # Serial number (1-250).
        num = tag.find('em').get_text()
        print(num)
        # Movie name: the first <a> inside the "hd" block.
        name = tag.find(attrs={"class": "hd"}).a.get_text()
        name = name.replace('\n', ' ')
        print(name)
        infofile.write(num + " " + name + "\r\n")
        # Titles: first "title" span is Chinese, second (if any) is English.
        title = tag.find_all(attrs={"class": "title"})
        for i, n in enumerate(title):
            text = n.get_text()
            text = text.replace('/', '')
            text = text.lstrip()
            if i == 0:
                print(u'[Chinese title]', text)
                infofile.write(u"[Chinese title] " + text + "\r\n")
            elif i == 1:
                print(u'[English title]', text)
                infofile.write(u"[English title] " + text + "\r\n")
        # Score and number of ratings live together in the "star" block;
        # pull the numbers out with a regex (first match = score,
        # second match = rating count).
        info = tag.find(attrs={"class": "star"}).get_text()
        info = info.replace('\n', ' ')
        info = info.lstrip()
        print(info)
        mode = re.compile(r'\d+\.?\d*')
        print(mode.findall(info))
        for i, n in enumerate(mode.findall(info)):
            if i == 0:
                print(u'[score]', n)
                infofile.write(u"[score] " + n + "\r\n")
            elif i == 1:
                print(u'[comments]', n)
                infofile.write(u"[comments] " + n + "\r\n")
        # Short review ("inq"); movie #132 (Lost Lovers) has none,
        # hence the existence check before get_text().
        info = tag.find(attrs={"class": "inq"})
        if info:
            content = info.get_text()
            print(u'[film review]', content)
            infofile.write(u"[film review] " + content + "\r\n")
        print('')


if __name__ == '__main__':
    # Append mode so repeated runs accumulate into the same file.
    infofile = codecs.open("Result_douban.txt", 'a', 'utf-8')
    i = 0
    while i < 10:
        print(u'page number', (i + 1))
        num = i * 25  # each page lists 25 movies, so the offset grows by 25
        url = 'https://movie.douban.com/top250?start=' + str(num) + '&filter='
        crawl(url)
        infofile.write("\r\n\r\n\r\n")
        i = i + 1
    infofile.close()
The crawled Douban movie information is stored in a text file for subsequent analysis, as shown in the following illustration: