The code is as follows:
import re
import time
import urllib.request
def movie(movietag):
    """Download one listing page and return its body as text.

    Args:
        movietag: URL of the page to fetch. The name is historical: the
            caller in this script passes the complete listing URL, not a
            bare tag.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Fetch the URL that was actually passed in instead of reaching for a
    # module-level variable, and close the response deterministically.
    with urllib.request.urlopen(movietag) as response:
        return response.read().decode('utf-8')
# Regular expressions matching the movie name (with its link), the rating and
# the review count. Compiled once here rather than on every call.
#
# NOTE(review): these patterns were reconstructed from a damaged copy of the
# script. The opening "<span ...>" in front of the rating and the Chinese
# "人评价" suffix ("N people rated") are inferred from the surviving
# fragments -- confirm against the live page markup.
_NAME_RE = re.compile(
    r'(https?://movie\.douban\.com/subject/[0-9.]+)/"\s+title="([^"]+)"'
)
_SCORE_RE = re.compile(r'<span[^>]*>([0-9.]+)</span>')
_VOTES_RE = re.compile(r'\((\w+)人评价\)</span>')


def subject(tagurl_read, results=None):
    """Extract movie entries from one listing page and accumulate them.

    Args:
        tagurl_read: HTML text of a listing page.
        results: List to extend with the entries found. When omitted, the
            module-level ``newlist`` is used, which is how the script's main
            loop collects entries across pages.

    Returns:
        The accumulator list (the same object that was extended). Each entry
        is ``((link, title), score, votes)`` with every field kept as the
        string captured from the page.

    Author's outstanding notes (carried over; some may be stale):
        1. Sorting was done per page rather than across all pages.
        2. Next update: add the movie link; consider adding the poster.
        3. Results need to be appended to a list.
        4. Export to a local txt or Excel file.
        5. Match name, rating and review count into a single array.
    """
    if results is None:
        results = newlist

    names = _NAME_RE.findall(tagurl_read)
    scores = _SCORE_RE.findall(tagurl_read)
    votes = _VOTES_RE.findall(tagurl_read)

    # The three lists are paired purely by position; zip() stops at the
    # shortest one. NOTE(review): if a listed movie lacks a rating or a
    # review count, later entries would pair up with the wrong values.
    results.extend(zip(names, scores, votes))
    return results
# Script body: ask for a genre and a page count, scrape that many listing
# pages, then print every collected entry ordered by rating.


def _rating(entry):
    """Sort key: the numeric rating of one ``(name, score, votes)`` entry."""
    try:
        return float(entry[1])
    except ValueError:
        # A capture such as "." is not a number; sort such entries last.
        return float('-inf')


# quote() percent-encodes special (e.g. Chinese) characters for use in the URL.
movie_type = urllib.request.quote(
    input('Please enter a movie type (such as plot, comedy, suspense): ')
)
page_end = int(input('Please enter page number at end of search: '))

# Shared accumulator: subject() extends this list with each page's entries.
newlist = []

# Pages are addressed by a ``start`` offset that advances 20 per page, so
# page_end pages cover the offsets 0, 20, ..., (page_end - 1) * 20.
for page_index in range(page_end):
    num = page_index * 20
    url = r'http://movie.douban.com/tag/%s?start=%d' % (movie_type, num)
    movie_url = movie(url)
    subject(movie_url)

# sorted() orders ascending by default; reverse=True gives descending order,
# i.e. highest rating first. The key converts the captured score text to a
# number so that "10.0" does not sort below "9.5".
movielist = sorted(newlist, key=_rating, reverse=True)

# The loop variable is deliberately not called ``movie``: that would rebind
# the movie() function defined above.
for entry in movielist:
    print(entry)

# NOTE(review): the purpose of this pause is not evident from the code --
# presumably it keeps a console window visible before exit; confirm.
time.sleep(3)
print('End')