# The code is as follows:
import re
import time
import urllib.parse
import urllib.request
def movie(Movietag):
    """Download a page and return its body as text.

    Args:
        Movietag: The URL to fetch. Despite the name, the caller passes the
            full tag-listing URL, not just the tag.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Fetch the URL that was passed in (not a module-level global) and use a
    # context manager so the connection is closed even if decoding fails.
    with urllib.request.urlopen(Movietag) as response:
        return response.read().decode('utf-8')
def subject(Tagurl_read):
    """Scrape movie entries from one tag-listing page and accumulate them.

    Three independent regex scans collect, in document order:
      * ``(link, title)`` pairs from the movie anchor tags,
      * rating strings,
      * vote-count strings.
    The three lists are zipped into ``((link, title), rating, votes)``
    tuples, appended to the module-level ``newlist``, and ``newlist`` is
    returned.

    Args:
        Tagurl_read: HTML text of a tag-listing page.

    Returns:
        The module-level ``newlist`` (the same list object on every call),
        now including this page's entries.

    Author's open TODO notes, carried over from the original docstring
    (some may be stale):
      1. Sorting was noted as covering a single page only, not all pages.
      2. Add the movie link; consider adding the movie poster.
      3. Results need to be appended to a list.
      4. Export to a local txt or Excel file.
      5. Match link, name, rating and vote count together in one pass.
    """
    # NOTE(review): because the three scans are independent, an entry that
    # lacks a rating or a vote count shifts every later pairing on the page;
    # zip() also silently truncates to the shortest list.

    # The title is captured with [^"]+ rather than a greedy .+ so attributes
    # that follow title="..." on the same line are not swallowed.
    name_matches = re.findall(
        r'(https?://movie\.douban\.com/subject/[0-9.]+)/"\s+title="([^"]+)"',
        Tagurl_read,
    )
    # NOTE(review): the class names ("rating_nums", "pl") and the vote label
    # "人评价" were restored from a case-mangled, machine-translated copy of
    # this script -- confirm against the live page markup.
    score_matches = re.findall(
        r'<span\s+class="rating_nums">([0-9.]+)</span>', Tagurl_read
    )
    vote_matches = re.findall(
        r'<span\s+class="pl">\((\w+)人评价\)</span>', Tagurl_read
    )

    movielists = list(zip(name_matches, score_matches, vote_matches))
    newlist.extend(movielists)
    return newlist
# Percent-encode the tag so non-ASCII (e.g. Chinese) genre names are safe to
# embed in the request URL.
movie_type = urllib.parse.quote(
    input('Please enter movie type (such as plot, comedy, suspense): ')
)
page_end = int(input('Please enter the page number at the end of the search: '))

# Accumulator that subject() extends with each page's entries.
newlist = []

# The ?start= offset advances by 20 per page: 0, 20, 40, ...
for page_index in range(page_end):
    url = 'http://movie.douban.com/tag/%s?start=%d' % (movie_type, page_index * 20)
    subject(movie(url))

# Each entry is ((link, title), rating, votes). Compare ratings as numbers,
# not strings, so that "10.0" ranks above "9.5"; reverse=True puts the
# highest rating first (descending order).
movielist = sorted(newlist, key=lambda entry: float(entry[1]), reverse=True)
for entry in movielist:
    print(entry)

# NOTE(review): the original indentation was lost; the pause is assumed to run
# once after all results are printed rather than once per movie.
time.sleep(3)
print('End')