This example shows how to capture data that a web page loads dynamically through JavaScript. For more information, see the Python Learning Guide.
# -*- coding: utf-8 -*-
"""Crawl movie rating and box-office data that Mtime loads via JavaScript.

The theater page only contains links to movie pages; the rating and
box-office figures are fetched by the browser from a separate Ajax
endpoint that answers with a JavaScript assignment (``var x = {...};``).
This script downloads the theater page, extracts the movie links, calls
the Ajax endpoint for each movie and parses the JSON embedded in the
answer.

NOTE(review): this listing was reconstructed from a whitespace- and
case-mangled copy. The JSON keys and Ajax parameter names below use the
casing the service is believed to expect -- confirm against a live
response before relying on them.
"""
import json
import re
import time


# Data downloader
class HtmlDownloader(object):
    """Fetch pages over HTTP with a browser-like User-Agent."""

    USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) '
                  'Gecko/20100101 Firefox/48.0')

    def download(self, url, params=None, timeout=10):
        """Return the body of ``url`` decoded as UTF-8, or None on failure.

        :param url: address to fetch; ``None`` yields ``None``.
        :param params: optional dict sent as the query string.
        :param timeout: seconds to wait before giving up on the request.
        :return: the response text for HTTP 200, otherwise ``None``
            (also on network errors).
        """
        if url is None:
            return None
        # Imported lazily so the parser classes stay usable (and testable)
        # on machines where the third-party ``requests`` package is missing.
        import requests

        headers = {'User-Agent': self.USER_AGENT}
        try:
            # ``params=None`` is the same as not passing params at all.
            r = requests.get(url, headers=headers, params=params,
                             timeout=timeout)
        except requests.RequestException as exc:
            print('Download failed: %s (%s)' % (url, exc))
            return None
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None


# HTML / JSON parser
class HtmlParser(object):
    """Extract movie links and per-movie rating records."""

    _MOVIE_URL = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')

    def parser_url(self, page_url, response):
        """Collect all movie links found in the theater page.

        :param page_url: address of the page (unused, kept for symmetry).
        :param response: HTML text of the page, may be ``None``.
        :return: list of unique ``(movie_url, movie_id)`` tuples in order
            of first appearance; empty when ``response`` is empty.
        """
        if not response:
            return []
        seen = set()
        urls = []
        # De-duplicate while keeping a deterministic order.
        for match in self._MOVIE_URL.findall(response):
            if match not in seen:
                seen.add(match)
                urls.append(match)
        return urls

    def __rating_fields(self, info):
        """Return the nine rating fields shared by every record type."""
        rating = info.get('movieRating')
        return (
            rating.get('MovieId'),
            info.get('movieTitle'),
            rating.get('RatingFinal'),
            rating.get('ROtherFinal'),
            rating.get('RPictureFinal'),
            rating.get('RDirectorFinal'),
            rating.get('RStoryFinal'),
            rating.get('Usercount'),
            rating.get('AttitudeCount'),
        )

    # Parse a movie that is currently showing
    def __parser_release(self, page_url, value):
        """Build the record for a movie that is already in theaters.

        :param page_url: request address, only used in error output.
        :param value: decoded JSON object returned by the service.
        :return: 14-tuple ending in total box office, today's box office,
            rank (0 when absent), days shown and ``isrelease == 1``;
            ``None`` when an expected section is missing.
        """
        try:
            info = value.get('value')
            box_office = info.get('boxOffice')
            total = (box_office.get('TotalBoxOffice')
                     + box_office.get('TotalBoxOfficeUnit'))
            today = (box_office.get('TodayBoxOffice')
                     + box_office.get('TodayBoxOfficeUnit'))
            # dict.get never raises, so the default must be given here.
            rank = box_office.get('Rank', 0)
            show_days = box_office.get('ShowDays')
            return self.__rating_fields(info) + (
                total, today, rank, show_days, 1)
        except (AttributeError, TypeError) as exc:
            print('%s %s %s' % (exc, page_url, value))
            return None

    # Parse a movie that has not been released yet
    def __parser_no_release(self, page_url, value, isrelease=0):
        """Build the record for a movie that is not in theaters yet.

        :param page_url: request address, only used in error output.
        :param value: decoded JSON object returned by the service.
        :param isrelease: release state stored in the record
            (0 = far from release, 2 = upcoming).
        :return: 14-tuple with ``u'None'`` for both box-office fields and
            0 days shown; ``None`` when an expected section is missing.
        """
        try:
            info = value.get('value')
            # ``hotValue`` is absent for movies far from release.
            hot_value = info.get('hotValue') or {}
            rank = hot_value.get('Ranking', 0)
            return self.__rating_fields(info) + (
                u'None', u'None', rank, 0, isrelease)
        except (AttributeError, TypeError) as exc:
            print('%s %s %s' % (exc, page_url, value))
            return None

    # Parse the JSON embedded in the Ajax answer
    def parser_json(self, page_url, response):
        """Parse the service answer into a movie record.

        The answer is a JavaScript statement; the JSON document is
        whatever follows the first ``=``.

        :param page_url: request address, only used in error output.
        :param response: raw text of the answer, may be ``None``.
        :return: the 14-tuple built by the release / no-release parser,
            or ``None`` when no usable JSON is found.
        """
        if not response:
            return None
        start = response.find('=')
        if start == -1:
            return None
        payload = response[start + 1:].lstrip()
        try:
            # raw_decode stops at the end of the first JSON document, so
            # a ';' inside a string value cannot truncate the payload.
            value, _ = json.JSONDecoder().raw_decode(payload)
        except ValueError as exc:
            print('%s %s' % (exc, page_url))
            return None
        info = value.get('value') if isinstance(value, dict) else None
        if not isinstance(info, dict):
            return None
        # isRelease: 0 = far from release, 1 = showing, 2 = upcoming
        if info.get('isRelease'):
            if info.get('hotValue') is None:
                return self.__parser_release(page_url, value)
            return self.__parser_no_release(page_url, value, isrelease=2)
        return self.__parser_no_release(page_url, value)


# Data store: meant to insert the records into MySQL; to be added later.
class SpiderMain(object):
    """Tie the downloader and parser together."""

    RANK_URL = 'http://service.library.mtime.com/Movie.api?'

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        # No data store is implemented in this example; assign an object
        # with an ``output_end()`` method here to have it finalized.
        self.output = None

    def crawl(self, root_url):
        """Crawl every movie linked from ``root_url``.

        :param root_url: theater page listing the movies.
        :return: list of the records that could be parsed.
        """
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        results = []
        # Build the rating / box-office request for every movie link.
        for movie_url, movie_id in urls:
            try:
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                param = {
                    'Ajax_CallBack': 'true',
                    'Ajax_CallBackType': 'Mtime.Library.Services',
                    'Ajax_CallBackMethod': 'GetMovieOverviewRating',
                    'Ajax_CallBackArgument0': '%s' % movie_id,
                    'Ajax_RequestUrl': '%s' % movie_url,
                    'Ajax_CrossDomain': '1',
                    't': '%s' % t,
                }
                rank_content = self.downloader.download(self.RANK_URL, param)
                data = self.parser.parser_json(self.RANK_URL, rank_content)
                if data is not None:
                    results.append(data)
            except Exception as exc:
                # Best effort: one broken movie must not stop the crawl.
                print('Crawl failed: %s (%s)' % (movie_url, exc))
        if self.output is not None:
            self.output.output_end()
        return results


if __name__ == '__main__':
    spider = SpiderMain()
    spider.crawl('http://theater.mtime.com/China_Jiangsu_Province_Nanjing/')
Reference:
Crawling movie reviews from Mtime ("Time Network")
Python Crawler (20): Dynamically crawling movie review information