Python Crawler (20): Dynamically Crawling Movie Review Information

Source: Internet
Author: User

This example shows how to capture data that a page loads dynamically through JavaScript. For more information, see the Python Learning Guide.

# -*- coding: utf-8 -*-
"""Crawl Mtime theater pages and extract rating / box-office data.

The rating and box-office figures are not part of the static HTML; the page
loads them through a JavaScript (JSONP-style) service call.  This script
collects the movie links from a theater listing page, calls that service for
each movie, and parses the JSON embedded in the JavaScript response.

NOTE(review): the published listing of this script had its whitespace and
letter case mangled.  The mixed-case JSON keys and Ajax parameter names below
were restored on a best-effort basis -- verify them against a live response.
"""
import json
import re
import time


class HtmlDownloader(object):
    """Data downloader: fetches a URL and returns the response body as text."""

    USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) '
                  'Gecko/20100101 Firefox/48.0')

    def download(self, url, params=None, timeout=10):
        """Download ``url`` and return its body decoded as UTF-8.

        :param url: address to fetch; ``None`` short-circuits to ``None``.
        :param params: optional dict of query-string parameters.
        :param timeout: seconds to wait before giving up on the server.
        :return: the response text on HTTP 200, otherwise ``None``.
        :raises requests.RequestException: on network-level failures.
        """
        if url is None:
            return None
        # Imported lazily so the parsing classes stay importable (and
        # testable) on machines without the third-party dependency.
        import requests

        headers = {'User-Agent': self.USER_AGENT}
        # Without a timeout a stalled server would hang the crawl forever.
        r = requests.get(url, headers=headers, params=params, timeout=timeout)
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None


class HtmlParser(object):
    """HTML parser: extracts movie links and decodes the rating JSON."""

    # Captures (full movie URL, numeric movie id).
    _MOVIE_URL = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')
    # The service answers with JavaScript of the form "var x = {...};..."; the
    # JSON payload is whatever sits between the first "=" and the next ";".
    # This assumes the payload itself contains no ";" and no newline.
    _JSON_PAYLOAD = re.compile(r'=(.*?);')

    def parser_url(self, page_url, response):
        """Return every distinct movie link found in a theater listing page.

        :param page_url: URL of the listing page (unused, kept for callers).
        :param response: HTML text of the listing page; may be ``None``.
        :return: list of ``(movie_url, movie_id)`` tuples, de-duplicated in
            first-seen order; empty when ``response`` is empty or ``None``.
        """
        if not response:
            return []
        seen = set()
        urls = []
        for match in self._MOVIE_URL.findall(response):
            # De-duplicate while keeping page order so crawls are repeatable.
            if match not in seen:
                seen.add(match)
                urls.append(match)
        return urls

    @staticmethod
    def _rating_fields(value):
        """Return the nine rating fields shared by every movie record."""
        detail = value.get('value')
        rating = detail.get('movieRating')
        return (
            rating.get('MovieId'),
            detail.get('movieTitle'),
            rating.get('RatingFinal'),
            rating.get('ROtherFinal'),
            rating.get('RPictureFinal'),
            rating.get('RDirectorFinal'),
            rating.get('RStoryFinal'),
            rating.get('Usercount'),
            rating.get('AttitudeCount'),
        )

    def __parser_release(self, page_url, value):
        """Parse a movie that is currently showing.

        :param page_url: service URL, used only in the error message.
        :param value: decoded JSON data.
        :return: 14-field tuple ending in ``(total box office, today's box
            office, rank, days showing, 1)``, or ``None`` if a required
            section is missing.
        """
        try:
            box_office = value.get('value').get('boxOffice')
            total = (box_office.get('TotalBoxOffice')
                     + box_office.get('TotalBoxOfficeUnit'))
            today = (box_office.get('TodayBoxOffice')
                     + box_office.get('TodayBoxOfficeUnit'))
            return self._rating_fields(value) + (
                total,
                today,
                box_office.get('Rank', 0),
                box_office.get('ShowDays'),
                1,
            )
        except (AttributeError, TypeError) as e:
            # A missing section yields None, which has no .get() and cannot
            # be concatenated; report the record and skip it.
            print('%s %s %s' % (e, page_url, value))
            return None

    def __parser_no_release(self, page_url, value, isrelease=0):
        """Parse a movie that is not showing yet.

        :param page_url: service URL, used only in the error message.
        :param value: decoded JSON data.
        :param isrelease: 0 for a movie still far from release, 2 for an
            upcoming one.
        :return: 14-field tuple with ``u'None'`` for both box-office fields
            and 0 for days showing, or ``None`` if the rating section is
            missing.
        """
        try:
            fields = self._rating_fields(value)
            try:
                rank = value.get('value').get('hotValue').get('Ranking')
            except AttributeError:
                # No "hotValue" section at all: fall back to rank 0.
                rank = 0
            return fields + (u'None', u'None', rank, 0, isrelease)
        except (AttributeError, TypeError) as e:
            print('%s %s %s' % (e, page_url, value))
            return None

    def parser_json(self, page_url, response):
        """Decode the JSON embedded in the rating service's response.

        isRelease meanings: 0 = will not be shown for a long time,
        1 = currently showing, 2 = upcoming.

        :param page_url: service URL, used only in error messages.
        :param response: JavaScript text returned by the service; may be
            ``None``.
        :return: a 14-field tuple describing the movie, or ``None`` when the
            response is empty, holds no payload, or cannot be decoded.
        """
        if not response:
            return None
        payloads = self._JSON_PAYLOAD.findall(response)
        if not payloads:
            return None
        try:
            value = json.loads(payloads[0])
            is_release = value.get('value').get('isRelease')
        except (ValueError, AttributeError) as e:
            print(e)
            return None
        if not is_release:
            # Movie that is still a long way from release.
            return self.__parser_no_release(page_url, value)
        if value.get('value').get('hotValue') is None:
            # Movie currently showing.
            return self.__parser_release(page_url, value)
        # Upcoming movie.
        return self.__parser_no_release(page_url, value, isrelease=2)


# Data storage: the storage component would insert the returned data into a
# MySQL database (creating the table, inserting rows, and closing the
# connection), with the table using 15 fields to hold the movie information.
# To be added later.


class SpiderMain(object):
    """Crawl scheduler: wires the downloader and the parser together."""

    RATING_API = 'http://service.library.mtime.com/Movie.api'

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self, root_url):
        """Crawl a theater listing page and collect data for each movie.

        :param root_url: URL of the theater listing page.
        :return: list of the tuples produced by ``HtmlParser.parser_json``;
            movies whose data could not be fetched or parsed are skipped.
        """
        content = self.downloader.download(root_url)
        if content is None:
            print("Crawl failed")
            return []
        results = []
        for url in self.parser.parser_url(root_url, content):
            try:
                # Build the rating / box-office service request.  "t" is a
                # timestamp followed by the fixed suffix the site appends.
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                param = {
                    'Ajax_CallBack': 'true',
                    'Ajax_CallBackType': 'Mtime.Library.Services',
                    'Ajax_CallBackMethod': 'GetMovieOverviewRating',
                    'Ajax_CallBackArgument0': '%s' % (url[1]),
                    'Ajax_RequestUrl': '%s' % (url[0]),
                    'Ajax_CrossDomain': '1',
                    't': '%s' % t,
                }
                rank_content = self.downloader.download(self.RATING_API, param)
                data = self.parser.parser_json(self.RATING_API, rank_content)
            except Exception as e:
                # Per-movie boundary: one failing request must not abort the
                # whole crawl.
                print("Crawl failed: %s (%s)" % (url[0], e))
                continue
            if data is not None:
                results.append(data)
        return results


if __name__ == '__main__':
    spider = SpiderMain()
    for row in spider.crawl(
            'http://theater.mtime.com/China_Jiangsu_Province_Nanjing/'):
        print(row)
Reference:

Crawling film reviews from Mtime ("Time Network")

Python Crawler (20): Dynamically Crawling Movie Review Information

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please send us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.