# -*- coding: utf-8 -*-
import urllib2
import urllib
import re
import time
import datetime

import MySQLdb
# from datetime import date


# ----------- App Store leaderboard -----------

# Format of the release date scraped from an app's detail page.
# NOTE(review): reconstructed. The copy this file was recovered from showed a
# machine-translated literal ('%y years%m months%d days'), which cannot match
# the page text. Confirm against a live detail page.
_RELEASE_DATE_FORMAT = '%Y年%m月%d日'

# Browser-like headers. The site rejects requests that look like a crawler,
# so the request imitates a desktop Chrome browser.
_REQUEST_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
                   '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

_INSERT_COLUMNS = ('(`rank`,`detailurl`,`logo`,`name`,`type`,`appid`,'
                   '`appstoretime`,`compatible`,`ctime`)')


def _sql_quote(value):
    """Return *value* as an escaped, double-quoted MySQL string literal.

    The values come from scraped HTML, so quotes and backslashes inside them
    must be escaped before they are pasted into a statement.
    """
    return '"' + MySQLdb.escape_string(value) + '"'


class Spider_model:
    """Crawl a leaderboard page and store one database row per listed app.

    NOTE(review): ``Match``, ``Match_two`` and ``contentdb`` are called below
    but are not (fully) present in this excerpt. Their spelling is taken from
    the call sites and should be confirmed against the complete file.
    """

    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    def startwork(self, url, tabname):
        """Crawl the leaderboard at *url* and insert each entry into *tabname*.

        For every item returned by ``self.Match`` the detail page at
        ``item[1]`` is fetched and parsed with ``self.Match_two``. The first
        two detail fields are appended to the item as ``item[6]``, and one
        INSERT statement is handed to ``self.contentdb``.

        *tabname* is concatenated into the statement unescaped, so it must
        come from trusted code, never from user input.
        """
        nowtime = int(time.time())
        content = self.getcon(url)
        oneitems = self.Match(content)  # first-level (list page) fields
        time.sleep(1)  # throttle between requests

        for item in oneitems:
            content_two = self.getcon(item[1])
            twoitems = self.Match_two(content_two)
            item.append([twoitems[0], twoitems[1]])
            released, compatible = item[6]

            # '0' is the marker the detail parser uses for "no release date".
            if released == '0':
                fabutime = '0'
            else:
                fabutime = int(time.mktime(
                    time.strptime(released.strip(), _RELEASE_DATE_FORMAT)))

            values = [_sql_quote(field) for field in item[:6]]
            values += [fabutime, _sql_quote(compatible), nowtime]
            sql = ("insert into " + tabname + _INSERT_COLUMNS +
                   " values(%s,%s,%s,%s,%s,%s,%s,%s,%s)" % tuple(values))
            self.contentdb(sql)
            time.sleep(1)  # throttle between requests

    def getcon(self, url):
        """Fetch *url* with browser-like headers and return the raw body.

        On an HTTP error status the error body is printed and an empty string
        is returned. Previously the code fell through to ``read()`` on a
        response that was never assigned, or on a stale one from an earlier
        call.
        """
        req = urllib2.Request(url, headers=_REQUEST_HEADERS)
        try:
            response = urllib2.urlopen(req)
        except urllib2.HTTPError as e:
            # Without this handler a 403 aborts the crawl even though the
            # request imitates a browser.
            print(e.fp.read())
            return ''
        try:
            # The page is returned undecoded. Use
            # mypage.decode('utf-8', 'ignore') if unicode is needed: decode
            # turns an encoded byte string into unicode, encode does the
            # reverse.
            return response.read()
        finally:
            response.close()

    # NOTE(review): the Match method is cut off in this copy of the file; the
    # second pattern stops mid-expression. The visible fragment is kept here
    # (de-mangled) rather than guessed at:
    #
    #     def Match(self, con):
    #         # re.S lets "." match line breaks as well
    #         pattena = re.compile(
    #             r'<section class="section apps grid">(.*?)</section>',
    #             re.U | re.S)
    #         pattenb = re.compile(
    #             r'<li><strong>(.*?).</strong><a href="(.*?)".*?></a>  <-- cut

# Author's note: I am new to Python, so this code is rough; that is on me.
Python version: 2.7.5. Test environments: Linux and Windows.
Experienced readers are welcome to criticize this code. Let's show off and take off together!
My first Python crawler: scraping the App Store leaderboard (Part 2)