初識python之 APP store熱門排行榜 蜘蛛抓取(二)

來源:互聯網
上載者:User

標籤:style   blog   http   color   os   io   strong   for   ar   

# -*- coding: utf-8 -*-
"""Scrape the Apple Japan iTunes top-chart pages and store each entry in MySQL.

For every chart URL in ``addArr`` the spider downloads the chart page,
extracts one record per listed app, downloads each app's detail page for
its release date and compatibility text, and inserts one row per app into
the table paired with that chart.

Written for Python 2.7; the import fallbacks below also let the module be
imported on Python 3.
"""
import re
import time
import datetime
import urllib

try:
    import urllib2
except ImportError:
    # Python 3 merged urllib2 into urllib.request, which exposes the same
    # Request / urlopen / HTTPError names used in this module.
    import urllib.request as urllib2

try:
    import MySQLdb
except ImportError:
    # Keep the fetch/parse helpers importable without the driver installed;
    # contentDb() raises a clear error if it is actually needed.
    MySQLdb = None


# ----------- APP store top charts -----------
class Spider_Model(object):
    """Spider for the App Store top-chart pages.

    The value ``'0'`` is used throughout as the "not found" marker for a
    field that could not be extracted from a page.
    """

    # re.S lets "." match newlines, so one pattern can span several lines.
    _CHART_SECTION_RE = re.compile(
        r'<section class="section apps grid">(.*?)</section>', re.U | re.S)
    _CHART_ITEM_RE = re.compile(
        r'<li><strong>(.*?).</strong><a href="(.*?)".*?><img src="(.*?)".*?></a>'
        r'<h3><a.*?>(.*?)</a></h3><h4><a.*?>(.*?)</a></h4><a.*?>.*?</a></li>',
        re.U | re.S)
    _RELEASE_DATE_RE = re.compile(
        r'<li.*?class="release-date"><span.*?>.*?</span>(.*?)</li>',
        re.U | re.S)
    _REQUIREMENTS_RE = re.compile(
        r'<span.*?class="app-requirements">.*?</span>(.*?)</p>', re.U | re.S)
    _APP_ID_RE = re.compile(r'/id(\d+)')

    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    def startWork(self, url, tabName, delay=1):
        """Crawl one chart page and insert every listed app into ``tabName``.

        Args:
            url: Address of the chart page.
            tabName: Name of the destination table. It is spliced into the
                SQL text (identifiers cannot be bound as parameters), so it
                must come from trusted code, never from scraped data.
            delay: Seconds to sleep after each request. Defaults to 1, the
                pause the script has always used between requests.

        Raises:
            ValueError: If a detail page carries a release date that is not
                in the ``%Y年%m月%d日`` format.
        """
        nowtime = int(time.time())
        content = self.GetCon(url)
        oneItems = self.Match(content)  # first-level (chart page) fields
        time.sleep(delay)
        for item in oneItems:
            content_two = self.GetCon(item[1])
            release_text, compatible = self.Match_two(content_two)
            fabutime = self._release_timestamp(release_text)
            sql, params = self._insert_query(
                tabName, item, fabutime, compatible, nowtime)
            self.contentDb(sql, params)
            time.sleep(delay)

    def _release_timestamp(self, release_text):
        """Convert a scraped release date to a Unix timestamp (0 if missing)."""
        if release_text == '0':
            return 0
        parsed = time.strptime(release_text.strip(), '%Y年%m月%d日')
        return int(time.mktime(parsed))

    def _insert_query(self, tabName, item, fabutime, compatible, nowtime):
        """Return ``(sql, params)`` for inserting one chart entry.

        The values are passed as bound parameters instead of being pasted
        into the statement, so quotes in scraped text cannot break or alter
        the query.
        """
        sql = ("INSERT INTO " + tabName +
               "(`rank`,`detailurl`,`logo`,`name`,`type`,`appid`,"
               "`appstoretime`,`compatible`,`ctime`) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        params = (item[0], item[1], item[2], item[3], item[4], item[5],
                  fabutime, compatible, nowtime)
        return sql, params

    def GetCon(self, url):
        """Download ``url`` and return the response body.

        Returns:
            The page body, or ``''`` when the server answers with an HTTP
            error (the error body is printed). An empty body makes the
            ``Match*`` methods report "nothing found" instead of parsing a
            page left over from an earlier request.
        """
        # Author's note: the site rejects crawlers, so browser-like headers
        # are sent to imitate a normal browser visit.
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        req = urllib2.Request(url, headers=headers)
        try:
            myResponse = urllib2.urlopen(req)
        except urllib2.HTTPError as e:
            # Author's note: a 403 can still come back even with the
            # browser-like headers, so this handler is required.
            print(e.fp.read())
            return ''
        try:
            myPage = myResponse.read()
        finally:
            myResponse.close()
        if not isinstance(myPage, str):
            # Python 3 returns bytes; the patterns in this class are text.
            # On Python 2 the body is already a str and is left untouched.
            myPage = myPage.decode('utf-8', 'ignore')
        return myPage

    def Match(self, con):
        """Extract the chart entries from a chart page.

        Returns:
            A list with one ``[rank, detail_url, logo_url, name, type,
            app_id]`` list per app, newlines removed from every field and
            ``name`` cut at its first ``-``. Empty when the page has no
            chart section.
        """
        sections = self._CHART_SECTION_RE.findall(con)
        if not sections:
            return []
        items = []
        for rank, href, logo, title, genre in self._CHART_ITEM_RE.findall(sections[0]):
            href = href.replace("\n", "")
            items.append([
                rank.replace("\n", ""),
                href,
                logo.replace("\n", ""),
                title.replace("\n", "").split('-')[0],
                genre.replace("\n", ""),
                self._app_id(href),
            ])
        return items

    def _app_id(self, url):
        """Return the numeric id from a detail URL, or ``'0'`` if absent.

        Anchoring on ``/id<digits>`` avoids being fooled by an "id" inside
        the app's URL slug (for example ``.../video-app/id123``).
        """
        found = self._APP_ID_RE.search(url) or re.search(r'id(\d+)', url)
        if found is None:
            return '0'
        return found.group(1)

    def Match_two(self, con):
        """Extract ``[release_date, requirements]`` from an app detail page.

        Either element is ``'0'`` when the page does not contain it.
        """
        matchTwoA = self.is_empty(self._RELEASE_DATE_RE.findall(con))
        matchTwoB = self.is_empty(self._REQUIREMENTS_RE.findall(con))
        return [matchTwoA, matchTwoB]

    def is_empty(self, param):
        """Return the first element of ``param``, or ``'0'`` if it is empty."""
        if param:
            return param[0]
        return '0'

    def contentDb(self, sql, params=None):
        """Execute one statement against MySQL and commit it.

        Args:
            sql: Statement text, with ``%s`` placeholders when ``params``
                is given.
            params: Optional sequence of values bound to the placeholders.
                When omitted, ``sql`` is executed exactly as written.

        Raises:
            ImportError: If the MySQLdb driver is not installed.

        Database errors are printed and swallowed so that one failed insert
        does not abort the crawl.
        """
        if MySQLdb is None:
            raise ImportError("MySQLdb is required to store results")
        conn = None
        try:
            # The connection values are placeholders to be replaced with
            # the real host, user, password and database name.
            conn = MySQLdb.connect(host="主機", user="使用者", passwd="密碼",
                                   db="表名", charset='utf8')
            cur = conn.cursor()
            cur.execute(sql, params)
            conn.commit()
        except MySQLdb.Error as e:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        finally:
            # A connection is opened per call, so release it every time.
            if conn is not None:
                conn.close()


# Each entry pairs a chart URL with the table that receives its rows.
addArr = [
    ["http://www.apple.com/jp/itunes/charts/free-apps/", 'cg_jp_free'],
    ["http://www.apple.com/jp/itunes/charts/paid-apps/", 'cg_jp_paid'],
]
myModel = Spider_Model()

if __name__ == "__main__":
    for val in addArr:
        myModel.startWork(val[0], val[1])

 

初識 Python,代碼寫得有點爛,自知罪孽深重......

python版本:2.7.5  測試環境:Linux、Windows

望高手拍磚 帶我一起裝逼!一起飛!

初識python之 APP store熱門排行榜 蜘蛛抓取(二)

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.