python crawler0723.py

來源:互聯網
上載者:User

#!/usr/bin/env python
#-*- coding: utf-8  -*-
import urllib 
import urllib2 
import random 
import requests
import os,sys 
import MySQLdb
from sgmllib import SGMLParser 
import re
num=0
# Root of the site being crawled; detail-page paths are resolved against it.
_BASE_URL = "http://apk.gfan.com"

# Seconds to wait for any single HTTP response before giving up on it.
_REQUEST_TIMEOUT = 30

# Directory (relative to the working directory) that receives the images.
_PICTURE_DIR = "picture"

# Links to application detail pages on a listing page.  The host prefix is an
# optional *group* (the original used a character class by mistake) and only
# the path is captured, so the result can always be appended to _BASE_URL.
_DETAIL_LINK_RE = re.compile(
    r'<a href="(?:http://apk\.gfan\.com)?(/Product/App\d{1,8}\.html)"')

# Fields that must be present on a detail page, in column order:
# name, version, developer, pubtime, filesize.
_REQUIRED_FIELD_RES = (
    re.compile(r'<div class="appdiscrib">[\s\S]*?<h4>(.+?)</h4>'),
    re.compile('版 本 號(.+?)</li>'),
    re.compile('開 發 者(.+?)</li>'),
    re.compile('發布時間(.+?)</li>'),
    re.compile('檔案大小(.+?)</li>'),
)
# Optional "supported firmware" field.
_SUPPORT_RE = re.compile('支援韌體(.+?)</li>')
# Free-text introduction block(s).
_INTRO_RE = re.compile(
    r'應用介紹</h3>[\s\S]*?<div class="intro">([\s\S]*?)</div>')
# Raw ``src=`` attribute text of the image inside the title block.
_IMAGE_RE = re.compile(
    r' <div class="appTitle clearfix">[\s\S]*?<img src=(.+?)/>')

_INSERT_SQL = ("insert into address(name,version,developer,pubtime,filesize,"
               "support,introduction) values(%s,%s,%s,%s,%s,%s,%s)")


def _fetch(url):
    """Return the response body of *url*, or None if the request failed."""
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # One unreachable page must not abort a crawl of thousands of pages.
        print("request failed for %s: %s" % (url, e))
        return None
    return response.content


def _extract_rows(page):
    """Return the list of value tuples to insert for one detail page.

    One tuple is produced per introduction block found.  An empty list is
    returned when any required field is missing, instead of raising
    IndexError as the unguarded ``[0]`` lookups used to.
    """
    required = []
    for field_re in _REQUIRED_FIELD_RES:
        found = field_re.findall(page)
        if not found:
            return []
        required.append(found[0])
    support = _SUPPORT_RE.findall(page)
    # The literal string 'NULL' (not SQL NULL) is what was always stored when
    # the field is absent; it is kept so existing rows stay comparable.
    support_value = support[0] if support else 'NULL'
    return [tuple(required) + (support_value, intro.replace('<br />', ' '))
            for intro in _INTRO_RE.findall(page)]


def _save_image(url):
    """Download *url* into the picture directory under the next global number."""
    global num
    body = _fetch(url)
    if body is None:
        return
    if not os.path.isdir(_PICTURE_DIR):
        os.makedirs(_PICTURE_DIR)
    # Binary mode: image bytes must not go through newline translation, and
    # the ``with`` block closes every file instead of only the last one.
    with open(os.path.join(_PICTURE_DIR, str(num)), "wb") as out:
        out.write(body)
    num = num + 1
    print(num)


def main(first_page=1, last_page=2573):
    """Crawl apk.gfan.com listing pages and store application details.

    For every listing page from *first_page* to *last_page* (inclusive) the
    linked detail pages are fetched, their fields are inserted into the
    ``address`` table, and the title image is saved under ``picture/``.

    The surrounding file targets Python 2 (``urllib2``, ``sgmllib``); the
    patterns above are byte strings there and are matched against the raw
    response bytes.

    Exits the process with status 1 if the database connection fails.
    """
    try:
        # NOTE(review): credentials are hard-coded; consider loading them
        # from configuration or the environment.
        conn = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                               db='addressbookdb', charset="utf8")
        conn.query("set names utf8")
    except Exception as e:
        print(e)
        sys.exit(1)
    cursor = conn.cursor()
    try:
        for k in range(first_page, last_page + 1):
            listing = _fetch(_BASE_URL + "/apps_7_1_" + str(k) + ".html")
            if listing is None:
                continue
            # set() drops links that appear more than once on a listing page.
            for path in set(_DETAIL_LINK_RE.findall(listing)):
                detail_url = _BASE_URL + path
                print(detail_url)
                page = _fetch(detail_url)
                if page is None:
                    continue
                rows = _extract_rows(page)
                for row in rows:
                    cursor.execute(_INSERT_SQL, row)
                if rows:
                    conn.commit()
                for raw_src in _IMAGE_RE.findall(page):
                    print(raw_src)
                    # The capture still carries the attribute's quotes and
                    # surrounding whitespace; strip them rather than relying
                    # on fixed slice offsets.
                    _save_image(raw_src.strip().strip('"\''))
    finally:
        # Release database resources even if the crawl aborts part-way.
        cursor.close()
        conn.close()
    
# Start the crawl only when the file is run as a script, not when imported.
if __name__ == "__main__":
    main()

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.