As a precondition, the environment for working with Excel and a database from Python must already be set up; this requires installing the imported dependency packages (openpyxl and MySQLdb).
The concrete implementation is as follows:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib
import urllib2
import sys
import re
import ssl
import openpyxl
import MySQLdb
import time
#from bs4 import BeautifulSoup

# Switch the interpreter's default encoding to utf-8 so Chinese movie
# titles survive implicit str/unicode conversions (Python 2 only).
reload(sys)
sys.setdefaultencoding("utf-8")
# Douban serves HTTPS with a certificate this environment cannot verify,
# so fall back to an unverified SSL context for urllib2 requests.
ssl._create_default_https_context = ssl._create_unverified_context

# Module-level lists that accumulate the scraped data:
# nameLists holds the movie names, linkLists the matching detail-page URLs.
nameLists = []
linkLists = []
#搜索豆瓣top100电影, Save as file, Excel, database
Class Topmove:
#初始化
def __init__ (self):
#self. page = page
#self. namelist = []
#self. linklist = []
Self. URL = ' https://movie.douban.com/top250?start= '
def gethtml (self,page):
#for page in Rang (10):
Try
url = self. URL + str (PAGE * 25)
Request = Urllib2. Request (URL)
Response = Urllib2.urlopen (Request)
html = Response.read (). Decode (' Utf-8 ')
#print html
return HTML
#链接报错的原因
Except Urllib2. Urlerror, E:
If Hasattr (E, "Reason"):
Print u ' link watercress movie failed, error reason: ', E.reason
Return None
def GetList (self):
#nameLists = []
#linkLists = []
For page in range (10):
print "Getting movie list" + str (page+1)
#print Str (page)
html = self. Gethtml (page)
#因为title的电影名有些存在两个title标签, so just go to the regular match in img
Name = Re.compile (' ', Re. S
link = re.compile (' <div class= "HD" >.*?<a.*?href= "(. *?)". *?>.*?</a> ', Re. S
NameList = Re.findall (name,html)
linklist = Re.findall (link,html)
For name in NameList:
#剔除英文名包含 "/"
If Name.find ('/') = =-1:
Namelists.append (name)
For link in linklist:
Linklists.append (link)
#nameLists. Append (Namelist[0].strip ())
#linkLists. Append (Linklist[0].strip ())
#print NameList
#print linklist
Print "Get Finished"
Return namelists,linklists
#保存为文本文件
def save_text (self):
#List = []
#List = self. GetList (page)
Try
f = open (' D:\learn\date.txt ', ' a ')
For I in range (250):
#循环写入名称和地址
F.write (Namelists[i])
F.write (' \ t ')
F.write (Linklists[i])
F.write (' \ n ')
#关闭文件
F.close ()
Except Exception as E:
Print E
Print U "File Store End"
#保存为excel格式
def save_excel (self):
#List = []
#List = self. GetList ()
Try
#新建workbook
WB = OPENPYXL. Workbook ()
#去工作表的sheet页
Sheet = Wb.get_active_sheet ()
#sheet页命名
Sheet.title = ' Move Top 250 '
For I in Range (1,251):
one = ' a ' + str (i) #a1, column A
II = ' B ' + str (i) #b2, column B
Sheet[one] = namelists[i-1]
Sheet[two] = linklists[i-1]
#print Namelists[i-1]
#print Linklists[i-1]
#保存文件格式, file name is Chinese
Wb.save (Ur ' d:/learn/watercress film top250.xlsx ')
Except Exception as E:
Print E
print ' Excel File Store end '
#保存到数据库中本地
def save_mysql (self):
#List = []
#List = self. GetList ()
Try
#链接数据库
conn = MySQLdb.connect (
host= ' localhost ',
port=3306,
User= ' Root ',
Passwd= ' lebb123 ',
db= ' Pytest ',
charset= ' UTF8 '
)
#获取操作游标
cursor = Conn.cursor ()
print ' Connecting to MYSQL Success '
#如果表存在就删除
Cursor.execute (' Drop table if EXISTS movietop ')
Time.sleep (3)
#创建一个数据库表
Cursor.execute (
"" "CREATE table if not EXISTS movietop (
ID int (4) NOT null primary key auto_increment,
Moviename varchar (200),
Link varchar (200)); ""
)
For I in range (250):
#插入数据库数据sql
sql = ' INSERT INTO Movietop (moviename,link) VALUES (%s,%s) '
param = (Namelists[i],linklists[i])
#print Namelists[i],linklists[i]
#执行SQL
Cursor.execute (Sql,param)
#提交到数据库执行
Conn.commit ()
Cursor.close ()
Conn.close ()
Except Exception as E:
Print E
Print "Data Success Save in MYSQL"
def Start (self):
Self. GetList ()
Self.save_text ()
Self.save_excel ()
#wb = Self.save_excel ()
Self.save_mysql ()
# Entry point: build the scraper and run the whole pipeline.
dytop = Topmove()
dytop.Start()
This Python crawler scrapes the names and links of the Douban (Watercress) Top 250 movies and saves them to a text file, an Excel workbook, and a MySQL database.