Python crawler crawls the names and links of Douban Top 250 movies into txt, Excel, and a database

Source: Internet
Author: User

As a precondition, the Python environment for operating Excel and the database must be fully configured; this requires installing the dependency packages that the script imports (openpyxl and MySQLdb).
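Before running the script, it is worth confirming that those third-party packages actually resolve. A minimal sketch (assuming the packages were installed with pip, for example as openpyxl and MySQL-python):

#!/usr/bin/python
# Dependency check (a sketch): confirm the third-party packages
# used by the crawler are importable before running it.
try:
    import openpyxl
    import MySQLdb
    print 'openpyxl and MySQLdb are available'
except ImportError as e:
    print 'missing dependency:', e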

The specific implementation code is as follows:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib
import urllib2
import sys
import re
import ssl
import openpyxl
import MySQLdb
import time

#from bs4 import BeautifulSoup

# Change the system default encoding to utf-8 (a Python 2 idiom)
reload(sys)
sys.setdefaultencoding("utf-8")
ssl._create_default_https_context = ssl._create_unverified_context

# Global lists that store the scraped data: movie names and link addresses
nameLists = []
linkLists = []

# Scrape the Douban Top 250 movies and save them as a text file, an Excel workbook, and a database table
class TopMove:
    # Initialization
    def __init__(self):
        self.url = 'https://movie.douban.com/top250?start='

    def getHtml(self, page):
        try:
            url = self.url + str(page * 25)
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            html = response.read().decode('utf-8')
            return html
        # Report why the request failed
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print u'Failed to reach the Douban movie page, reason: ', e.reason
                return None

    def getList(self):
        for page in range(10):
            print "Getting movie list " + str(page + 1)
            html = self.getHtml(page)
            # Some entries carry two title tags, so match the movie name
            # inside the img tag instead (its alt attribute holds the name)
            name = re.compile(r'<img.*?alt="(.*?)".*?>', re.S)
            link = re.compile(r'<div class="hd">.*?<a.*?href="(.*?)".*?>.*?</a>', re.S)

            nameList = re.findall(name, html)
            linkList = re.findall(link, html)
            for name in nameList:
                # Skip alternate English titles, which contain "/"
                if name.find('/') == -1:
                    nameLists.append(name)
            for link in linkList:
                linkLists.append(link)
        print "Get Finished"
        return nameLists, linkLists

    # Save as a text file
    def save_text(self):
        try:
            f = open(r'D:\learn\date.txt', 'a')
            # Write each name and link, tab-separated, one movie per line
            for i in range(len(nameLists)):
                f.write(nameLists[i])
                f.write('\t')
                f.write(linkLists[i])
                f.write('\n')
            # Close the file
            f.close()
        except Exception as e:
            print e

        print u"File store finished"

    # Save in Excel format
    def save_excel(self):
        try:
            # Create a new workbook
            wb = openpyxl.Workbook()
            # Take the active worksheet
            sheet = wb.active
            # Name the sheet
            sheet.title = 'Movie Top 250'
            for i in range(1, len(nameLists) + 1):
                one = 'A' + str(i)  # cell in column A: movie name
                two = 'B' + str(i)  # cell in column B: link
                sheet[one] = nameLists[i - 1]
                sheet[two] = linkLists[i - 1]
            # Save the workbook; the file name is Chinese, hence the ur'' literal
            wb.save(ur'D:/learn/豆瓣电影top250.xlsx')
        except Exception as e:
            print e
        print 'Excel file store finished'

    # Save to the local database
    def save_mysql(self):
        try:
            # Connect to the database
            conn = MySQLdb.connect(
                host='localhost',
                port=3306,
                user='root',
                passwd='lebb123',
                db='pytest',
                charset='utf8'
            )
            # Get an operation cursor
            cursor = conn.cursor()
            print 'Connecting to MySQL succeeded'
            # Drop the table if it already exists
            cursor.execute('DROP TABLE IF EXISTS movieTop')
            time.sleep(3)
            # Create the table
            cursor.execute(
                """CREATE TABLE IF NOT EXISTS movieTop(
                       id int(4) NOT NULL PRIMARY KEY AUTO_INCREMENT,
                       movieName varchar(200),
                       link varchar(200));"""
            )
            for i in range(len(nameLists)):
                # SQL statement for inserting one row
                sql = 'INSERT INTO movieTop(movieName, link) VALUES (%s, %s)'
                param = (nameLists[i], linkLists[i])
                # Execute the SQL
                cursor.execute(sql, param)
            # Commit the inserts
            conn.commit()
            cursor.close()
            conn.close()
        except Exception as e:
            print e
        print "Data successfully saved in MySQL"


    def start(self):
        self.getList()
        self.save_text()
        self.save_excel()
        self.save_mysql()


dyTop = TopMove()
dyTop.start()
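After a run, the database write can be verified by reading a few rows back. A minimal sketch, reusing the same connection parameters as save_mysql above:

# Sanity check (a sketch, assuming the credentials from save_mysql):
# print the first five rows written to the movieTop table.
import MySQLdb

conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                       passwd='lebb123', db='pytest', charset='utf8')
cursor = conn.cursor()
cursor.execute('SELECT movieName, link FROM movieTop LIMIT 5')
for movieName, link in cursor.fetchall():
    print movieName, link
cursor.close()
conn.close()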

