Python爬蟲爬取豆瓣電影名稱和連結，分別存入txt，excel和資料庫

最後更新：2016-11-26 來源：互聯網

上載者：User

創建阿里雲帳戶，並獲得超過 40 款產品的免費試用版；而企業帳戶則可以享有總值 $1200 的免費試用版。立即註冊！

標籤：依賴 app html 中文 value 需要 div asa sheet

前提條件：Python 操作 Excel 和資料庫的環境必須已配置完整，也就是需要先在 Python 中安裝並匯入相關的依賴套件（例如 openpyxl 與 MySQLdb）。

實現的具體代碼如下：

#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib
import urllib2
import sys
import re
import ssl
import openpyxl
import MySQLdb
import time

#from bs4 import BeautifulSoup

# Change the interpreter-wide default string encoding to UTF-8.
# NOTE(review): reload(sys) / sys.setdefaultencoding are Python 2 only
# constructs — confirm the target interpreter before porting.
reload(sys)
sys.setdefaultencoding("utf-8")
# Swap the ssl module's default HTTPS context factory for the unverified one.
# NOTE(review): this presumably disables certificate verification for every
# HTTPS request made through the default context — confirm this is intended.
ssl._create_default_https_context = ssl._create_unverified_context

# Module-level lists that hold the scraped data: movie names and link addresses.
nameLists = []
linkLists = []

class TopMove:
    """Scrape the Douban Top 250 movie list and store names and links.

    The scraped data is kept in the module-level ``nameLists`` and
    ``linkLists`` lists and can be written to a text file, an Excel
    workbook and a local MySQL table.

    The syntax used here is valid on both Python 2 and Python 3, but the
    class still relies on the file's Python 2 imports (``urllib2``,
    ``MySQLdb``).
    """

    # Entries per list page; the ``start`` query value is page * PAGE_SIZE.
    PAGE_SIZE = 25
    # Number of list pages fetched by GetList (10 pages * 25 = 250 entries).
    PAGE_COUNT = 10

    # Names are taken from the <img alt="..."> attribute because some
    # entries carry two title tags, which makes the title markup ambiguous.
    _NAME_RE = re.compile('<img alt="(.*?)".*?>', re.S)
    _LINK_RE = re.compile(
        '<div class="hd">.*?<a.*?href="(.*?)".*?>.*?</a>', re.S)

    def __init__(self):
        """Set the base URL; the page offset is appended per request."""
        self.URL = 'https://movie.douban.com/top250?start='

    def GetHTML(self, page):
        """Download one list page.

        Args:
            page: Zero-based page index; the requested offset is
                ``page * PAGE_SIZE``.

        Returns:
            The page body decoded as UTF-8, or ``None`` when the request
            fails with ``urllib2.URLError``.
        """
        url = self.URL + str(page * self.PAGE_SIZE)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if reading or decoding fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print(u'%s %s' % (u'連結豆瓣電影失敗，錯誤原因：', e.reason))
            return None

    @staticmethod
    def _parse_page(html):
        """Extract (names, links) from the HTML of one list page.

        Names containing "/" (the alternative/English titles) are dropped.
        """
        names = [n for n in TopMove._NAME_RE.findall(html) if '/' not in n]
        links = TopMove._LINK_RE.findall(html)
        return names, links

    def GetList(self):
        """Fetch every list page and fill ``nameLists`` / ``linkLists``.

        Returns:
            The tuple ``(nameLists, linkLists)`` (the module-level lists).
        """
        # Clear in place so that running the scraper twice does not
        # duplicate entries, while other holders of the lists stay valid.
        del nameLists[:]
        del linkLists[:]
        for page in range(self.PAGE_COUNT):
            print("正在擷取電影列表" + str(page + 1))
            html = self.GetHTML(page)
            if html is None:
                # The download failed and was already reported; skip the
                # page instead of crashing inside the regex search.
                continue
            names, links = self._parse_page(html)
            nameLists.extend(names)
            linkLists.extend(links)
        print("擷取完畢")
        return nameLists, linkLists

    def save_Text(self, path=r'D:\learn\date.txt'):
        """Append one "name<TAB><TAB><TAB>link" line per movie to a file.

        Args:
            path: Target text file; defaults to the original location.
        """
        # Local import: io.open gives an explicit UTF-8 text file on both
        # Python 2 and 3 without relying on the process default encoding.
        import io
        try:
            with io.open(path, 'a', encoding='utf-8') as f:
                # zip() stops at the shorter list, so a partial scrape is
                # written as far as it goes instead of raising IndexError.
                for name, link in zip(nameLists, linkLists):
                    f.write(u'%s\t\t\t%s\n' % (name, link))
        except Exception as e:
            # Best-effort save: report the problem and carry on.
            print(e)

        print(u"檔案儲存體結束")

    def save_Excel(self, path=u'D:/learn/豆瓣電影TOP250.xlsx'):
        """Write names to column A and links to column B of a new workbook.

        Args:
            path: Target .xlsx file; defaults to the original location.
        """
        try:
            wb = openpyxl.Workbook()
            # ``active`` replaces the deprecated get_active_sheet().
            sheet = wb.active
            sheet.title = 'Move Top 250'
            for row, (name, link) in enumerate(zip(nameLists, linkLists), 1):
                sheet['a%d' % row] = name
                sheet['b%d' % row] = link
            wb.save(path)
        except Exception as e:
            # Best-effort save: report the problem and carry on.
            print(e)
        print('Excel 檔案儲存體結束')

    def save_Mysql(self):
        """Recreate the MovieTop table in the local database and fill it."""
        conn = None
        try:
            # NOTE(review): credentials are hard-coded; consider moving
            # them to configuration or environment variables.
            conn = MySQLdb.connect(
                host='localhost',
                port=3306,
                user='root',
                passwd='lebb123',
                db='pytest',
                charset='utf8'
            )
            cursor = conn.cursor()
            print('Connecting to MYSQL Success')
            # Start from a clean table on every run.
            cursor.execute('Drop table if EXISTS MovieTop')
            time.sleep(3)
            cursor.execute(
                """create table if not EXISTS MovieTop(
                id int(4) not null primary key auto_increment,
                movieName varchar(200),
                link varchar(200));"""
            )
            # Parameterized insert; values are never spliced into the SQL.
            sql = 'insert into MovieTop(movieName,link) VALUES (%s,%s)'
            cursor.executemany(sql, list(zip(nameLists, linkLists)))
            conn.commit()
            cursor.close()
        except Exception as e:
            # Best-effort save: report the problem and carry on.
            print(e)
        else:
            # Only claim success when nothing above failed.
            print("Data Success Save in MYSQL")
        finally:
            if conn is not None:
                conn.close()

    def Start(self):
        """Run the whole pipeline: scrape, then save to all three targets."""
        self.GetList()
        self.save_Text()
        self.save_Excel()
        self.save_Mysql()

# Run the scrape-and-save pipeline only when this file is executed as a
# script, so importing it does not trigger network, file or database I/O.
if __name__ == '__main__':
    dytop = TopMove()
    dytop.Start()

Python爬蟲爬取豆瓣電影名稱和連結，分別存入txt，excel和資料庫

本文章原先以中文撰寫並發佈於 aliyun.com，亦設英文版本，僅作資訊用途。本網站不對文章的準確性，完整性或可靠性或其任何翻譯作出任何明示或暗示的陳述或保證。如對該文章有任何疑慮或投訴，請傳送電郵至 info-contact@alibabacloud.com 並提供相關疑慮或投訴的詳細說明。職員會於 5 個工作天內與您聯絡，一經驗證之後，即會刪除該侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

Python爬蟲爬取豆瓣電影名稱和連結，分別存入txt，excel和資料庫

聯繫我們

熱門內容

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support

Python爬蟲爬取豆瓣電影名稱和連結，分別存入txt，excel和資料庫

聯繫我們

熱門內容

熱門主題

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support