標籤:idt and info cti 名稱 identify each .post 合并
用Pymongo儲存資料
爬取豆瓣電影 Top 250(movie.douban.com/top250)的電影資料,並儲存在 MongoDB 中。
items.py
import scrapy


class DoubanspiderItem(scrapy.Item):
    """Fields stored for one movie scraped from the Douban Top 250 list."""

    # Movie title
    title = scrapy.Field()
    # Movie rating
    score = scrapy.Field()
    # Movie details
    content = scrapy.Field()
    # Short introduction
    info = scrapy.Field()
spiders/douban.py
import scrapy
from doubanSpider.items import DoubanspiderItem


class DoubanSpider(scrapy.Spider):
    """Crawl the Douban Top 250 list page by page, yielding one item per movie."""

    name = "douban"
    allowed_domains = ["movie.douban.com"]
    # Offset of the first movie on the page being requested; advanced by 25
    # after each page has been parsed.
    start = 0
    url = 'https://movie.douban.com/top250?start='
    end = '&filter='
    start_urls = [url + str(start) + end]

    def parse(self, response):
        """Extract every movie on the page, then schedule the next page.

        Yields:
            A ``DoubanspiderItem`` per movie block found in ``response``,
            followed by a ``scrapy.Request`` for the next page while one
            remains.
        """
        movies = response.xpath("//div[@class='info']")
        for each in movies:
            title = each.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
            info = each.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            # Build a fresh item per movie so that items already handed to
            # the pipeline are not overwritten by later iterations.
            item = DoubanspiderItem()
            # extract() returns an empty list when nothing matches; fall back
            # to an empty string instead of raising IndexError and losing the
            # rest of the page.
            item['title'] = title[0] if title else ''
            # Merge all elements of the content list into one string,
            # separated by ';'.
            item['content'] = ';'.join(content)
            item['score'] = score[0] if score else ''
            item['info'] = info[0] if info else ''
            # Hand the item over to the pipeline.
            yield item

        # 250 entries at 25 per page: the last valid offset is 225, so only
        # advance while the current offset is below it.
        if self.start < 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start) + self.end, callback=self.parse)
pipelines.py
import pymongo
from scrapy.utils.project import get_project_settings


class DoubanspiderPipeline(object):
    """Item pipeline that stores every scraped item in a MongoDB collection."""

    def __init__(self, settings=None):
        """Open the MongoDB connection described by the project settings.

        Args:
            settings: Mapping that provides MONGODB_HOST, MONGODB_PORT,
                MONGODB_DBNAME and MONGODB_DOCNAME. When omitted, the
                project settings are loaded, so the pipeline can still be
                constructed without arguments.
        """
        if settings is None:
            settings = get_project_settings()

        # Read the host name, port number and database name from the settings.
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']

        # Keep the client so it can be closed when the spider finishes.
        self.client = pymongo.MongoClient(host=host, port=port)
        # Select the database, then the collection that receives the items.
        mdb = self.client[dbname]
        self.post = mdb[settings['MONGODB_DOCNAME']]

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the running crawler's settings."""
        return cls(crawler.settings)

    def process_item(self, item, spider):
        """Insert ``item`` into the collection and pass it on unchanged."""
        data = dict(item)
        # insert_one replaces Collection.insert, which PyMongo has removed.
        self.post.insert_one(data)
        return item

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider is closed."""
        self.client.close()
settings.py
# Scrapy project settings for doubanSpider.

BOT_NAME = 'doubanSpider'

SPIDER_MODULES = ['doubanSpider.spiders']
NEWSPIDER_MODULE = 'doubanSpider.spiders'

ITEM_PIPELINES = {
    'doubanSpider.pipelines.DoubanspiderPipeline': 300,
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

# MongoDB host: the loopback address 127.0.0.1
MONGODB_HOST = '127.0.0.1'
# Port number; the default is 27017
MONGODB_PORT = 27017
# Name of the database
MONGODB_DBNAME = 'DouBan'
# Name of the collection that stores the data of this crawl
MONGODB_DOCNAME = 'DouBanMovies'
運行
啟動MongoDB資料庫需要兩個命令:
mongod:是MongoDB資料庫進程本身
mongo:是命令列shell用戶端

sudo mongod # 首先啟動資料庫服務,再執行Scrapy
sudo mongo # 啟動資料庫shell

在mongo shell下使用命令:
# 查看當前資料庫
> db
# 列出所有的資料庫
> show dbs
# 連接DouBan資料庫
> use DouBan
# 列出所有表
> show collections
# 查看表裡的資料
> db.DouBanMovies.find()
爬蟲架構Scrapy之將資料存在Mongodb