Save data with PyMongo
Crawl the Douban Top 250 movie data from movie.douban.com/top250 and save it in MongoDB.
items.py
import scrapy


class DoubanspiderItem(scrapy.Item):
    # Movie title
    title = scrapy.Field()
    # Movie score
    score = scrapy.Field()
    # Movie information
    content = scrapy.Field()
    # Short quote / introduction
    info = scrapy.Field()
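A scrapy.Item behaves like a dict, which is what the pipeline later relies on when it calls dict(item). A quick standalone illustration, assuming the project package is named doubanSpider as in the settings below:

import sys  # not required; only here to make the sketch obviously standalone
from doubanSpider.items import DoubanspiderItem

item = DoubanspiderItem()
item['title'] = '肖申克的救赎'   # sample value, for illustration only
item['score'] = '9.7'
# Assigning a key that was not declared as a Field would raise a KeyError
print(dict(item))                # -> a plain dict containing the two fields set above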
spiders/douban.py
import scrapy
from doubanSpider.items import DoubanspiderItem


class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start = 0
    url = 'https://movie.douban.com/top250?start='
    end = '&filter='
    start_urls = [url + str(start) + end]

    def parse(self, response):
        item = DoubanspiderItem()
        movies = response.xpath("//div[@class='info']")
        for each in movies:
            title = each.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
            info = each.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            item['title'] = title[0]
            # Join all the elements of the content list into one string, separated by ';'
            item['content'] = ';'.join(content)
            item['score'] = score[0]
            item['info'] = info[0]
            # Submit the item
            yield item

        # Each page holds 25 movies, so step through start=0, 25, ..., 225
        if self.start <= 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start) + self.end, callback=self.parse)
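The spider is normally started with "scrapy crawl douban" from the project directory. As a rough alternative sketch (not part of the original tutorial), it can also be launched from a plain Python script using Scrapy's CrawlerProcess, assuming the script is run from the project root so that get_project_settings() can locate scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings (which enable the MongoDB pipeline) and run the spider
process = CrawlerProcess(get_project_settings())
process.crawl('douban')   # the spider name defined above
process.start()           # blocks until the crawl finishes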
pipelines.py
import pymongo
from scrapy.conf import settings


class DoubanspiderPipeline(object):
    def __init__(self):
        # Read the host name, port number and database name from settings
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']
        # pymongo.MongoClient(host, port) creates a MongoDB connection
        client = pymongo.MongoClient(host=host, port=port)
        # Point to the specified database
        mdb = client[dbname]
        # Get the collection that stores the data in that database
        self.post = mdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        data = dict(item)
        # Insert the data into the specified collection
        self.post.insert(data)
        return item
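Note that scrapy.conf and Collection.insert() belong to older Scrapy and PyMongo releases; in current versions the settings are usually read through from_crawler and records are written with insert_one(). A minimal sketch of the same pipeline under those assumptions:

import pymongo


class DoubanspiderPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        # Read the MongoDB settings from the crawler instead of scrapy.conf
        settings = crawler.settings
        return cls(
            host=settings.get('MONGODB_HOST'),
            port=settings.get('MONGODB_PORT'),
            dbname=settings.get('MONGODB_DBNAME'),
            docname=settings.get('MONGODB_DOCNAME'),
        )

    def __init__(self, host, port, dbname, docname):
        client = pymongo.MongoClient(host=host, port=port)
        self.post = client[dbname][docname]

    def process_item(self, item, spider):
        # insert_one replaces the deprecated insert()
        self.post.insert_one(dict(item))
        return item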
settings.py
BOT_NAME = 'doubanSpider'

SPIDER_MODULES = ['doubanSpider.spiders']
NEWSPIDER_MODULE = 'doubanSpider.spiders'

ITEM_PIPELINES = {
    'doubanSpider.pipelines.DoubanspiderPipeline': 300
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

# MongoDB host name (the local machine here)
MONGODB_HOST = '127.0.0.1'
# MongoDB port number, default is 27017
MONGODB_PORT = 27017
# Database name
MONGODB_DBNAME = 'DouBan'
# Name of the collection that stores the data
MONGODB_DOCNAME = 'DouBanMovies'
Run
# First start the database service, then run the Scrapy crawl
sudo mongo          # start the mongo shell

# Commands used in the mongo shell:
# Show the current database
> db
# List all databases
> show dbs
# Switch to the DouBan database
> use DouBan
# List all collections
> show collections
# View the data in the collection
> db.DouBanMovies.find()
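The stored records can also be checked directly from Python with PyMongo instead of the mongo shell. A small sketch, assuming the host, database and collection names from settings.py above:

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
collection = client['DouBan']['DouBanMovies']

# How many movies were stored, plus a small sample
print(collection.count_documents({}))
for movie in collection.find().limit(3):
    print(movie['title'], movie['score'])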
With that, the Scrapy crawler framework stores the scraped data in MongoDB.