Create a Scrapy crawler project named douban:
scrapy startproject douban
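The startproject command generates a project skeleton. As a rough sketch (the exact files can vary slightly by Scrapy version), the layout looks like the following; the spider file doubanmovies.py written later in this post goes into the spiders/ directory:

douban/
    scrapy.cfg                # deploy configuration
    douban/
        __init__.py
        items.py              # item field definitions (edited below)
        middlewares.py
        pipelines.py          # item pipelines (MongoDB storage below)
        settings.py           # project settings (edited below)
        spiders/
            __init__.py
            doubanmovies.py   # the spider we will write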
Edit the items.py file to define the fields you want to save:

# -*- coding: utf-8 -*-
import scrapy


class DoubanItem(scrapy.Item):
    # Movie title
    title = scrapy.Field()
    # Movie details (director, cast, year, etc.)
    content = scrapy.Field()
    # Rating
    rating_num = scrapy.Field()
    # One-line synopsis
    quote = scrapy.Field()
Create the spider file doubanmovies.py:

# -*- coding: utf-8 -*-
import scrapy

from douban.items import DoubanItem


class DoubanMoviesSpider(scrapy.Spider):
    name = 'doubanmovies'
    allowed_domains = ['movie.douban.com']
    offset = 0
    url = 'https://movie.douban.com/top250?start='
    start_urls = [url + str(offset)]

    def parse(self, response):
        # Each movie entry sits in a <div class="info"> block
        info = response.xpath("//div[@class='info']")
        for each in info:
            item = DoubanItem()
            item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract()
            item['content'] = each.xpath(".//div[@class='bd']/p[1]/text()").extract()
            item['rating_num'] = each.xpath(".//span[@class='rating_num']/text()").extract()
            item['quote'] = each.xpath(".//span[@class='inq']/text()").extract()
            yield item

        # The Top 250 list shows 25 movies per page; follow the next page
        # until all offsets (0, 25, ..., 225) have been requested.
        self.offset += 25
        if self.offset < 250:
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
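Before running the whole spider, the XPath expressions can be checked interactively with scrapy shell. A minimal sketch follows; note that Douban may reject the default Scrapy user agent, so a browser-like one is passed with -s (the exact string is only an example):

scrapy shell -s USER_AGENT="Mozilla/5.0 (Windows NT 6.1; rv:60.0)" "https://movie.douban.com/top250"

# inside the shell:
>>> info = response.xpath("//div[@class='info']")
>>> len(info)            # should be 25 entries on one page
>>> info[0].xpath(".//span[@class='title'][1]/text()").extract()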
Set up the pipeline file to save the crawled data to MongoDB. This is the key part:

# -*- coding: utf-8 -*-
import pymongo
# Older Scrapy versions used "from scrapy.conf import settings";
# get_project_settings() is the way to read settings.py in current versions.
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class DoubanPipeline(object):
    def __init__(self):
        # Read the MongoDB host and port from settings.py
        self.host = settings['MONGODB_HOST']
        self.port = settings['MONGODB_PORT']

    def process_item(self, item, spider):
        # Create a MongoDB client connection object; host and port come from
        # settings.py, but they could also be written in directly.
        self.client = pymongo.MongoClient(self.host, self.port)
        # Use (or create) the database douban
        self.mydb = self.client['douban']
        # Use (or create) the collection doubanmovies inside douban
        self.mysheet = self.mydb['doubanmovies']
        # Convert the dict-like item to a plain Python dict
        content = dict(item)
        # Add the data to the collection
        self.mysheet.insert_one(content)
        return item
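Note that process_item above opens a new MongoClient for every item. A common variation is to open the connection once when the spider starts and close it when it finishes, using the open_spider/close_spider hooks. A minimal sketch under the same MONGODB_HOST/MONGODB_PORT settings (the class name DoubanMongoPipeline is hypothetical, not the class used above):

# -*- coding: utf-8 -*-
# Alternative pipeline sketch: one MongoDB connection per spider run.
import pymongo
from scrapy.utils.project import get_project_settings


class DoubanMongoPipeline(object):
    def open_spider(self, spider):
        # Called once when the spider opens: connect and pick the collection
        settings = get_project_settings()
        self.client = pymongo.MongoClient(settings['MONGODB_HOST'],
                                          settings['MONGODB_PORT'])
        self.sheet = self.client['douban']['doubanmovies']

    def close_spider(self, spider):
        # Called once when the spider closes: release the connection
        self.client.close()

    def process_item(self, item, spider):
        self.sheet.insert_one(dict(item))
        return item

If this variant is used instead, the ITEM_PIPELINES entry in settings.py has to point at it, e.g. 'douban.pipelines.DoubanMongoPipeline': 300.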
Configure the settings.py file:

# -*- coding: utf-8 -*-

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

USER_AGENT = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}

# MongoDB connection settings
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
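Depending on the Scrapy version, the generated settings.py may also contain ROBOTSTXT_OBEY = True, in which case requests filtered by robots.txt are dropped and the spider may collect nothing. This is an assumption about your project template, so check your own settings.py; for a practice project like this one the option can be switched off:

# Robots.txt handling (only needed if the template enabled it)
ROBOTSTXT_OBEY = False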
Test the spider from the terminal:
scrapy crawl douban
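If the run finishes without errors, the stored data can be checked in the MongoDB shell. A quick sketch, assuming the database douban and collection doubanmovies created by the pipeline above (commands shown for the classic mongo shell):

mongo
> use douban
> db.doubanmovies.count()        # should be around 250 documents
> db.doubanmovies.findOne()      # inspect one stored movie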
This completes using MongoDB to save the data crawled from Douban movies.