D:
Switch to the D drive.
scrapy startproject douban
Create the Douban project.
cd douban
Enter the project directory.
scrapy genspider douban_spider movie.douban.com
Create the spider.
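For orientation, the two commands above generate the standard Scrapy project layout (file names come from Scrapy's default template):
douban/
    scrapy.cfg             # deploy configuration
    douban/
        __init__.py
        items.py           # item definitions (edited below)
        middlewares.py
        pipelines.py
        settings.py        # project settings (edited below)
        spiders/
            __init__.py
            douban_spider.py   # created by genspider (edited below)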
Edit items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    serial_number = scrapy.Field()
    # serial number (ranking on the list)
    movie_name = scrapy.Field()
    # name of the movie
    introduce = scrapy.Field()
    # introduction to the movie
    star = scrapy.Field()
    # star rating
    evaluate = scrapy.Field()
    # number of ratings
    depict = scrapy.Field()
    # short description of the movie
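As an aside, a DoubanItem behaves like a dict restricted to the declared fields. A minimal standalone sketch (the sample title is made up, not scraped data):
from douban.items import DoubanItem

item = DoubanItem()
item['movie_name'] = 'The Shawshank Redemption'  # hypothetical sample value
print(item['movie_name'])
print(dict(item))  # items convert cleanly to plain dicts
# item['year'] = 1994  # would raise KeyError: only declared fields are allowed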
Edit douban_spider.py:
# -*- coding: utf-8 -*-
import scrapy

from douban.items import DoubanItem


class DoubanSpiderSpider(scrapy.Spider):
    name = 'douban_spider'
    # the spider's name
    allowed_domains = ['movie.douban.com']
    # allowed domain names
    start_urls = ['https://movie.douban.com/top250']
    # entry URL handed to the engine, which places it in the scheduler

    def parse(self, response):
        # default parsing method
        movie_list = response.xpath("//*[@id='content']/div/div[1]/ol/li")
        # the list of movies shown on the current page
        for i_item in movie_list:
            # loop over the movie entries
            douban_item = DoubanItem()
            # instantiate the item class from items.py
            douban_item['serial_number'] = i_item.xpath(".//div/div[1]/em/text()").extract_first()
            douban_item['movie_name'] = i_item.xpath(".//div/div[2]/div[1]/a/span[1]/text()").extract_first()
            content = i_item.xpath(".//div/div[2]/div[2]/p[1]/text()").extract()
            for i_content in content:
                # strip the line breaks and whitespace inside the introduction
                content_s = "".join(i_content.split())
                douban_item['introduce'] = content_s
            douban_item['star'] = i_item.xpath(".//div/div[2]/div[2]/div/span[2]/text()").extract_first()
            douban_item['evaluate'] = i_item.xpath(".//div/div[2]/div[2]/div/span[4]/text()").extract_first()
douban_item[ "depict"] = I_item.xpath (
yield Douban_item
# need to yield the data to pipelines.
Next_link = Response.xpath ( # parse the next page, taking the XPath
if next_link:
Next_link = Next_link[0]
Yield scrapy. Request ( "https://movie.douban.com/top250" + next_link, callback=self.parse)
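Before running the spider, the XPaths are easy to sanity-check interactively in the Scrapy shell. A sketch of such a session (set the USER_AGENT described below first, since Douban tends to reject Scrapy's default user agent; the count of 25 assumes 25 movies per page):
scrapy shell "https://movie.douban.com/top250"
>>> movie_list = response.xpath("//*[@id='content']/div/div[1]/ol/li")
>>> len(movie_list)  # expect 25 entries per page
>>> movie_list[0].xpath(".//div/div[2]/div[1]/a/span[1]/text()").extract_first()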
Create a new mysqlpipeline.py:
from pymysql import connect

from douban import settings


class MysqlPipeline(object):
    def __init__(self):
        self.connect = connect(
            host=settings.HOST,
            port=settings.PORT,
            db=settings.DB,
            user=settings.USER,
            passwd=settings.PASSWD,
            charset='utf8',
            use_unicode=True)
        # connect to the database
        self.cursor = self.connect.cursor()
        # get an operation cursor with the cursor() method

    def process_item(self, item, spider):
        self.cursor.execute(
            """INSERT INTO douban (serial_number, movie_name, introduce, star, evaluate, depict)
               VALUES (%s, %s, %s, %s, %s, %s)""",
            (item['serial_number'],
             item['movie_name'],
             item['introduce'],
             item['star'],
             item['evaluate'],
             item['depict']))
        # execute the SQL statement; the fields defined in the item map one-to-one to the table columns
        self.connect.commit()
        # commit the transaction
        return item
        # return the item for any later pipelines

    def close_spider(self, spider):
        self.cursor.close()
        # close the cursor
        self.connect.close()
        # close the database connection
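Importing the settings module directly works, but Scrapy also offers the from_crawler hook, which receives the live settings object. A hedged sketch of the same constructor written that way (setting names as defined at the end of settings.py below):
import pymysql

class MysqlPipeline(object):
    def __init__(self, host, port, db, user, passwd):
        self.connect = pymysql.connect(host=host, port=port, db=db,
                                       user=user, passwd=passwd,
                                       charset='utf8', use_unicode=True)
        self.cursor = self.connect.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # read the values added at the end of settings.py
        s = crawler.settings
        return cls(s.get('HOST'), s.getint('PORT'), s.get('DB'),
                   s.get('USER'), s.get('PASSWD'))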
Modify the settings.py configuration file:
Change line 19 to:
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0'
# set a browser user agent
Change line 69 to:
ITEM_PIPELINES = {
    'douban.mysqlpipeline.MysqlPipeline': 300,
}
# enable the pipeline
Add the MySQL database configuration at the end of the file:
HOST = '192.168.1.23'
# database address
PORT = 3306
# database port
DB = 'scrapy'
# database name
USER = '...'
# database user name (fill in your own)
PASSWD = '...'
# database password (fill in your own)
CREATE DATABASE scrapy;
Create the database.
CREATE TABLE `douban` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `serial_number` INT(11) DEFAULT NULL COMMENT 'serial number',
  `movie_name` VARCHAR(255) DEFAULT NULL COMMENT 'movie name',
  `introduce` VARCHAR(255) DEFAULT NULL COMMENT 'movie introduction',
  `star` VARCHAR(255) DEFAULT NULL COMMENT 'star rating',
  `evaluate` VARCHAR(255) DEFAULT NULL COMMENT 'number of ratings',
  `depict` VARCHAR(255) DEFAULT NULL COMMENT 'movie description',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='Douban movie table';
Create the table.
scrapy crawl douban_spider --nolog
Run the spider (without printing logs).
Scrapy now crawls the Douban Top 250 movies and stores them in the MySQL database.
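To confirm the run worked, a minimal hedged check with pymysql (connection values mirror settings.py; the user and password placeholders are yours to fill in):
import pymysql

conn = pymysql.connect(host='192.168.1.23', port=3306, db='scrapy',
                       user='...', passwd='...', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("SELECT serial_number, movie_name, star FROM douban LIMIT 5")
    for row in cursor.fetchall():
        print(row)  # first five scraped movies
conn.close()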