Using Scrapy to crawl Douban movies and store them in a MySQL database


D:
Switch to the D drive.

scrapy startproject douban
Create the Douban project.

cd douban
Enter the project directory.

scrapy genspider douban_spider movie.douban.com
Create the spider.
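
After these three commands, the project layout looks roughly like this (douban_spider.py is the file genspider just generated):

douban/
    scrapy.cfg
    douban/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            douban_spider.py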

Edit items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    serial_number = scrapy.Field()
    # serial number (ranking)
    movie_name = scrapy.Field()
    # name of the movie
    introduce = scrapy.Field()
    # introduction of the movie
    star = scrapy.Field()
    # star rating
    evaluate = scrapy.Field()
    # number of comments on the movie
    depict = scrapy.Field()
    # description of the movie
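
A DoubanItem behaves like a dictionary keyed by the declared fields, which is why the spider and pipeline below index it with names such as item['movie_name']. A minimal sketch (run e.g. from the project root, assuming the project module is named douban as above):

# Quick check in a Python shell: scrapy items support dict-style access.
from douban.items import DoubanItem

item = DoubanItem()
item["serial_number"] = "1"
item["movie_name"] = "The Shawshank Redemption"
print(dict(item))
# {'serial_number': '1', 'movie_name': 'The Shawshank Redemption'}
# Assigning to a field not declared in items.py raises KeyError, which catches typos early.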



Edit douban_spider.py:

# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem


class DoubanSpiderSpider(scrapy.Spider):
    name = 'douban_spider'
    # the spider's name
    allowed_domains = ['movie.douban.com']
    # allowed domains
    start_urls = ['https://movie.douban.com/top250']
    # entry URL for the engine, handed to the scheduler

    def parse(self, response):
        # default parse method
        movie_list = response.xpath("//*[@id='content']/div/div[1]/ol/li")
        # the list of movies shown on the current page
        for i_item in movie_list:
            # loop over the movie entries
            douban_item = DoubanItem()
            # instantiate the item defined in items.py
            douban_item["serial_number"] = i_item.xpath(".//div/div[1]/em/text()").extract_first()
            douban_item["movie_name"] = i_item.xpath(".//div/div[2]/div[1]/a/span[1]/text()").extract_first()
            content = i_item.xpath(".//div/div[2]/div[2]/p[1]/text()").extract()
            # the introduction wraps across lines; collapse the whitespace and join the parts
            douban_item["introduce"] = ";".join("".join(part.split()) for part in content)

            douban_item["star"] = i_item.xpath(".//div/div[2]/div[2]/div/span[2]/text()").extract_first()
            douban_item["evaluate"] = i_item.xpath(".//div/div[2]/div[2]/div/span[4]/text()").extract_first()
            douban_item["depict"] = i_item.xpath(".//div/div[2]/div[2]/p[2]/span/text()").extract_first()
            yield douban_item
            # yield the item to the pipelines

        next_link = response.xpath("//span[@class='next']/link/@href").extract()
        # parse the link to the next page
        if next_link:
            next_link = next_link[0]
            yield scrapy.Request("https://movie.douban.com/top250" + next_link, callback=self.parse)
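
Before running the full crawl, the XPath expressions can be sanity-checked in Scrapy's interactive shell. Douban tends to reject the default Scrapy User-Agent, so pass the one configured in settings.py below via -s (the output shown is illustrative):

scrapy shell -s USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0" "https://movie.douban.com/top250"
>>> response.xpath("//*[@id='content']/div/div[1]/ol/li/div/div[2]/div[1]/a/span[1]/text()").extract_first()
'肖申克的救赎'
>>> response.xpath("//span[@class='next']/link/@href").extract_first()
'?start=25&filter='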

Create a new mysqlpipeline.py:

from pymysql import connect
from douban import settings


class MysqlPipeline(object):
    def __init__(self):
        self.connect = connect(
            host=settings.host,
            port=settings.port,
            db=settings.db,
            user=settings.user,
            passwd=settings.passwd,
            charset='utf8',
            use_unicode=True)
        # connect to the database
        self.cursor = self.connect.cursor()
        # obtain an operation cursor via cursor()

    def process_item(self, item, spider):
        self.cursor.execute(
            """insert into douban (serial_number, movie_name, introduce, star, evaluate, depict)
            values (%s, %s, %s, %s, %s, %s)""",
            (item['serial_number'],
             item['movie_name'],
             item['introduce'],
             item['star'],
             item['evaluate'],
             item['depict']))
        # execute the SQL; the fields defined in the item map one-to-one to the table columns
        self.connect.commit()
        # commit the transaction
        return item
        # return the item to the engine

    def close_spider(self, spider):
        self.cursor.close()
        # close the cursor
        self.connect.close()
        # close the database connection
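
Before wiring the pipeline into Scrapy, it can be worth confirming the connection and the table with plain pymysql. A minimal sketch, using placeholder credentials that must match the settings added below:

# Standalone connectivity check (placeholder host/user/password; substitute your own).
from pymysql import connect

conn = connect(host='192.168.1.23', port=3306, db='scrapy',
               user='your_user', passwd='your_password', charset='utf8')
cursor = conn.cursor()
cursor.execute("select count(*) from douban")
print(cursor.fetchone())  # e.g. (0,) on a freshly created table
cursor.close()
conn.close()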

Modify the settings.py configuration file.

Line 19 becomes:

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0'
# set the browser User-Agent

Line 69 becomes:

ITEM_PIPELINES = {
    'douban.mysqlpipeline.MysqlPipeline': 300,
}
# enable the pipeline; 300 is its priority (lower values run earlier)

Add the MySQL database configuration at the end of the file:

host = '192.168.1.23'
# database address
port = 3306
# database port
db = 'scrapy'
# database name
user = 'your_user'
# database user name (placeholder, substitute your own)
passwd = 'your_password'
# database password (placeholder, substitute your own)

CREATE DATABASE scrapy;

Create the database.


CREATE TABLE `douban` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `serial_number` int(11) DEFAULT NULL COMMENT 'serial number',
  `movie_name` varchar(255) DEFAULT NULL COMMENT 'movie name',
  `introduce` varchar(255) DEFAULT NULL COMMENT 'movie introduction',
  `star` varchar(255) DEFAULT NULL COMMENT 'star rating',
  `evaluate` varchar(255) DEFAULT NULL COMMENT 'number of comments',
  `depict` varchar(255) DEFAULT NULL COMMENT 'movie description',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='Douban movie table';

Create the table.

scrapy crawl douban_spider --nolog

Run the spider (without printing logs).
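
When the crawl finishes, the data can be checked directly in MySQL; the Top 250 list should produce one row per movie:

SELECT serial_number, movie_name, star FROM douban ORDER BY id LIMIT 5;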

