pipelines.py

```python
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class Images360Pipeline(object):
    def process_item(self, item, spider):
        return item


# MongoDB
class MongoPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # item.collection names the MongoDB collection;
        # insert_one replaces the deprecated insert() used in older pymongo
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()


# MySQL
class MysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT')
        )

    def open_spider(self, spider):
        # Keyword arguments are required by recent pymysql releases
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password, database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # Build a parameterized INSERT from the item's fields;
        # item.table names the target MySQL table
        data = dict(item)
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item


# Download images
class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Name each file after the last segment of its URL
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    # If the image download failed, drop the item so it is not saved to the
    # database. IMAGES_STORE in settings.py (e.g. './images') is where the
    # downloaded files are saved.
    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image downloaded failed')
        return item

    def get_media_requests(self, item, info):
        yield Request(item['url'])
```
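The pipelines above expect each item to expose a collection name for MongoDB and a table name for MySQL, plus a `url` field for the image request. A minimal sketch of a matching items.py follows; the class name and field list are assumptions inferred from the pipeline code, not shown in the original:

```python
# items.py -- a minimal sketch; class and field names are assumptions
# inferred from the pipelines (item.collection, item.table, item['url'])
from scrapy import Item, Field


class ImageItem(Item):
    # Read by MongoPipeline (item.collection) and MysqlPipeline (item.table)
    collection = table = 'images'

    id = Field()
    url = Field()
    title = Field()
    thumb = Field()
```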
settings.py Configuration
```python
# Only part of the code is listed. ImagePipeline runs first:
# the lowest priority number executes earliest.
ITEM_PIPELINES = {
    'images360.pipelines.ImagePipeline': 300,
    'images360.pipelines.MongoPipeline': 301,
    'images360.pipelines.MysqlPipeline': 302,
}
MAX_PAGE = 50
MONGO_URL = 'localhost'
MONGO_DB = 'images360'
BOT_NAME = 'images360'
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'images360'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_PORT = 3306  # must be an int, since pymysql expects a numeric port

# Path where downloaded images are saved
IMAGES_STORE = './images'

SPIDER_MODULES = ['images360.spiders']
NEWSPIDER_MODULE = 'images360.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'images360 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
```
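Note that MysqlPipeline inserts into an existing table whose columns must match the item's fields; Scrapy does not create the table for you. A one-off sketch for creating it with pymysql, assuming the `ImageItem` fields above (the table name and column types are assumptions):

```python
# create_table.py -- one-off helper, run once before crawling.
# Credentials mirror settings.py; the column set is an assumption
# matching the ImageItem sketch above.
import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456',
                     port=3306, database='images360')
cursor = db.cursor()
cursor.execute(
    'CREATE TABLE IF NOT EXISTS images ('
    'id VARCHAR(64) PRIMARY KEY, '
    'url VARCHAR(255), '
    'title VARCHAR(255), '
    'thumb VARCHAR(255))'
)
db.close()
```

With the table in place, `scrapy crawl <spider_name>` (the spider name is not shown in the original) starts the crawl; each item passes through ImagePipeline first, then MongoPipeline and MysqlPipeline in priority order.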
Scrapy: save items to MySQL or MongoDB, and download and save images