As an example, let's crawl the live-stream listings from the Douyu (斗鱼) mobile app API:
URL address: http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0
Fields to crawl: room ID, room name, image link, local path where the image is stored, nickname, online viewer count, city
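Before writing any Scrapy code, it is worth confirming what the API actually returns. Below is a minimal sketch using the third-party requests library (an assumption of this sketch, not part of the project); the field names are taken from the spider code later in this post, and the User-Agent mirrors the mobile one set in settings.py. The endpoint may of course have changed since this was written.

import json
import requests

url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0"
# Pretend to be the Douyu iPhone app, as settings.py does below.
headers = {"User-Agent": "DYZB/2.290 (iPhone; iOS 9.3.4; Scale/2.00)"}

response = requests.get(url, headers=headers)
rooms = json.loads(response.text)["data"]
for room in rooms:
    # Field names as used by the spider below.
    print(room["room_id"], room["room_name"], room["online"], room["anchor_city"])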
1.items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DouyuspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # room ID
    room_id = scrapy.Field()
    # room name
    room_name = scrapy.Field()
    # image link
    vertical_src = scrapy.Field()
    # local path where the downloaded image is stored
    image_path = scrapy.Field()
    # nickname of the anchor
    nickname = scrapy.Field()
    # online viewer count
    online = scrapy.Field()
    # city of the anchor
    anchor_city = scrapy.Field()
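A scrapy.Item behaves like a dictionary with a fixed set of declared keys, which is what lets the JSON pipeline later in this post call dict(item). A quick illustration with hypothetical values:

item = DouyuspiderItem()
item["room_id"] = "123456"        # hypothetical value
item["nickname"] = "some_anchor"  # hypothetical value
print(dict(item))  # {'room_id': '123456', 'nickname': 'some_anchor'}
# Assigning a key that was not declared as a Field raises KeyError.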
2.spiders/douyu.py
# -*- coding: utf-8 -*-
import scrapy
from douyuSpider.items import DouyuspiderItem
import json


class DouyuSpider(scrapy.Spider):
    name = 'douyu'
    allowed_domains = ['capi.douyucdn.cn']
    url = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        # flag marking whether to crawl the next page
        next_flag = False
        data = json.loads(response.text)["data"]
        for each in data:
            item = DouyuspiderItem()
            # room ID
            item['room_id'] = each["room_id"]
            # room name
            item['room_name'] = each["room_name"]
            # image link
            item['vertical_src'] = each["vertical_src"]
            # nickname
            item['nickname'] = each["nickname"]
            # online viewer count
            item['online'] = each["online"]
            # city
            item['anchor_city'] = each["anchor_city"]
            next_flag = True
            yield item

        # keep requesting the next page as long as the current one returned data
        if next_flag:
            self.offset += 20
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
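The spider can be launched with the usual scrapy crawl douyu from the project root. If you prefer starting it from a script, Scrapy's documented CrawlerProcess API does the same; this is a sketch assuming the standard layout generated by scrapy startproject:

# run.py, placed next to scrapy.cfg in the project root
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("douyu")  # the spider name defined in douyu.py
process.start()         # blocks until the crawl finishes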
3.pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import os
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings


# pipeline that stores the item information in a JSON file
class DouyuspiderPipeline(object):
    def __init__(self):
        self.file = open("douyu.json", "w", encoding="utf-8")
        self.first_flag = True

    def process_item(self, item, spider):
        if self.first_flag:
            self.first_flag = False
            content = "[\n" + json.dumps(dict(item), ensure_ascii=False)
        else:
            content = ",\n" + json.dumps(dict(item), ensure_ascii=False)
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.file.write("\n]")
        self.file.close()


# pipeline that downloads the images
class ImagesPipeline(ImagesPipeline):
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        image_url = item["vertical_src"]
        yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # fixed boilerplate: get the image path and check that the download
        # succeeded; if so, keep the path in image_path
        # (see the ImagesPipeline source for details)
        image_path = [x["path"] for ok, x in results if ok]
        os.rename(self.IMAGES_STORE + "/" + image_path[0],
                  self.IMAGES_STORE + "/" + item["nickname"] + ".jpg")
        item["image_path"] = self.IMAGES_STORE + item["nickname"] + ".jpg"
        return item
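For reference, the results argument of item_completed is a list with one (success, info) tuple per request yielded from get_media_requests; when success is True, info is a dict whose 'path' is relative to IMAGES_STORE. A sketch of its shape, with hypothetical values:

# What `results` might look like for one successfully downloaded image
results = [
    (True, {
        "url": "http://example.com/some_image.jpg",  # hypothetical
        "path": "full/0a79c461a4062ac383dc4fade7bc09f1.jpg",  # relative to IMAGES_STORE
        "checksum": "2a79c461a4062ac383dc4fade7bc09f1",  # hypothetical
    }),
]
image_paths = [x["path"] for ok, x in results if ok]  # -> ['full/0a79...jpg']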
4.settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for douyuSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douyuSpider'

SPIDER_MODULES = ['douyuSpider.spiders']
NEWSPIDER_MODULE = 'douyuSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'douyuSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers: use the Douyu iPhone app's User-Agent
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': 'DYZB/2.290 (iPhone; iOS 9.3.4; Scale/2.00)',
    # 'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douyuSpider.middlewares.DouyuspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douyuSpider.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douyuSpider.pipelines.DouyuspiderPipeline': 300,
    'douyuSpider.pipelines.ImagesPipeline': 200,
}

# Where images are stored; referenced later in pipelines.py
IMAGES_STORE = "Images\\"

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
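One detail worth noting in ITEM_PIPELINES: the numbers are priorities in the 0-1000 range, and pipelines with lower values run first. Giving ImagesPipeline a lower number than DouyuspiderPipeline means each image is downloaded and item['image_path'] is filled in before the item is serialized to douyu.json.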