Crawler--Scrapy Framework Case One: Mobile App Packet Capture

Source: Internet
Author: User

For example, capture the vertical live-room listing from the Douyu mobile app:

URL Address: http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0

Fields to crawl: room ID, room name, image link, local path of the stored image, nickname, online viewer count, and city.
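Before writing the spider, it is worth confirming the shape of the JSON the endpoint returns. Below is a minimal sketch using the requests library; the top-level "data" key and the field names are assumptions inferred from the spider code later in this article.

import requests

# quick check of the API endpoint captured from the app
url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0"
headers = {
    # mimic the app's User-Agent so the API responds as it would to the phone
    "User-Agent": "DYZB/2.290 (iPhone; iOS 9.3.4; Scale/2.00)",
}

response = requests.get(url, headers=headers, timeout=10)
data = response.json()["data"]  # assumed top-level "data" list

for room in data[:3]:
    # the fields the spider below relies on
    print(room["room_id"], room["room_name"], room["online"], room["anchor_city"])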

1.items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DouyuspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # room ID
    room_id = scrapy.Field()
    # room name
    room_name = scrapy.Field()
    # image link
    vertical_src = scrapy.Field()
    # local path where the image is stored
    image_path = scrapy.Field()
    # nickname
    nickname = scrapy.Field()
    # online viewer count
    online = scrapy.Field()
    # city
    anchor_city = scrapy.Field()
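A scrapy.Item behaves like a dict with a fixed set of keys, so only declared fields can be assigned; typos in field names fail fast instead of silently producing bad data. A minimal self-contained sketch of that behavior (standard Scrapy semantics, trimmed to two fields):

import scrapy

class DouyuspiderItem(scrapy.Item):
    room_id = scrapy.Field()
    nickname = scrapy.Field()

item = DouyuspiderItem()
item['room_id'] = '123456'       # declared field: accepted
item['nickname'] = 'anchor_01'

try:
    item['room_title'] = 'oops'  # undeclared field: raises KeyError
except KeyError as exc:
    print("rejected:", exc)

# dict(item) is what the JSON pipeline serializes later
print(dict(item))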

2.spiders/douyu.py

# -*- coding: utf-8 -*-
import json

import scrapy

from douyuSpider.items import DouyuspiderItem


class DouyuSpider(scrapy.Spider):
    name = 'douyu'
    allowed_domains = ['capi.douyucdn.cn']
    url = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        # flag that tells us whether to fetch the next page
        next_flag = False
        data = json.loads(response.text)["data"]

        for each in data:
            item = DouyuspiderItem()
            # room ID
            item['room_id'] = each["room_id"]
            # room name
            item['room_name'] = each["room_name"]
            # image link
            item['vertical_src'] = each["vertical_src"]
            # nickname
            item['nickname'] = each["nickname"]
            # online viewer count
            item['online'] = each["online"]
            # city
            item['anchor_city'] = each["anchor_city"]
            next_flag = True
            yield item

        # keep paging while the API still returns rooms; an empty "data"
        # list leaves next_flag False and ends the crawl
        if next_flag:
            self.offset += 20
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
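The spider pages through the API twenty rooms at a time and stops as soon as a response comes back with an empty "data" list. Besides running scrapy crawl douyu from the project directory, the spider can also be started from a plain script; here is a minimal sketch using Scrapy's CrawlerProcess, assuming the project layout used in this article:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from douyuSpider.spiders.douyu import DouyuSpider

# load settings.py so the pipelines and request headers take effect
process = CrawlerProcess(get_project_settings())
process.crawl(DouyuSpider)
process.start()  # blocks until the crawl finishes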

3.pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings


# pipeline that stores the item data in a JSON file
class DouyuspiderPipeline(object):
    def __init__(self):
        self.file = open("douyu.json", "w", encoding="utf-8")
        self.first_flag = True

    def process_item(self, item, spider):
        if self.first_flag:
            self.first_flag = False
            content = "[\n" + json.dumps(dict(item), ensure_ascii=False)
        else:
            content = ",\n" + json.dumps(dict(item), ensure_ascii=False)
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.file.write("\n]")
        self.file.close()


# pipeline that downloads the room images (it reuses the name of the Scrapy
# class it subclasses; settings.py refers to this project-local version)
class ImagesPipeline(ImagesPipeline):
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        image_url = item["vertical_src"]
        yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # get the stored image path and keep it only if the download
        # succeeded; see the ImagesPipeline source for the shape of `results`
        image_path = [x["path"] for ok, x in results if ok]
        # rename the downloaded file after the anchor's nickname
        os.rename(self.IMAGES_STORE + "/" + image_path[0],
                  self.IMAGES_STORE + "/" + item["nickname"] + ".jpg")
        item["image_path"] = self.IMAGES_STORE + item["nickname"] + ".jpg"
        return item
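item_completed receives one (success, info) pair per request yielded by get_media_requests, and the rename above only works because each item downloads exactly one image. A minimal sketch of the structure this pipeline unpacks; all values below are illustrative, not real output:

# hypothetical `results` for one item with a single successful download
results = [
    (True, {
        "url": "http://example.com/vertical.jpg",  # made-up image URL
        "path": "full/0a1b2c3d4e5f.jpg",           # path relative to IMAGES_STORE
        "checksum": "abcdef0123456789",
    }),
]

# the same list comprehension the pipeline uses: keep only successful paths
image_path = [x["path"] for ok, x in results if ok]
print(image_path)  # ['full/0a1b2c3d4e5f.jpg']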

4.settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for douyuSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douyuSpider'

SPIDER_MODULES = ['douyuSpider.spiders']
NEWSPIDER_MODULE = 'douyuSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'douyuSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': 'DYZB/2.290 (iPhone; iOS 9.3.4; Scale/2.00)',
    # 'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douyuSpider.middlewares.DouyuspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douyuSpider.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douyuSpider.pipelines.DouyuspiderPipeline': 300,
    'douyuSpider.pipelines.ImagesPipeline': 200,
}

# Image storage location; the image pipeline in pipelines.py reads this setting
IMAGES_STORE = "Images\\"

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
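Two details worth noting about this configuration. First, scrapy.pipelines.images.ImagesPipeline needs the Pillow library installed, or image downloads will fail. Second, Scrapy runs item pipelines in ascending order of their ITEM_PIPELINES value, so the image pipeline (200) processes each item before the JSON pipeline; the 300 shown for the JSON pipeline is an assumed value, since it was lost in the source, but any number above 200 preserves the required order. A tiny sketch of that ordering:

# Scrapy sorts pipelines by priority value: lower runs first, so
# image_path is already filled in by the time the item is serialized
ITEM_PIPELINES = {
    'douyuSpider.pipelines.DouyuspiderPipeline': 300,  # runs second: writes JSON
    'douyuSpider.pipelines.ImagesPipeline': 200,       # runs first: downloads image
}

for path, priority in sorted(ITEM_PIPELINES.items(), key=lambda kv: kv[1]):
    print(priority, path)
# 200 douyuSpider.pipelines.ImagesPipeline
# 300 douyuSpider.pipelines.DouyuspiderPipeline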

