Crawling Bole Online (blog.jobbole.com) with Python and Scrapy: full version


1.jobbole.py

import re
import datetime
from urllib import parse

import scrapy
from scrapy.http import Request
from scrapy.loader import ItemLoader

from ArticleSpider.items import JobBoleArticleItem
from ArticleSpider.utils.common import get_md5


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1. Extract the article URLs on the list page, hand them to scrapy to download, then parse them.
        2. Extract the URL of the next page, hand it to scrapy to download, then parse it once downloaded.
        """
        # Parse all article URLs on the list page and hand them to scrapy to download and parse
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            # Get the URL of the cover image
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            # Once the Request has been downloaded, the callback parse_detail parses the article detail page
            # Request(url=post_url, callback=self.parse_detail)
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)
            # parse.urljoin handles hrefs that carry no domain
            # (equivalent to response.url + post_url)

        # Extract the next page and hand it to scrapy to download
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        # --- Manual extraction, kept for reference (superseded by the ItemLoader version below) ---
        # front_image_url = response.meta.get("front_image_url", "")  # article cover image
        # title = response.css(".entry-header h1::text").extract_first()
        # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·", "").strip()
        # praise_nums = response.css(".vote-post-up h10::text").extract()[0]
        # fav_nums = response.css(".bookmark-btn::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        #
        # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        #
        # # content = response.css("div.entry::text").extract()
        # content = response.css('div.entry').extract_first()
        #
        # tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("comment")]
        # tags = ",".join(tag_list)
        # try:
        #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        # except Exception as e:
        #     create_date = datetime.datetime.now().date()
        #
        # article_item = JobBoleArticleItem()
        # article_item["title"] = title
        # article_item["url"] = response.url
        # article_item["create_date"] = create_date
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_nums
        # article_item["comment_nums"] = comment_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["tags"] = tags
        # article_item["content"] = content
        # article_item["url_object_id"] = get_md5(response.url)

        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")
        # load_item() applies the loader rules and builds the item object
        article_item = item_loader.load_item()

        yield article_item
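The spider imports get_md5 from ArticleSpider/utils/common.py, a helper module the post never shows. As a rough sketch (an assumption, not the original author's file), it only needs to hash the URL into a fixed-length id that can be stored as url_object_id:

# ArticleSpider/utils/common.py -- assumed implementation, not shown in the original post
import hashlib


def get_md5(url):
    # Hash the URL into a fixed-length hex digest so it can serve as url_object_id
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()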

2.main.py

import sys
import os

from scrapy.cmdline import execute

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])

3.items.py

import datetime
import re

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


def date_convert(value):
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception as e:
        create_date = datetime.datetime.now().date()
    return create_date


def get_nums(value):
    match_re = re.match(".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums


def remove_comment_tags(value):
    # Drop the comment counts that get extracted along with the tags
    if "comment" in value:
        return ""
    else:
        return value


def return_value(value):
    return value


# class JobBoleArticleItem(scrapy.Item):
#     title = scrapy.Field()
#     create_date = scrapy.Field()
#     url = scrapy.Field()
#     url_object_id = scrapy.Field()
#     front_image_url = scrapy.Field()
#     front_image_path = scrapy.Field()
#     praise_nums = scrapy.Field()
#     comment_nums = scrapy.Field()
#     fav_nums = scrapy.Field()
#     content = scrapy.Field()
#     tags = scrapy.Field()


class ArticleItemLoader(ItemLoader):
    # Custom ItemLoader: take only the first extracted value for every field by default
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    # tags is itself a list, so its output processor has to be overridden
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()
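As a quick illustration of what the processors above do to the raw strings scraped from the page, here is a small, hypothetical check (the module path ArticleSpider.items and the sample strings are assumptions):

from ArticleSpider.items import date_convert, get_nums, remove_comment_tags

print(get_nums("3 comments"))              # -> 3, the first integer in the string
print(get_nums("no digits here"))          # -> 0, the fallback when the regex finds nothing
print(remove_comment_tags("2 comments"))   # -> "", comment counts are stripped from the tag list
print(remove_comment_tags("python"))       # -> "python", real tags pass through unchanged
print(date_convert("2017/06/01"))          # -> datetime.date(2017, 6, 1)
print(date_convert("not a date"))          # -> today's date as the fallback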

4.pipelines.py

import codecs
import json

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class ArticleImagePipeline(ImagesPipeline):
    # Override this method to get the actual local path of the downloaded image from results
    def item_completed(self, results, item, info):
        for ok, value in results:
            image_file_path = value["path"]
        item["front_image_path"] = image_file_path
        return item


class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # **dbparms --> adbapi.ConnectionPool("MySQLdb", host=settings['MYSQL_HOST'], ...)
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Use twisted to turn the MySQL insert into an asynchronous operation
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # Handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Perform the actual insert:
        # build a different SQL statement for each item type and insert it into MySQL
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)


class JsonWithEncodingPipeline(object):
    # Custom JSON file export
    def __init__(self):
        # Open the file with codecs to avoid encoding problems
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        # Convert the item to a dict, then call dumps to produce the JSON string;
        # ensure_ascii=False avoids garbled Chinese characters
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def spider_closed(self, spider):
        # Called when the spider closes (the spider_closed signal)
        self.file.close()


class JsonExporterPipeline(object):
    # Use the JsonItemExporter provided by scrapy to export the JSON file
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class MysqlPipeline(object):
    # Write to MySQL synchronously
    def __init__(self):
        self.conn = MySQLdb.connect('localhost', 'root', '123456', 'article_spider',
                                    charset='utf8', use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, create_date, url, url_object_id,
                front_image_url, praise_nums, comment_nums, fav_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["create_date"], item["url"],
                                         item["url_object_id"], item["front_image_url"],
                                         item["praise_nums"], item["comment_nums"],
                                         item["fav_nums"], item["tags"], item["content"]))
        self.conn.commit()
        return item
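Note that MysqlTwistedPipline calls item.get_insert_sql(), a method the JobBoleArticleItem shown in items.py does not define. If you enable that pipeline, the item needs something along these lines; this is a hedged sketch, since the original post does not include it:

# Hypothetical method to add to JobBoleArticleItem in items.py (not shown in the original post)
class JobBoleArticleItem(scrapy.Item):
    # ... fields as defined in items.py ...

    def get_insert_sql(self):
        # Build the SQL statement and parameters consumed by MysqlTwistedPipline.do_insert
        insert_sql = """
            insert into jobbole_article(title, create_date, url, url_object_id,
                front_image_url, praise_nums, comment_nums, fav_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        # front_image_url stays a list (its output processor returns the value as-is),
        # so take the first element before inserting
        params = (self["title"], self["create_date"], self["url"], self["url_object_id"],
                  self["front_image_url"][0], self["praise_nums"], self["comment_nums"],
                  self["fav_nums"], self["tags"], self["content"])
        return insert_sql, params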

5.settings.py

import os

BOT_NAME = 'ArticleSpider'

SPIDER_MODULES = ['ArticleSpider.spiders']
NEWSPIDER_MODULE = 'ArticleSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ArticleSpider.middlewares.ArticlespiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    # 'ArticleSpider.pipelines.ArticleImagePipeline': 1,
    'ArticleSpider.pipelines.JsonExporterPipeline': 2,
    # 'ArticleSpider.pipelines.MysqlPipeline': 4,
}
IMAGES_URLS_FIELD = "front_image_url"
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
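In the settings above, only ArticlespiderPipeline and JsonExporterPipeline are active; the image pipeline is commented out and neither MySQL pipeline is enabled. If you also want cover images downloaded and articles written to MySQL asynchronously, ITEM_PIPELINES could look roughly like this (the priorities mirror the commented entries; this is a sketch, not part of the original settings):

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,    # download cover images and record the local path
    'ArticleSpider.pipelines.JsonExporterPipeline': 2,    # keep the JSON export
    'ArticleSpider.pipelines.MysqlTwistedPipline': 4,     # asynchronous MySQL insert (requires get_insert_sql on the item)
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
}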

