Crawler Example: Getting Started



Objective: crawl 100 pages from Baidu Baike (Baidu's online encyclopedia), starting from the Python entry.

 

Tool environment: Python 3.5, Sublime Text 3

 

Crawler Scheduler: spider_main.py

# coding: utf8
import socket

# If the modules live in a package, import them like this instead:
# from baike_spider import url_manager, html_downloader, html_parser, html_outputer
import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    # Constructor
    def __init__(self):
        self.urls = url_manager.UrlManager()                 # URL manager
        self.downloader = html_downloader.HtmlDownloader()   # downloader
        self.parser = html_parser.HtmlParser()               # parser
        self.outputer = html_outputer.HtmlOutputer()         # outputer
        self.log_datas = []                                   # log list

    # Crawler scheduler
    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))
                # The download can hang or time out here; see the global timeout set in __main__.
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                # Log this page
                self.collect_log('craw %d: %s' % (count, new_url))
                if count == 100:
                    break
                count = count + 1
            except Exception:
                print('craw failed!')
        self.outputer.output_html()
        # Write the log file
        self.write_log()

    # Collect a log entry
    def collect_log(self, data):
        if data is None:
            return
        self.log_datas.append(data)

    # Write the log to a file
    def write_log(self):
        fout = open('output.txt', 'w', encoding='utf-8')  # write mode, explicit file encoding
        for data in self.log_datas:
            fout.write(data + '\r\n')
        fout.close()


if __name__ == "__main__":
    socket.setdefaulttimeout(5)  # set the global socket timeout to 5 s
    root_url = "http://baike.baidu.com/item/Python"
    # root_url = r"http://baike.baidu.com/item/%E9%BB%91%E6%B2%B3%E5%AD%A6%E9%99%A2"
    obj_spider = SpiderMain()
    print('spider starting ....')
    obj_spider.craw(root_url)

 

 

URL manager: url_manager.py

# coding: utf8

# URL manager
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()
        print('init url manager')

    # Add a single new URL to the manager
    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    # Add multiple new URLs to the manager
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)  # reuse add_new_url for each one

    # Check whether the manager still holds unvisited URLs
    def has_new_url(self):
        return len(self.new_urls) != 0

    # Take one URL out of the manager: remove it from the new set and record it in the old set
    def get_new_url(self):
        new_url = self.new_urls.pop()  # fetch and remove
        self.old_urls.add(new_url)
        return new_url
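As a quick sanity check of the manager's de-duplication, a short test snippet (my own, not part of the original article; the second URL is a made-up placeholder) could look like this:

# Hypothetical test of UrlManager; the Guido URL is only a placeholder.
from url_manager import UrlManager

manager = UrlManager()
manager.add_new_url('http://baike.baidu.com/item/Python')
manager.add_new_urls([
    'http://baike.baidu.com/item/Python',   # duplicate, silently ignored
    'http://baike.baidu.com/item/Guido',    # placeholder URL
])

while manager.has_new_url():
    print(manager.get_new_url())            # each URL comes out exactly once

print(manager.has_new_url())                # False: everything is now in old_urls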

 

 

Download tool: html_downloader.py

# coding: utf8
# In Python 2 this was "import urllib"; in Python 3 use urllib.request.
import urllib.request


# Downloader
class HtmlDownloader(object):
    def __init__(self):
        print('init html downloader')

    def download(self, url):
        if url is None:
            return None
        # This call can easily hang or time out. The fix used in this article is the
        # global timeout set in the crawler's entry point: socket.setdefaulttimeout(10)
        # (10 s), which saves repeating a timeout on every call.
        response = urllib.request.urlopen(url)
        # Alternative way to avoid hanging downloads: a per-request timeout.
        # response = urllib.request.urlopen(url, timeout=3)
        if response.getcode() != 200:
            return None
        return response.read()
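If you would rather not rely on the global socket timeout, the per-request variant mentioned in the comment can be combined with explicit error handling. The sketch below is my own illustration, not part of the original article:

# coding: utf8
import socket
import urllib.error
import urllib.request


def download_with_timeout(url, timeout=3):
    # Per-request timeout instead of socket.setdefaulttimeout(); returns None on any failure.
    if url is None:
        return None
    try:
        response = urllib.request.urlopen(url, timeout=timeout)
    except (urllib.error.URLError, socket.timeout):
        # Covers unreachable hosts, HTTP errors and read timeouts.
        return None
    if response.getcode() != 200:
        return None
    return response.read()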

 

 

Parser: html_parser.py

# coding: utf8
import re
# In Python 2 this was "import urlparse"; in Python 3 use urllib.parse.
import urllib.parse

from bs4 import BeautifulSoup


# Parser
class HtmlParser(object):
    def __init__(self):
        print('init html parser')

    # Collect new links found on the page
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # Old-style entry links looked like /view/123.htm:
        # links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            # Python 2: new_full_url = urlparse.urljoin(page_url, new_url)
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    # Extract the data we care about from the page
    def _get_new_data(self, page_url, soup):
        # Collect the results in a dictionary
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title">
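The listing is cut off at this point in the source. A plausible completion, based on the helper names above and the scheduler's call to self.parser.parse(new_url, html_cont), is sketched below; the Baike-specific class names ("lemmaWgt-lemmaTitle-title", "lemma-summary") and the 'title'/'summary' dictionary keys are assumptions, not taken from the original listing:

        # Assumed: the page title sits in <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # Assumed: the summary sits in <div class="lemma-summary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    # Entry point used by the scheduler: returns (new_urls, new_data)
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None, None
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data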

 

 

Output: html_outputer.py

# coding: utf8

# Outputer: collects the parsed data and writes it to an HTML file
class HtmlOutputer(object):
    # Constructor
    def __init__(self):
        self.datas = []  # list of collected page data
        print('init html outputer')

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # fout = open('output.html', 'w')  # write mode without an explicit encoding
        fout = open('output.html', 'w', encoding='utf-8')  # write mode, explicit file encoding
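The rest of output_html is also cut off in the source (the HTML markup appears to have been swallowed during extraction). A minimal completion sketch, assuming each collected data dictionary carries the 'url', 'title' and 'summary' keys produced by the parser, writes one table row per crawled page:

        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()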

 

 

Final effect: the console prints the craw progress for each of the 100 pages, and the collected titles and summaries are written to output.html, with a plain-text log in output.txt.

 

 

Summary:

Learn more about Python crawlers and explore the unknown!

 
