Crawler example: getting started
Objective: crawl 100 pages from Baidu Encyclopedia (Baike), starting from the Python entry
Tool environment: Python 3.5, Sublime Text 3
Crawler Scheduler: spider_main.py
# coding: utf8
# from baike_spider import url_manager, html_downloader, html_parser, html_outputer
import url_manager, html_downloader, html_parser, html_outputer
import socket


class SpiderMain(object):
    # Constructor
    def __init__(self):
        self.urls = url_manager.UrlManager()                 # URL manager
        self.downloader = html_downloader.HtmlDownloader()   # downloader
        self.parser = html_parser.HtmlParser()               # parser
        self.outputer = html_outputer.HtmlOutputer()         # outputer
        self.log_datas = []                                  # log list

    # Crawler scheduler
    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))
                # The download can hang or time out here and leave the program stuck.
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                # Log this page
                self.collect_log('craw %d: %s' % (count, new_url))
                if count == 100:
                    break
                count = count + 1
            except Exception:
                print('craw failed!')
        self.outputer.output_html()
        # Write the log
        self.write_log()

    # Collect a log entry
    def collect_log(self, data):
        if data is None:
            return
        self.log_datas.append(data)

    # Write the log to a file
    def write_log(self):
        # Output file in write mode with an explicit encoding
        fout = open('output.txt', 'w', encoding='utf-8')
        # fout.write(str(self.log_datas))
        for data in self.log_datas:
            # fout.write(str(data))
            fout.write(data + '\r\n')
        fout.close()


if __name__ == "__main__":
    socket.setdefaulttimeout(5)  # set the global timeout to 5 s
    root_url = "http://baike.baidu.com/item/Python"
    # root_url = r"http://baike.baidu.com/item/%E9%BB%91%E6%B2%B3%E5%AD%A6%E9%99%A2"
    obj_spider = SpiderMain()
    print('spider starting....')
    obj_spider.craw(root_url)
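Because a single slow page can stall the whole while loop, the scheduler sets socket.setdefaulttimeout(5) before starting. A minimal alternative sketch, assuming the same HtmlDownloader interface, is to retry the download a couple of times and skip the page when it keeps timing out (download_with_retry and its retries parameter are illustrative, not part of the original code):

import socket
import urllib.error


def download_with_retry(downloader, url, retries=2):
    # Try the download a few times before giving up on this url.
    # 'downloader' is assumed to expose the same download(url) method as HtmlDownloader.
    for attempt in range(retries + 1):
        try:
            return downloader.download(url)
        except (socket.timeout, urllib.error.URLError) as e:
            print('download attempt %d for %s failed: %s' % (attempt + 1, url, e))
    return None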
URL manager: url_manager.py
# coding: utf8


# URL manager
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()
        print('init url manager')

    # Add a single new url to the manager
    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    # Add multiple new urls to the manager
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)  # reuse add_new_url above

    # Check whether the manager still holds an uncrawled url
    def has_new_url(self):
        return len(self.new_urls) != 0

    # Get one url from the manager: remove it from the new set and record it in the old set
    def get_new_url(self):
        new_url = self.new_urls.pop()  # fetch and remove
        self.old_urls.add(new_url)
        return new_url
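A quick check of how the manager de-duplicates urls (the url below is just the root page used elsewhere in this example):

manager = UrlManager()
manager.add_new_url("http://baike.baidu.com/item/Python")
manager.add_new_url("http://baike.baidu.com/item/Python")  # duplicate, ignored
print(manager.has_new_url())   # True
url = manager.get_new_url()    # moved from new_urls to old_urls
manager.add_new_url(url)       # already in old_urls, so it is not re-added
print(manager.has_new_url())   # False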
Download tool: html_downloader.py
# coding: utf8
# import urllib            # Python 2
import urllib.request


# Downloader
class HtmlDownloader(object):
    def __init__(self):
        print('init html downloader')

    def download(self, url):
        if url is None:
            return None
        # This call can easily hang or time out. One fix is to set a global timeout
        # in the crawler's main entry point: socket.setdefaulttimeout(10)  (10 s global timeout)
        response = urllib.request.urlopen(url)
        # Another way to avoid hanging is a per-request timeout:
        # response = urllib.request.urlopen(url, timeout=3)
        if response.getcode() != 200:
            return None
        return response.read()
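As the comments note, the alternative to the global socket timeout is passing timeout= to urlopen and handling the failure where the call is made. A minimal sketch, assuming we want the downloader itself to swallow timeouts and return None (this variant class is illustrative, not part of the original project):

# coding: utf8
import socket
import urllib.error
import urllib.request


class HtmlDownloaderWithTimeout(object):
    # Variant of HtmlDownloader that uses a per-request timeout (sketch only).
    def download(self, url, timeout=3):
        if url is None:
            return None
        try:
            response = urllib.request.urlopen(url, timeout=timeout)
        except (socket.timeout, urllib.error.URLError) as e:
            print('download failed for %s: %s' % (url, e))
            return None
        if response.getcode() != 200:
            return None
        return response.read()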
Parser: html_parser.py
# coding: utf8
from bs4 import BeautifulSoup
import re
# import urlparse          # Python 2
import urllib.parse        # Python 3


# Parser
class HtmlParser(object):
    def __init__(self):
        print('init html parser')

    def _get_new_urls(self, page_url, soup):
        # e.g. /view/123.htm
        new_urls = set()
        # links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            # new_full_url = urlparse.urljoin(page_url, new_url)   # Python 2
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # Define a dictionary for the extracted data
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title">
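The parser listing is cut off right after res_data['url'] is filled in. Going by the <dd class="lemmaWgt-lemmaTitle-title"> hint in the last comment, a plausible completion might look like the sketch below; the title and summary selectors are assumptions about the Baidu Baike page layout, not taken from the original listing, and the methods belong inside the HtmlParser class above.

    # Sketch of the rest of _get_new_data (selectors are assumed):
    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary"> ... </div>
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    # The public method called by the scheduler
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data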
Output: html_outputer.py
# coding: utf8


# Outputer
class HtmlOutputer(object):
    # Constructor
    def __init__(self):
        self.datas = []  # list of collected page data
        print('init html outputer')

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # fout = open('output.html', 'w')   # write mode without an explicit encoding
        fout = open('output.html', 'w', encoding='utf-8')  # write mode, explicit file encoding
        fout.write("
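output_html is likewise cut off at its first fout.write call. A minimal sketch of how the rest could look, assuming each collected dict carries the url, title and summary keys produced by the parser (the simple table markup is an assumption, not the original output format):

    # Sketch of the rest of output_html (belongs inside HtmlOutputer, continuing from fout above):
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()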
Final effect:
Summary:
Learn more about Python crawlers and explore the unknown!