Crawler example: getting started
Objective: crawl 100 pages from Baidu Encyclopedia (Baike), starting from the Python entry
Tool environment: Python 3.5, Sublime Text 3
Crawler Scheduler: spider_main.py
# coding: utf8
# from baike_spider import url_manager, html_downloader, html_parser, html_outputer
import url_manager, html_downloader, html_parser, html_outputer
import socket


class SpiderMain(object):
    # Constructor
    def __init__(self):
        self.urls = url_manager.UrlManager()                 # URL manager
        self.downloader = html_downloader.HtmlDownloader()   # downloader
        self.parser = html_parser.HtmlParser()               # parser
        self.outputer = html_outputer.HtmlOutputer()         # outputer
        self.log_datas = []                                  # log list

    # Crawler scheduler
    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))
                # The download can hang or time out here and leave the program stuck.
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                # Log this page
                self.collect_log('craw %d: %s' % (count, new_url))
                if count == 100:
                    break
                count = count + 1
            except Exception:
                print('craw failed!')
        self.outputer.output_html()
        # Write the log
        self.write_log()

    # Collect a log entry
    def collect_log(self, data):
        if data is None:
            return
        self.log_datas.append(data)

    # Write the log to a file
    def write_log(self):
        # Output file in write mode with an explicit encoding
        fout = open('output.txt', 'w', encoding='utf-8')
        # fout.write(str(self.log_datas))
        for data in self.log_datas:
            # fout.write(str(data))
            fout.write(data + '\r\n')
        fout.close()


if __name__ == "__main__":
    socket.setdefaulttimeout(5)  # set the global timeout to 5 s
    root_url = "http://baike.baidu.com/item/Python"
    # root_url = r"http://baike.baidu.com/item/%E9%BB%91%E6%B2%B3%E5%AD%A6%E9%99%A2"
    obj_spider = SpiderMain()
    print('spider starting....')
    obj_spider.craw(root_url)
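Because a single slow page can stall the whole while loop, the scheduler sets socket.setdefaulttimeout(5) before starting. A minimal alternative sketch, assuming the same HtmlDownloader interface, is to retry the download a couple of times and skip the page when it keeps timing out (download_with_retry and its retries parameter are illustrative, not part of the original code):

import socket
import urllib.error


def download_with_retry(downloader, url, retries=2):
    # Try the download a few times before giving up on this url.
    # 'downloader' is assumed to expose the same download(url) method as HtmlDownloader.
    for attempt in range(retries + 1):
        try:
            return downloader.download(url)
        except (socket.timeout, urllib.error.URLError) as e:
            print('download attempt %d for %s failed: %s' % (attempt + 1, url, e))
    return None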
URL manager: url_manager.py
# coding: utf8


# URL manager
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()
        print('init url manager')

    # Add a single new url to the manager
    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    # Add multiple new urls to the manager
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)  # reuse add_new_url above

    # Check whether the manager still holds an uncrawled url
    def has_new_url(self):
        return len(self.new_urls) != 0

    # Get one url from the manager: remove it from the new set and record it in the old set
    def get_new_url(self):
        new_url = self.new_urls.pop()  # fetch and remove
        self.old_urls.add(new_url)
        return new_url
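A quick check of how the manager de-duplicates urls (the url below is just the root page used elsewhere in this example):

manager = UrlManager()
manager.add_new_url("http://baike.baidu.com/item/Python")
manager.add_new_url("http://baike.baidu.com/item/Python")  # duplicate, ignored
print(manager.has_new_url())   # True
url = manager.get_new_url()    # moved from new_urls to old_urls
manager.add_new_url(url)       # already in old_urls, so it is not re-added
print(manager.has_new_url())   # False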
Download tool: html_downloader.py
# coding: utf8
# import urllib            # Python 2
import urllib.request


# Downloader
class HtmlDownloader(object):
    def __init__(self):
        print('init html downloader')

    def download(self, url):
        if url is None:
            return None
        # This call can easily hang or time out. One fix is to set a global timeout
        # in the crawler's main entry point: socket.setdefaulttimeout(10)  (10 s global timeout)
        response = urllib.request.urlopen(url)
        # Another way to avoid hanging is a per-request timeout:
        # response = urllib.request.urlopen(url, timeout=3)
        if response.getcode() != 200:
            return None
        return response.read()
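As the comments note, the alternative to the global socket timeout is passing timeout= to urlopen and handling the failure where the call is made. A minimal sketch, assuming we want the downloader itself to swallow timeouts and return None (this variant class is illustrative, not part of the original project):

# coding: utf8
import socket
import urllib.error
import urllib.request


class HtmlDownloaderWithTimeout(object):
    # Variant of HtmlDownloader that uses a per-request timeout (sketch only).
    def download(self, url, timeout=3):
        if url is None:
            return None
        try:
            response = urllib.request.urlopen(url, timeout=timeout)
        except (socket.timeout, urllib.error.URLError) as e:
            print('download failed for %s: %s' % (url, e))
            return None
        if response.getcode() != 200:
            return None
        return response.read()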
Parser: html_parser.py
# coding: utf8
from bs4 import BeautifulSoup
import re
# import urlparse          # Python 2
import urllib.parse        # Python 3


# Parser
class HtmlParser(object):
    def __init__(self):
        print('init html parser')

    def _get_new_urls(self, page_url, soup):
        # e.g. /view/123.htm
        new_urls = set()
        # links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            # new_full_url = urlparse.urljoin(page_url, new_url)   # Python 2
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # Define a dictionary for the extracted data
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title">
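The parser listing is cut off right after res_data['url'] is filled in. Going by the <dd class="lemmaWgt-lemmaTitle-title"> hint in the last comment, a plausible completion might look like the sketch below; the title and summary selectors are assumptions about the Baidu Baike page layout, not taken from the original listing, and the methods belong inside the HtmlParser class above.

    # Sketch of the rest of _get_new_data (selectors are assumed):
    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary"> ... </div>
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    # The public method called by the scheduler
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data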
Output: html_outputer.py
# coding: utf8


# Outputer
class HtmlOutputer(object):
    # Constructor
    def __init__(self):
        self.datas = []  # list of collected page data
        print('init html outputer')

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # fout = open('output.html', 'w')   # write mode without an explicit encoding
        fout = open('output.html', 'w', encoding='utf-8')  # write mode, explicit file encoding
        fout.write("
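output_html is likewise cut off at its first fout.write call. A minimal sketch of how the rest could look, assuming each collected dict carries the url, title and summary keys produced by the parser (the simple table markup is an assumption, not the original output format):

    # Sketch of the rest of output_html (belongs inside HtmlOutputer, continuing from fout above):
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()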
Final effect:
Summary:
Learn more about Python crawlers and explore the unknown!