Over the past two days I taught myself Python by writing a crawler. Here is a summary.
Development goal: crawl 1,000 pages starting from the Baidu Encyclopedia "Python" entry page.
Design ideas:
1. Understand the simple crawler architecture: a scheduler coordinates a URL manager, a web page downloader, and a web page parser.
2. Understand the dynamic execution flow: the scheduler takes a new URL from the manager, downloads the page, parses out new URLs and data, feeds the URLs back into the manager, and hands the data to the outputer.
3. Implementation of each component:
URL manager: in-memory Python sets
Web page downloader: the Python 3 urllib module
Web page parser: the third-party library BeautifulSoup
4. Analysis of the target pages:
Entry page: http://baike.baidu.com/view/21087.htm
URL format:
Entry page URLs: /view/125370.htm
Data format:
Title: <dd class="lemmaWgt-lemmaTitle-title">
Introduction: <div class="lemma-summary">***</div>
Page encoding: UTF-8
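As a quick check of this URL format, the pattern used later in the parser can be tried against a sample link (a minimal sketch; the href value here is made up for illustration):

import re
import urllib.parse

pattern = re.compile(r"/view/\d+\.htm")
page_url = "http://baike.baidu.com/view/21087.htm"
href = "/view/125370.htm"  # a relative entry link as found in an <a> tag
if pattern.search(href):
    print(urllib.parse.urljoin(page_url, href))  # http://baike.baidu.com/view/125370.htm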
5. Development process:
① First create a PyDev project and a baike_spider package. Create the main module spider_main and, in its main function, call the URL manager (url_manager), the web page downloader (html_downloader), the web page parser (html_parser), and the web page outputer (html_outputer) to complete the corresponding functions.
② Building the URL manager:
Create two sets holding the not-yet-crawled and already-crawled URLs, and define the methods get_new_url(self), add_new_urls(self, urls), and add_new_url(self, url), plus a has_new_url(self) check, to complete the corresponding functions.
③ Building the web page downloader:
How to download a page (this fragment is the body of the download method):

import urllib.request

response = urllib.request.urlopen(url)
if response.getcode() != 200:
    return None
return response.read().decode('utf-8')
④ Building the web page parser:
Import the BeautifulSoup module, create a BeautifulSoup object, and call its find method to search for and access nodes (a short demo follows this list).
⑤ Implementation of the web page outputer:
Write the crawled data out to an HTML page.
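As a short demo of step ④, BeautifulSoup can be exercised on a made-up fragment that mimics the data format described above (illustrative only; the real pages contain far more markup):

from bs4 import BeautifulSoup

html = ('<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>'
        '<div class="lemma-summary">A programming language.</div>')
soup = BeautifulSoup(html, 'html.parser')
title = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
print(title.get_text())    # Python
summary = soup.find('div', class_="lemma-summary")
print(summary.get_text())  # A programming language.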
6. Problems encountered and their solutions:
Preparatory work:
First, installing BeautifulSoup:
1. Download BeautifulSoup from https://www.crummy.com/software/BeautifulSoup/
2. After the download completes, extract it; assume it is placed under D:/python.
3. Run cmd and switch to the D:/python/beautifulsoup4-4.1.3/ directory (adjust to your own extraction directory and downloaded version number).
4. Run the commands:
python setup.py build
python setup.py install
5. In the IDE, run from bs4 import BeautifulSoup; if no error is raised, the installation succeeded.
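Note: on a recent Python installation, the manual setup.py steps above can be replaced by a single command:

pip install beautifulsoup4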
Second, after installing the PyDev plug-in in Eclipse, I could not create a project. It turned out that no Python 3 interpreter had been configured. The fix: in the Eclipse menu bar, choose Window > Preferences > PyDev > Interpreter - (Python/Jython) and configure the Python/Jython interpreter there.
The example code is as follows:
(1) spider_main.py

# coding: utf8
from baike_spider import url_manager, html_downloader, html_outputer, html_parser

class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw %d: %s" % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:  # stop after 1000 pages
                    break
                count = count + 1
            except Exception:
                print("craw failed")
        self.outputer.output_html()

if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
(2) url_manager.py

# coding: utf8
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def add_new_url(self, url):
        if url is None:
            return
        # only queue URLs that have never been seen before
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)
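A quick check of the deduplication behavior (illustrative usage, not part of the project files):

manager = UrlManager()
manager.add_new_url("http://baike.baidu.com/view/21087.htm")
manager.add_new_url("http://baike.baidu.com/view/21087.htm")  # duplicate, ignored
print(manager.has_new_url())  # True
url = manager.get_new_url()   # pops the URL and records it in old_urls
manager.add_new_url(url)      # already crawled, so it is not re-queued
print(manager.has_new_url())  # False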
(3) html_downloader.py

# coding: utf8
import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read().decode('utf-8')
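Note: some servers reject requests carrying urllib's default User-Agent. If downloads fail with this downloader, sending a browser-like header may help; this is a sketch, and the header value is only an illustrative placeholder:

import urllib.request

def download_with_headers(url):
    # "Mozilla/5.0" is a placeholder User-Agent, not part of the original project
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    response = urllib.request.urlopen(req)
    if response.getcode() != 200:
        return None
    return response.read().decode('utf-8')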
(4) html_parser.py

# coding: utf8
from bs4 import BeautifulSoup
import re
import urllib.parse

class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # entry links have the form /view/<digits>.htm
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
        res_data = {}
        res_data['url'] = page_url
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None, None  # keep the caller's tuple unpacking safe
        # html_cont is already a decoded str, so no from_encoding is needed
        soup = BeautifulSoup(html_cont, 'html.parser')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
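To sanity-check the parser without hitting the network, it can be fed a made-up fragment mirroring the entry-page structure described above (illustrative only):

parser = HtmlParser()
html_cont = ('<html><dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>'
             '<div class="lemma-summary">A programming language.</div>'
             '<a href="/view/125370.htm">another entry</a></html>')
new_urls, new_data = parser.parse("http://baike.baidu.com/view/21087.htm", html_cont)
print(new_urls)           # {'http://baike.baidu.com/view/125370.htm'}
print(new_data['title'])  # Python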
(5) html_outputer.py

# coding: utf8
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open("output.html", 'w', encoding='utf8')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % str(data['url']))
            fout.write("<td>%s</td>" % str(data['title']))
            fout.write("<td>%s</td>" % str(data['summary']))
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
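As an aside, a with block is the more idiomatic way to handle the output file, since it closes the file automatically even if a write raises an error; a minimal sketch of the same output_html structure:

with open("output.html", 'w', encoding='utf8') as fout:
    fout.write("<html><body><table>")
    # write one <tr> per collected entry, exactly as above
    fout.write("</table></body></html>")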
Run results: the crawled titles and summaries are written to output.html (screenshot omitted).