Over the past two days I taught myself how to write a web crawler in Python. This is a summary of what I learned:

Development goal: crawl 1000 pages of Baidu Encyclopedia (Baidu Baike), starting from the Python entry page.

Design ideas:

1. The architecture of a simple crawler:

2. The dynamic execution flow:

3. How each component is implemented:

URL manager: kept in Python memory (two sets)

Web downloader: the Python 3 urllib module

Web parser: the third-party library BeautifulSoup

4. Development approach:

Entry page:

URL format:

Entry Page url:/view/125370.htm

Data format:

Title: <dd class="lemmaWgt-lemmaTitle-title"><h1>***</h1></dd>

Summary: <div class="lemma-summary">***</div>

Page encoding: UTF-8

5. The development process:

① First create a PyDev project, add a new package (web_spider) and a main module (spider_main). The main module calls the URL manager (url_manager), the web downloader (html_downloader), the web parser (html_parser) and the web outputer (html_outputer), each of which carries out its own part of the work.

② Building the URL manager:

Create two sets, one for URLs that have not been crawled yet and one for URLs that have already been crawled, and define three methods to manage them: get_new_url(self), add_new_urls(self, urls) and add_new_url(self, url).

③ Building the web downloader:

How to download Web pages:

import urllib.request

response = urllib.request.urlopen(url)

if response.getcode() != 200:

    return None

return response.read().decode('utf-8')

④ Building the web parser:

Import the BeautifulSoup module,

Create a BeautifulSoup object, call the Find method to search for and access the node

⑤ Implementing the web page output:

Write the crawled data to an HTML page.

6. Problems encountered and how they were solved:

Preparatory work:

First, installing BeautifulSoup: 1. Download the BeautifulSoup source package.

2. After the download is complete, extract the archive; the steps below assume it was extracted under D:/python.

3. Open cmd and change to the D:/python/beautifulsoup4-4.1.3/ directory (adjust the path to match where you extracted the archive and the version you downloaded).

4. Run the commands: python setup.py build, then python setup.py install

5. In the IDE, run "from bs4 import BeautifulSoup"; if no error is reported, the installation succeeded.

Second, after installing the PyDev plug-in in Eclipse I still could not create a project. It turned out that no Python 3 interpreter had been configured. To fix this, choose Window > Preferences > PyDev > Interpreter - (Python/Jython) in the Eclipse menu bar and configure the Python/Jython interpreter there.

The example code is as follows:

( 1 )

# Coding:utf8

from baike_spider import html_downloader, html_outputer, html_parser, url_manager


class Spidermain(object):
    """Crawl scheduler that drives the URL manager, downloader, parser and outputer."""

    def __init__(self):
        """Create one instance of each collaborating component."""
        self.urls = url_manager.Urlmanager()
        self.downloader = html_downloader.Htmldownloader()
        self.parser = html_parser.Htmlparser()
        self.outputer = html_outputer.Htmloutputer()

    def craw(self, root_url, max_pages=1000):
        """Crawl outward from *root_url* and write the collected data as HTML.

        Args:
            root_url: URL of the first page to fetch.
            max_pages: Stop after this many pages have been crawled
                successfully. Defaults to 1000, the limit used by the
                original listing.

        A page that cannot be downloaded or parsed is reported on stdout and
        skipped; it does not count towards ``max_pages``.
        """
        crawled = 0
        self.urls.add_new_url(root_url)

        while crawled < max_pages and self.urls.has_new_url():
            new_url = self.urls.get_new_url()
            print("Craw%d:%s" % (crawled + 1, new_url))
            try:
                html_cont = self.downloader.download(new_url)
                parsed = self.parser.parse(new_url, html_cont)
            except Exception as exc:
                # Top-level crawl boundary: one bad page must not abort the
                # whole run, but the reason is reported instead of swallowed.
                print("Craw failed: %s" % exc)
                continue

            if parsed is None:
                # parse() returns None when it was given no page content.
                print("Craw failed: no content for %s" % new_url)
                continue

            new_urls, new_data = parsed
            self.urls.add_new_urls(new_urls)
            self.outputer.collect_data(new_data)
            crawled += 1

        self.outputer.output_html()


if __name__ == "__main__":
    # NOTE(review): the start URL was truncated to "Http://" in the published
    # listing. This value is rebuilt from the entry-page path quoted in the
    # write-up (/view/125370.htm) -- TODO confirm it is the intended page.
    root_url = "http://baike.baidu.com/view/125370.htm"
    obj_spider = Spidermain()
    obj_spider.craw(root_url)


# Coding:utf8

class Urlmanager(object):
    """Keep track of URLs waiting to be crawled and URLs already handed out.

    Both collections are in-memory sets, so every URL is queued at most once.
    """

    def __init__(self):
        self.new_urls = set()  # URLs that still have to be crawled
        self.old_urls = set()  # URLs already returned by get_new_url()

    def has_new_url(self):
        """Return True while at least one URL is waiting to be crawled."""
        return bool(self.new_urls)

    def get_new_url(self):
        """Remove and return one pending URL, marking it as crawled.

        Raises:
            KeyError: If no URL is pending (check has_new_url() first).
        """
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_urls(self, urls):
        """Queue every URL in *urls*; ``None`` or an empty iterable is ignored."""
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def add_new_url(self, url):
        """Queue *url* unless it is ``None``, already pending or already crawled."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)


# Coding:utf8

import urllib.request


class Htmldownloader(object):
    """Fetch the content of a web page with urllib."""

    def download(self, url):
        """Return the page at *url* decoded as UTF-8 text.

        Returns:
            The decoded page content, or ``None`` when *url* is ``None`` or
            the server answers with a status code other than 200.

        Exceptions raised by ``urllib.request.urlopen`` (for example on a
        network failure) are not caught here.
        """
        if url is None:
            return None

        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(url) as response:
            if response.getcode() != 200:
                return None
            return response.read().decode('utf-8')


# Coding:utf8

import re
import urllib.parse

from bs4 import BeautifulSoup

class Htmlparser(object):
    """Extract follow-up entry links and the title/summary from a Baike page."""

    # Entry pages are linked as /view/<number>.htm; compiled once for reuse.
    _ENTRY_LINK = re.compile(r"/view/\d+\.htm")

    @staticmethod
    def _text_of(node):
        """Return the text of *node*, or an empty string when it is missing."""
        return node.get_text() if node is not None else ""

    def _get_new_urls(self, page_url, soup):
        """Return the set of absolute entry-page URLs linked from *soup*.

        Relative links are resolved against *page_url*.
        """
        return {
            urllib.parse.urljoin(page_url, link['href'])
            for link in soup.find_all('a', href=self._ENTRY_LINK)
        }

    def _get_new_data(self, page_url, soup):
        """Return a dict with the page's ``url``, ``title`` and ``summary``.

        A page without the expected title or summary node yields an empty
        string for that field instead of raising, so its links can still be
        followed.
        """
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
        # NOTE(review): the class name was garbled in the published listing
        # and is restored from the Baike markup -- confirm against a live page.
        title_box = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        title_node = title_box.find("h1") if title_box is not None else None

        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")

        return {
            'url': page_url,
            'title': self._text_of(title_node),
            'summary': self._text_of(summary_node),
        }

    def parse(self, page_url, html_cont):
        """Parse one downloaded page.

        Args:
            page_url: URL the content was downloaded from.
            html_cont: Page content, as text or as raw UTF-8 bytes.

        Returns:
            ``None`` when either argument is ``None``; otherwise a tuple
            ``(new_urls, new_data)`` with the linked entry URLs and the
            extracted page data.
        """
        if page_url is None or html_cont is None:
            return None

        if isinstance(html_cont, bytes):
            soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf8')
        else:
            # from_encoding is ignored (with a warning) for decoded text.
            soup = BeautifulSoup(html_cont, 'html.parser')

        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data


# Coding:utf8

class Htmloutputer(object):
    """Collect crawled page records and write them out as an HTML table."""

    def __init__(self):
        self.datas = []  # one dict per crawled page, in collection order

    def collect_data(self, data):
        """Store one page record; ``None`` is ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self, path="output.html"):
        """Write all collected records to *path* as a UTF-8 HTML table.

        Each record must provide the keys ``url``, ``title`` and ``summary``.
        The values come from crawled pages, so they are HTML-escaped before
        being written. The document declares its charset so browsers decode
        non-ASCII text correctly.
        """
        from html import escape

        parts = [
            "<html>",
            '<head><meta charset="utf-8"></head>',
            "<body>",
            "<table>",
        ]
        for data in self.datas:
            parts.append("<tr>")
            for key in ('url', 'title', 'summary'):
                parts.append("<td>%s</td>" % escape(str(data[key])))
            parts.append("</tr>")
        parts.extend(["</table>", "</body>", "</html>"])

        # The context manager closes the file even if writing fails.
        with open(path, 'w', encoding='utf-8') as fout:
            fout.write("".join(parts))

Run results

