Note: put # coding=gbk at the top of every file so the source can contain Chinese.
Create the project: File -> New -> Project -> PyDev -> PyDev Project
Project name: arbitrary. Package name: wangyepachong
Create five module files:
All third-party modules used in the project need to be registered in Eclipse: Window -> Preferences -> PyDev -> Interpreters -> Python Interpreter, then under Forced Builtins add the modules that will be imported,
for example bs4, urllib2, re, etc.
html_downloader.py
html_outputer.py
html_parser.py
spider_main.py
url_manager.py
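On disk the package then looks roughly like this (a sketch; PyDev creates the __init__.py automatically when the package is created, and it is what lets the "from wangyepachong import ..." lines below work):
wangyepachong/
    __init__.py
    spider_main.py
    url_manager.py
    html_downloader.py
    html_parser.py
    html_outputer.py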
First file: spider_main.py
# coding=gbk
'''
Simple crawler architecture:
    URL manager
    Web downloader: urllib2
    Web parser: BeautifulSoup
This is only the simplest possible crawler!
-- later it will also need: login, verification codes, AJAX, anti-crawler servers, multi-threading, distribution
'''
from wangyepachong import url_manager
from wangyepachong import html_downloader
from wangyepachong import html_parser
from wangyepachong import html_outputer

class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:
                    break
                count = count + 1
            except:
                print 'craw failed'
        self.outputer.output_html()

if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
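The if __name__ == "__main__" block is the entry point; in Eclipse the module can be run directly with Run As -> Python Run. From a shell, one option (a sketch, assuming the directory that contains the wangyepachong package is the current directory) is:
python -m wangyepachong.spider_main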
Second file: html_downloader.py
# coding=gbk
import urllib2

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
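urllib2 exists only on Python 2. If the project is ever moved to Python 3, a rough equivalent of the downloader (a sketch, not part of the original tutorial) would use urllib.request instead:
# Python 3 sketch of the same downloader (assumption: Python 3, urllib.request)
import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        # read() returns bytes on Python 3; decode as utf-8, matching the
        # from_encoding the parser assumes
        return response.read().decode('utf-8')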
Third file: html_parser.py
# coding=gbk
from bs4 import BeautifulSoup
import re
import urlparse

class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # links look like /view/123.htm
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # create a dict to hold the extracted data
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
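To see what _get_new_urls is doing, it can help to run the same bs4/re/urlparse calls on a tiny fragment (a standalone sketch; the fragment and the second URL are invented for illustration):
# coding=gbk
# sketch: extract /view/NNN.htm links from an invented fragment
from bs4 import BeautifulSoup
import re
import urlparse

html = '<a href="/view/123.htm">match</a> <a href="/item/xyz">no match</a>'
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a', href=re.compile(r"/view/\d+\.htm")):
    # only the first link matches the regex; urljoin makes it absolute
    print urlparse.urljoin("http://baike.baidu.com/view/21087.htm", link['href'])
    # prints: http://baike.baidu.com/view/123.htm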
Fourth file: url_manager.py
# coding=gbk
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
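A quick check of the de-duplication behaviour (a sketch; the URLs are arbitrary):
# sketch: UrlManager hands out each URL at most once
manager = UrlManager()
manager.add_new_urls(["http://baike.baidu.com/view/1.htm",
                      "http://baike.baidu.com/view/1.htm",  # duplicate, ignored
                      "http://baike.baidu.com/view/2.htm"])
while manager.has_new_url():
    print manager.get_new_url()   # each URL printed exactly once
print manager.has_new_url()       # False; crawled URLs stay in old_urls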
Fifth file: html_outputer.py
# coding=gbk
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
Issue 1: after finishing the code, this statement reported an error, but later it ran without problems:
title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
Issue 2: the current code no longer reports errors, but it still cannot run, probably because the logic has not been fully worked out; the details will be revised later so that the project can run.
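A likely cause of issue 1 is that soup.find() returns None when a page lacks the expected node, so chaining .find("h1") onto it raises AttributeError. A defensive variant of _get_new_data (a sketch, not the original code) guards against that:
def _get_new_data(self, page_url, soup):
    res_data = {'url': page_url, 'title': '', 'summary': ''}
    # guard: some pages may not carry the expected nodes
    title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
    if title_node is not None and title_node.find("h1") is not None:
        res_data['title'] = title_node.find("h1").get_text()
    summary_node = soup.find('div', class_="lemma-summary")
    if summary_node is not None:
        res_data['summary'] = summary_node.get_text()
    return res_data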
This article is from the "Wild Goat" blog, please be sure to keep this source http://yeshanyang.blog.51cto.com/8845896/1772308
003 Writing the first project in Eclipse: Web crawler