003 Writing the first project in Eclipse: Web crawler


Description: Chinese text is supported by declaring # coding=gbk at the top of each module.

Create the project: File -> New -> Project -> PyDev -> PyDev Project.

New project name: arbitrary. New package name: wangyepachong.

Create five module files (an empty __init__.py is also needed; see the layout sketch after the list):

Every module used in the project needs to be registered in Eclipse: Window -> Preferences -> PyDev -> Interpreters -> Python Interpreter, then add the modules to be imported under Forced Builtins.

For example: bs4, urllib2, re, etc.
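A quick way to confirm that the interpreter itself can see these modules (as opposed to Eclipse merely autocompleting them) is a throwaway script like this sketch; if any import fails, install the module for that interpreter first:

# coding=gbk
# Sanity check: run with the same interpreter configured in PyDev.
import re
import urllib2
import bs4

print 'bs4 version:', bs4.__version__
print 'all crawler dependencies import cleanly'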


html_downloader.py

html_outputer.py

html_parser.py

spider_main.py

url_manager.py
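One detail worth adding: since the modules below import each other with "from wangyepachong import ...", the package directory also needs an empty __init__.py (Python 2 only treats a directory as a package when that file exists). The intended layout is therefore:

wangyepachong/
    __init__.py          # empty file; marks the directory as a package
    html_downloader.py
    html_outputer.py
    html_parser.py
    spider_main.py
    url_manager.py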



First file: spider_main.py


# coding=gbk

'''
Simple crawler architecture:

    URL manager
    HTML downloader: urllib2
    HTML parser: BeautifulSoup

This is only the simplest possible crawler!
-- Still to come: login, CAPTCHAs, AJAX, server-side anti-crawling,
multi-threading, distributed crawling.
'''

from wangyepachong import url_manager
from wangyepachong import html_downloader
from wangyepachong import html_parser
from wangyepachong import html_outputer


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d: %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:   # stop after 1000 pages
                    break
                count = count + 1
            except:
                print 'craw failed'
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
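Because spider_main.py refers to its siblings through the package name, it must be run with the project root (the directory containing wangyepachong/) on the module path. Inside Eclipse, Run As -> Python Run from the project normally takes care of this; from a terminal, one option is:

python -m wangyepachong.spider_main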

Second file: html_downloader.py

# coding=gbk

import urllib2


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
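Two practical weaknesses of this downloader are worth noting: urlopen with no timeout can hang forever on a dead server, and some sites reject Python's default user agent. The sketch below (a hedged variant, not part of the original tutorial) addresses both:

# coding=gbk
import urllib2


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # some servers block Python's default User-Agent string
        request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            # a timeout keeps one dead server from stalling the whole crawl
            response = urllib2.urlopen(request, timeout=10)
        except urllib2.URLError:
            return None
        if response.getcode() != 200:
            return None
        return response.read()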

Third file: html_parser.py

# coding=gbk

import re
import urlparse

from bs4 import BeautifulSoup


class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # entry links look like /view/123.htm
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # create a dict to store the extracted data
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary">...</div>
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
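The parser can be exercised in isolation before wiring it into the crawler. The HTML below is a made-up miniature of a Baidu Baike page, just enough to hit the same selectors that _get_new_urls and _get_new_data use:

# coding=gbk
import re
import urlparse

from bs4 import BeautifulSoup

html = '''
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">An interpreted language.</div>
<a href="/view/123.htm">related entry</a>
'''
soup = BeautifulSoup(html, 'html.parser')

link = soup.find('a', href=re.compile(r"/view/\d+\.htm"))
print urlparse.urljoin('http://baike.baidu.com/view/21087.htm', link['href'])
# -> http://baike.baidu.com/view/123.htm
print soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1').get_text()
# -> Python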

Fourth file: url_manager.py

# coding=gbk


class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
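The two-set design is what guarantees each URL is crawled at most once: add_new_url drops anything already present in either set, and get_new_url moves a URL from new to old. A small demonstration, assuming the package is set up as above:

# coding=gbk
from wangyepachong import url_manager

manager = url_manager.UrlManager()
manager.add_new_url('http://baike.baidu.com/view/21087.htm')
manager.add_new_url('http://baike.baidu.com/view/21087.htm')  # duplicate, ignored

url = manager.get_new_url()   # moves the URL into old_urls
manager.add_new_url(url)      # already crawled, ignored
print manager.has_new_url()   # -> False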

Fifth file: html_outputer.py

# coding=gbk


class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
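The outputer can likewise be checked without running a crawl; the record below is invented sample data in the same dict shape the parser produces:

# coding=gbk
from wangyepachong import html_outputer

outputer = html_outputer.HtmlOutputer()
outputer.collect_data({'url': 'http://example.com',
                       'title': u'sample title',
                       'summary': u'sample summary'})
outputer.output_html()   # writes output.html to the working directory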


Issue 1: right after it was written, this line was flagged as a syntax error, but later it ran without any problem:

title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")

Issue 2: the current code has no syntax errors but still will not run; most likely the logic has not been analyzed thoroughly enough. The details will be revised later until the project runs.
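For Issue 2, the bare except in SpiderMain.craw is the first thing to suspect: it swallows every exception, so the loop only prints 'craw failed' and gives no hint of the real cause. One debugging tweak is to catch Exception and print the traceback instead; the toy example below shows the pattern:

# coding=gbk
import traceback

def craw_one(url):
    # stand-in for a step that fails somewhere inside the crawl
    raise ValueError('pretend the parser failed on %s' % url)

try:
    craw_one('http://baike.baidu.com/view/21087.htm')
except Exception:
    # unlike a silent bare except, this shows what actually went wrong
    print 'craw failed'
    traceback.print_exc()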


This article is from the "Wild Goat" blog; please keep this source when sharing: http://yeshanyang.blog.51cto.com/8845896/1772308
