# The code is as follows:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Jcrawler
# Author: jam <810441377@qq.com>
import time
import urllib.error
import urllib.request

from bs4 import BeautifulSoup

# Target site to crawl.
targethost = "http://adirectory.blog.com"

# User-Agent header sent with every request (some hosts reject the default).
useragent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
             '(KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36')

# Link-extraction rules.  Each list is a chain of steps applied in order by
# gethtmlfind(): 'findmode' selects find()/find_all(), 'findtag' is the tag
# name, 'rule' is the attribute-filter dict.
# Chain that extracts the category (directory) links from the front page.
# NOTE(review): the 'cat-nav'/'content'/'title' ids are assumed from the
# garbled source — confirm against the target site's markup.
categoryfind = [{'findmode': 'find', 'findtag': 'div', 'rule': {'id': 'cat-nav'}},
                {'findmode': 'findAll', 'findtag': 'a', 'rule': {}}]
# Chain that extracts article links from a category page.
articlelistfind = [{'findmode': 'find', 'findtag': 'div', 'rule': {'id': 'content'}},
                   {'findmode': 'findAll', 'findtag': 'h2', 'rule': {'class': 'title'}},
                   {'findmode': 'findAll', 'findtag': 'a', 'rule': {}}]

# Pagination URL template; '#page' is substituted with the page number.
pageurl = 'page/#page/'
pagestart = 1                        # first page number
pagestep = 1                         # increment between pages
pagestophtml = '404:Page not Found'  # marker meaning "no more pages"
def gethtmltext(url):
    """Fetch *url* and return the response body decoded to text.

    Propagates urllib.error.HTTPError to the caller, which treats
    404 as "past the last page" and 504 as "retry after a wait".
    """
    request = urllib.request.Request(url)
    request.add_header('Accept',
                       'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp')
    # Ask for an uncompressed body: this module has no decompression step,
    # so accepting '*' could hand BeautifulSoup gzip bytes.
    request.add_header('Accept-Encoding', 'identity')
    request.add_header('User-Agent', useragent)
    # Context manager closes the connection instead of leaking it.
    with urllib.request.urlopen(request) as response:
        # Decode so callers can substring-search with str markers.
        # assumes UTF-8 pages — TODO confirm; bad bytes are replaced.
        return response.read().decode('utf-8', errors='replace')
def arrtostr(vararr):
    """Concatenate the str() form of every element of *vararr*.

    Used to re-serialize BeautifulSoup results (Tags/result lists) so the
    next extraction step can re-parse them.  Returns "" for an empty input.
    """
    # str.join is linear; repeated '+=' concatenation would be quadratic.
    return "".join(str(s) for s in vararr)
def gethtmlfind(htmltext, findrule):
    """Apply the chained *findrule* steps to *htmltext*.

    Each step after the first re-parses the serialized result of the
    previous step, so a find_all() result list can be searched again.
    Returns the last step's result: a Tag (find) or list of Tags (findAll).
    """
    findreturn = BeautifulSoup(htmltext, 'html.parser')
    returntext = ""
    for f in findrule:
        if returntext != "":
            # Re-parse the previous step's serialized output.
            findreturn = BeautifulSoup(returntext, 'html.parser')
        if f['findmode'] == 'find':
            findreturn = findreturn.find(f['findtag'], f['rule'])
        if f['findmode'] == 'findAll':
            # find_all: 'findAll' is the deprecated bs4 alias.
            findreturn = findreturn.find_all(f['findtag'], f['rule'])
        returntext = arrtostr(findreturn)
    return findreturn
def getcategory():
    """Fetch the site front page and collect the category links.

    Returns a list of {'name': link text, 'url': href} dicts, one per
    anchor matched by the categoryfind rule chain.
    """
    categorys = []
    htmltext = gethtmltext(targethost)
    findreturn = gethtmlfind(htmltext, categoryfind)
    for tag in findreturn:
        print("[G]->Category: " + tag.string + " | URL: " + tag['href'])
        categorys.append({'name': tag.string, 'url': tag['href']})
    return categorys
def getarticlelist(categoryurl):
    """Walk a category's pages and collect on-site article links.

    Pages are fetched until one returns 404 (or contains the stop marker);
    504 responses are retried after a 5-second wait.  Returns a list of
    {'name': title text, 'url': href} dicts.
    """
    articles = []
    page = pagestart
    while True:
        htmltext = ""
        # Substitute into a LOCAL copy of the template.  Overwriting the
        # module-level 'pageurl' (as the original did) destroys the '#page'
        # placeholder after the first page and breaks pagination.
        currenturl = pageurl.replace("#page", str(page))
        print("[G]->PageUrl: " + categoryurl + currenturl)
        while True:
            try:
                htmltext = gethtmltext(categoryurl + currenturl)
                break
            except urllib.error.HTTPError as e:
                print("[E]->HTTP Error: " + str(e.code))
                if e.code == 404:
                    # Past the last page: plant the stop marker and quit.
                    htmltext = pagestophtml
                    break
                if e.code == 504:
                    print("[E]->HTTP Error 504: Gateway Time-out, wait")
                    time.sleep(5)
                else:
                    # Any other HTTP error: give up on this page.
                    break
        if htmltext.find(pagestophtml) >= 0:
            print("End page.")
            break
        else:
            findreturn = gethtmlfind(htmltext, articlelistfind)
            for tag in findreturn:
                # Keep only titled links that point back into the target site.
                if tag.string is not None and tag['href'].find(targethost) >= 0:
                    print("[G]->Article: " + tag.string + " | URL: " + tag['href'])
                    articles.append({'name': tag.string, 'url': tag['href']})
        # Advance by the configured step (the original hard-coded 1 and
        # never used 'pagestep'; its default of 1 keeps behavior the same).
        page += pagestep
    return articles
# Entry point: crawl all categories, then each category's article list.
# Guarded so importing this module does not start a crawl.
if __name__ == "__main__":
    print("[G]->GetCategory")
    mycategorys = getcategory()
    print("[G]->GetCategory->Success.")
    time.sleep(3)  # be polite between the front-page and category fetches
    for category in mycategorys:
        print("[G]->GetArticleList: " + category['name'])
        getarticlelist(category['url'])