#!/usr/bin/python
#-*-coding:utf-8-*-
# Jcrawler
# Author:jam <810441377@qq.com>
import time
import urllib2

from bs4 import BeautifulSoup
# Target site. Kept lower-case because article links are filtered with a
# case-sensitive substring test against this value.
Targethost = "http://adirectory.blog.com"

# User-Agent header sent with every request.
useragent = (
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36'
)

# Link collection rules.
#
# Each rule list is applied step by step by gethtmlfind(). Every step is a
# dict with exactly three keys:
#   'findmode' - 'find' (first match) or 'findAll' (every match)
#   'findtag'  - tag name to search for (lower-case)
#   'rule'     - attribute filter passed to BeautifulSoup ({} = no filter)
#
# NOTE(review): the id values below were lower-cased on the assumption that
# their leading capitals were introduced together with the other casing
# damage in this file; HTML ids are case-sensitive, so confirm against the
# site's markup.

# Category link collection rules.
Categoryfind = [
    {'findmode': 'find', 'findtag': 'div', 'rule': {'id': 'cat-nav'}},
    {'findmode': 'findAll', 'findtag': 'a', 'rule': {}},
]

# Article link collection rules.
Articlelistfind = [
    {'findmode': 'find', 'findtag': 'div', 'rule': {'id': 'content'}},
    {'findmode': 'findAll', 'findtag': 'h2', 'rule': {'class': 'title'}},
    {'findmode': 'findAll', 'findtag': 'a', 'rule': {}},
]

# Paging URL rule: '#page' is replaced by the page number and the result is
# appended to the category URL.
Pageurl = 'page/#page/'
Pagestart = 1  # number of the first page
Pagestep = 1   # increment between consecutive pages

# Marker whose presence in a page body means "no more pages".
# NOTE(review): only the surrounding padding was removed; verify the exact
# wording/case against the site's real 404 page. HTTP 404 responses are
# mapped onto this marker by the crawler, so they stop paging regardless.
pagestophtml = '404:page not Found'
def gethtmltext(URL):
    """Download URL and return the raw response body.

    The request carries browser-like ``Accept``, ``Accept-Encoding`` and
    ``User-Agent`` headers (the latter taken from the module-level
    ``useragent``).

    Args:
        URL: Absolute URL to fetch.

    Returns:
        The response body exactly as read from the connection.

    Raises:
        urllib2.HTTPError: for HTTP error statuses (callers inspect ``code``).
        urllib2.URLError: when the server cannot be reached.
    """
    request = urllib2.Request(URL)
    request.add_header(
        'Accept',
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp",
    )
    # NOTE(review): "*" allows the server to answer with a compressed body,
    # and nothing here decompresses it - confirm the target never does so.
    request.add_header('Accept-Encoding', "*")
    request.add_header('User-Agent', useragent)

    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the socket even if read() fails part-way through.
        response.close()
def arrtostr(Vararr):
    """Concatenate the ``str()`` form of every item in ``Vararr``.

    Args:
        Vararr: Any iterable, or ``None``.

    Returns:
        One string made of the items' ``str()`` values in iteration order;
        ``""`` when ``Vararr`` is ``None`` or empty.
    """
    if Vararr is None:
        # A search that matched nothing hands us None; treat it as "no text".
        return ""
    return "".join(str(item) for item in Vararr)
def gethtmlfind(HTMLText, findrule):
    """Apply a chain of search steps to an HTML document.

    Each step in ``findrule`` is a dict with the keys ``'findmode'``
    (``'find'`` or ``'findAll'``), ``'findtag'`` (tag name) and ``'rule'``
    (attribute filter). Before every step after the first, the previous
    match is serialised with ``arrtostr`` and parsed again, so the step
    searches only inside what the previous step produced.

    Args:
        HTMLText: Markup of the document to search.
        findrule: Sequence of step dicts as described above.

    Returns:
        The result of the last step: a tag for ``'find'``, a list of tags
        for ``'findAll'``. An empty list is returned as soon as any step
        matches nothing.
    """
    found = BeautifulSoup(HTMLText)
    for index, step in enumerate(findrule):
        if index > 0:
            # Narrow the search scope to the markup matched so far.
            markup = arrtostr(found)
            if not markup:
                return []
            found = BeautifulSoup(markup)

        mode = step['findmode']
        if mode == 'find':
            found = found.find(step['findtag'], step['rule'])
        elif mode == 'findAll':
            found = found.find_all(step['findtag'], step['rule'])
        # Any other mode leaves the current document untouched.

        if not found:
            # find() returned None or find_all() returned no tags: nothing
            # further can match, so stop instead of failing on the next step.
            return []
    return found
def getcategory():
    """Collect the category links from the target site's front page.

    Fetches ``Targethost``, applies the ``Categoryfind`` rules and logs each
    category found.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, one per matched
        anchor that has both a plain-text label and an ``href``.
    """
    categorys = []
    htmltext = gethtmltext(Targethost)
    for tag in gethtmlfind(htmltext, Categoryfind):
        name = tag.string
        href = tag.get('href')
        # Anchors wrapping nested markup have no .string, and some anchors
        # carry no href; neither can be turned into a usable entry.
        if name is None or href is None:
            continue
        print("[G]->category:" + name + " | URL: " + href)
        categorys.append({'name': name, 'url': href})
    return categorys
def _fetchpagehtml(url):
    """Fetch one listing page, waiting and retrying on HTTP 504.

    Returns:
        The page body; ``pagestophtml`` when the server answers 404; or
        ``None`` for any other HTTP error.
    """
    while True:
        try:
            return gethtmltext(url)
        except urllib2.HTTPError as e:
            print("[E]->http Error:" + str(e.code))
            if e.code == 404:
                # A missing page means we walked past the last one.
                return pagestophtml
            if e.code != 504:
                return None
            # NOTE(review): 504 is retried without an upper bound.
            print("[E]->http Error 504:gateway time-out, wait")
            time.sleep(5)


def getarticlelist(Categoryurl):
    """Collect the article links of one category, page by page.

    Pages are requested at ``Categoryurl`` + ``Pageurl`` (with ``#page``
    replaced by the page number), starting at ``Pagestart`` and advancing by
    ``Pagestep``. Paging stops when a page contains ``pagestophtml``, when
    the server answers 404, or when a page fails with any HTTP error other
    than 504 (which is retried).

    Args:
        Categoryurl: Category base URL; the page path is appended verbatim.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts for every anchor
        matched by ``Articlelistfind`` that has a plain-text label and an
        ``href`` containing ``Targethost``.
    """
    articles = []
    page = Pagestart
    while True:
        # Build the URL from the untouched template on every iteration.
        pageurl = Pageurl.replace("#page", str(page))
        print("[G]->pageurl:" + Categoryurl + pageurl)

        htmltext = _fetchpagehtml(Categoryurl + pageurl)
        if htmltext is None:
            # Unrecoverable HTTP error (already logged): stop here rather
            # than keep requesting further pages indefinitely.
            break
        if pagestophtml in htmltext:
            print("End Page.")
            break

        for tag in gethtmlfind(htmltext, Articlelistfind):
            name = tag.string
            href = tag.get('href')
            # Keep only labelled links that point at the target site.
            if name is None or href is None or Targethost not in href:
                continue
            print("[G]->article:" + name + " | URL: " + href)
            articles.append({'name': name, 'url': href})

        page += Pagestep
    return articles
# Script entry: list the categories, then crawl each category's articles.
print("[G]->getcategory")
Mycategorys = getcategory()
print("[G]->getcategory->success.")
time.sleep(3)  # short pause before the per-category requests start
for category in Mycategorys:
    print("[G]->getarticlelist:" + category['name'])
    getarticlelist(category['url'])