The code is as follows:
# Third-party crawler framework + HTML parser (Python 2 era: BeautifulSoup 3, urllib2).
from creepy import Crawler
from BeautifulSoup import BeautifulSoup
import urllib2
import json
class MyCrawler(Crawler):
    """Crawler that prints the title and price of each fetched JD.com product page.

    NOTE(review): reconstructed from a garbled paste; names and literals follow
    the visible fragments of the original code.
    """

    def process_document(self, doc):
        """Handle one fetched document.

        For documents fetched with HTTP status 200, print the status line,
        the product title scraped from the page, and the price fetched from
        JD's separate price endpoint.  Anything else is ignored.
        """
        if doc.status != 200:
            # Non-200 responses are deliberately skipped (original had `else: pass`).
            return
        print('[%d] %s' % (doc.status, doc.url))
        try:
            # JD pages are GB18030-encoded; transcode to UTF-8 before parsing.
            soup = BeautifulSoup(doc.text.decode('gb18030').encode('utf-8'))
        except Exception as e:
            # Best-effort fallback: parse the raw text if transcoding fails.
            print(e)
            soup = BeautifulSoup(doc.text)
        # Product title lives at <div id="product-intro"><div><h1>...</h1>.
        print(soup.find(id='product-intro').div.h1.text)
        # SKU id is the URL's file name: .../982040.html -> "982040".
        url_id = urllib2.unquote(doc.url).decode('utf8').split('/')[-1].split('.')[0]
        # Price is served by a separate JSON endpoint keyed as "J_<sku>".
        f = urllib2.urlopen('http://p.3.cn/prices/get?skuid=J_' + url_id, timeout=5)
        try:
            price = json.loads(f.read())
        finally:
            # Close the handle even if the body read / JSON decode raises.
            f.close()
        print(price[0]['p'])
# Entry point: crawl one JD product page and everything it links to on the
# same host.  The instance is named `crawler` (lowercase) so it does not
# shadow the `Crawler` class, whose F_SAME_HOST constant is read below.
crawler = MyCrawler()
crawler.set_follow_mode(Crawler.F_SAME_HOST)
crawler.set_concurrency_level(16)
# Skip static assets; only HTML pages are worth parsing.
crawler.add_url_filter(r'\.(jpg|jpeg|gif|png|js|css|swf)$')
crawler.crawl('http://item.jd.com/982040.html')