Crawl http://bj.58.com/pbdn/0/pn2/. Besides the regular listings, the index page also contains promoted and reposted items; their detail pages are not in the standard format, so they need a separate storage method that will be added later. This week I learned web crawling, but some page content is rendered by JavaScript that the requests library cannot execute (for example the view count), so Selenium with PhantomJS is used to render the pages and extract the information.
The code follows, with detailed explanations in the comments:
import re

from bs4 import BeautifulSoup
from selenium import webdriver
class GetPageInfo(object):
    """Base class defining the page-crawling interface.

    This class only defines the three-step scraping workflow as a
    specification; concrete crawlers subclass it and override the
    methods for a specific site.
    """

    def index_page(self):
        """Get the URL of every index page.

        Only one index page at a time is crawled for now, so this
        method is temporarily unused.
        """
        pass

    def detail_page(self):
        """From each index page, get the URL of every record on that page."""
        pass

    def domain_page(self):
        """Get the details of each record from each detail page."""
        pass
class Tongchengfirsthomework(GetPageInfo):
    """Crawl 58.com second-hand listings with a Selenium-driven browser."""

    def __init__(self, browser=None):
        # Keep a reference to an already-started Selenium browser.
        self.browser = browser

    def detail_page(self, whoshell=0, page=1):
        """Return the URL of every record on one index page.

        whoshell: 0 for personal sellers, 1 for business sellers.
        page: index page number to crawl.
        """
        # e.g. http://sz.58.com/pbdn/0/pn1/
        url = 'http://sz.58.com/pbdn/{}/pn{}/'.format(str(whoshell), str(page))
        self.browser.get(url)           # open the page (browser renders the JS)
        html = self.browser.page_source  # full rendered page content
        soup = BeautifulSoup(html, 'lxml')  # parse with BeautifulSoup as usual
        detail_urls = soup.select('#infolist a.t')  # every record on this page
        detail_url_list = []
        for row in detail_urls:
            detail_url = row.get('href')
            # Skip zhuanzhuan redirects and jump links; keep only the URL of
            # each real record. (Lowercase comparison — the hrefs are lowercase.)
            if ('zhuanzhuan' not in str(detail_url)) and ('jump' not in detail_url):
                detail_url_list.append(detail_url)
        print(detail_url_list)
        return detail_url_list

    def domain_page(self, detail_url):
        """Return a dict with the details of a single record page."""
        self.browser.get(detail_url)
        html = self.browser.page_source
        soup = BeautifulSoup(html, 'lxml')
        desc_product = soup.select(
            'div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span')
        detail = {
            'provice': soup.select('.crb_i > a')[0].get_text(),
            'title': soup.select(
                '#content > div.person_add_top.no_ident_top > div.per_ad_left'
                ' > div.col_sub.mainTitle > h1')[0].get_text(),
            'date': soup.select('.time')[0].get_text(),
            'views': soup.select('#totalcount')[0].get_text(),
            'price': soup.select('span.price.c_f50')[0].get_text(),
            # Conditional expressions can be used directly inside the dict
            # literal. Reuse desc_product instead of re-running the selector,
            # and guard against an empty result before indexing [0].
            'condition': (list(desc_product[0].stripped_strings)
                          if desc_product and '-' not in str(desc_product)
                          else None),
            'area': (list(soup.select('.c_25d')[0].stripped_strings)
                     if soup.find_all('span', 'c_25d') else None),
            'seller': soup.select('#divContacter > ul > ul > li > a')[0].get_text(),
        }
        print(detail)
        return detail  # all detail information for this record
browser = None  # so the finally clause is safe if browser creation fails
try:
    # DesiredCapabilities is a dict of browser settings.
    # NOTE(review): PhantomJS support is deprecated in recent Selenium
    # releases; consider headless Chrome/Firefox instead.
    cap = webdriver.DesiredCapabilities.PHANTOMJS
    cap['phantomjs.page.settings.loadImages'] = False  # do not load images
    cap['phantomjs.page.settings.userAgent'] = (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')  # set the UA
    cap['phantomjs.page.settings.diskCache'] = True  # enable the browser cache
    # Initialize the browser with the desired capabilities.
    browser = webdriver.PhantomJS(desired_capabilities=cap)
    tongcheng = Tongchengfirsthomework(browser)  # instantiate the crawler
    for detail_page in tongcheng.detail_page(page=2):
        tongcheng.domain_page(detail_page)
        print(detail_page)
finally:
    if browser is not None:
        browser.close()  # remember to close the browser
Python 3.4 + Selenium: crawling 58.com (Part 1)