Extracting page data with BeautifulSoup, regular expressions, and lxml
# -*- coding: utf-8 -*-
"""Fetch a Zhaopin job-search results page and print job links/titles.

Three extraction approaches are shown:
  1. lxml XPath        -- the active code path below.
  2. BeautifulSoup     -- kept as a disabled reference section.
  3. Regular expression -- kept as a disabled reference section.
"""
import re
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
from lxml import etree

# Browser-like protocol header so the server does not reject us as a bot.
HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                   'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'),
}

# Search: location = Beijing (URL-encoded), keyword = python, page 1.
URL = ('http://sou.zhaopin.com/jobs/searchresult.ashx'
       '?jl=%E5%8C%97%E4%BA%AC&kw=python&sm=0&p=1')

REQ_TIMEOUT = 5  # seconds

req = Request(url=URL, headers=HEADERS)
# Context manager guarantees the connection is closed even on error.
with urlopen(req, None, REQ_TIMEOUT) as resp:
    ss = resp.read().decode('utf-8')

# --- lxml extraction (active) ---------------------------------------------
# NOTE(review): class restored as lowercase "zwmc" (job-title cell in the
# Zhaopin results table); the scraped copy showed "ZWMC" -- confirm against
# the live markup.
selector = etree.HTML(ss)
links = selector.xpath(
    '//tr/td[@class="zwmc"]/div/a/@href'
    ' | //tr/td[@class="zwmc"]/div/a/text()'
)
for link in links:
    print(link)

# --- BeautifulSoup extraction (reference, disabled) ------------------------
"""
soup = BeautifulSoup(ss, 'html.parser')
for row in soup.find_all('tr'):
    for anchor in row.find_all('a'):
        print(anchor.get('href'))
        print(anchor.get_text())
        break  # only the first anchor per row (the job-title link)
"""

# --- Regex extraction (reference, disabled) --------------------------------
"""
mm = re.findall(
    r'<div style="width: 224px; *width: 218px; _width: 200px; float: left">'
    r'<a style="font-weight: bold" par="(.*)" href="(.*)"'
    r' target="_blank">(.*)</a>',
    ss)
print(mm)
"""
Extracting page information in Python with BeautifulSoup, regular expressions, and lxml