import re    # regular expressions for extracting job fields from the HTML
import time

import pandas  # used to dump the scraped rows to CSV

import requests

# Browser-like request headers, including a logged-in session cookie, so
# Lagou serves the real listing pages instead of an anti-crawler block.
# NOTE(review): the cookie below is session-bound and long expired — it was
# mangled by a text-processing pass, so its exact original bytes are
# unrecoverable; replace it with a fresh cookie before running.
header = {
    'user-agent': 'mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) gecko/20100101 firefox/57.0',
    'Cookie': ('JSESSIONID=abaaabaaadeaafi1e0f9e93b802b158b671ed843bed6de5; '
               'hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511754333; '
               'hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511765381; '
               '_ga=ga1.2.1259821048.1511754333; '
               'User_trace_token=20171127114539-6f956704-d325-11e7-ac7d-525400f775ce; '
               'lgrid=20171127144946-28372596-d33f-11e7-9a81-5254005c3644; '
               'Lguid=20171127114539-6f956cbc-d325-11e7-ac7d-525400f775ce; '
               '_gid=ga1.2.644825101.1511754336; '
               'X_http_token=2eb2d7bfeb14d998ae1bc4ce0efdc0f8; '
               '_putrc=59b1d3cedbe5250a; Login=true; '
               'unick=%e6%9c%b1%e4%b8%9c%e5%8d%8e; '
               'Showexpriedindex=1; showexpriedcompanyhome=1; '
               'showexpriedmypublish=1; hasdeliver=0; '
               'index_location_city=%e5%b9%bf%e5%b7%9e; '
               'Tg-track-code=search_code; '
               'Search_id=826f4d81a0324508892895d9400bffab'),
    'Host': 'www.lagou.com',
}

# Regex that pulls (salary, company, position name, link) out of each job
# card in the listing page HTML.  re.S lets '.' also match newlines, since
# one card's attributes may span several lines of markup.
ren = re.compile(
    r'data-salary="(.*?)" data-company="(.*?)" data-positionname="(.*?)" href="(.*?)"',
    re.S,
)

# Crawl listing pages 1..49 and append each page's extracted rows to a CSV.
# Columns (by capture-group order): salary, company, position name, link.
for page in range(1, 50):
    url = 'https://www.lagou.com/zhaopin/' + str(page)
    time.sleep(2)  # throttle: be polite to the server and avoid rate-limiting
    html = requests.request('GET', url, headers=header).text
    # re.findall with a multi-group pattern yields a list of 4-tuples,
    # which DataFrame turns into a 4-column table.
    data = pandas.DataFrame(re.findall(ren, html))
    # mode='a+' appends, so each page's rows accumulate in the same file;
    # header/index suppressed so the file is raw rows only.
    data.to_csv(
        'c:\\users\\administrator\\desktop\\python\\lagou1.csv',
        header=False, index=False, mode='a+',
    )
The script writes its results to a CSV file (openable in Excel) with four columns, in capture-group order: salary, company, position name, and job link.
Python crawler, part 2: scraping Lagou.com job listings with regular expressions