Crawl Zhaopin Recruitment Information
Last Update: 2018-07-24
Source: Internet
Author: User
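The script below is a small Python 2 crawler. It queries Zhaopin's job-search endpoint for "Big Data" positions in Shanghai, follows each result's link to the posting's detail page, scrapes the job title, company, salary, location, and posting date, and appends each record to an Excel file (./zlzp.xls) using xlwt, xlrd, and xlutils.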
#!/usr/bin/python
# encoding: utf-8
import requests
from bs4 import BeautifulSoup
import codecs
import xlwt
from xlutils.copy import copy
from xlrd import open_workbook
import os
class Spider():
    def __init__(self):
        self.url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?'
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
        }
        # query parameters: jl = city, kw = search keyword, p = page number,
        # isadv = 0 disables the advanced-search mode
        self.data = {
            'jl': 'Shanghai',
            'kw': 'Big Data',
            'p': 1,
            'isadv': 0
        }
        self.filename = './zlzp.xls'
        # create the workbook once and write the header row
        f = xlwt.Workbook(encoding='utf-8')
        sheet1 = f.add_sheet('Sheet1')
        row = ['position', 'company', 'salary', 'address', 'date']
        for i in range(len(row)):
            sheet1.write(0, i, row[i])
        f.save(self.filename)
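    # xlwt can only create a new .xls; to append rows afterwards, zlzp() below
    # re-opens the saved file with xlrd and turns it into a writable copy via
    # xlutils.copy before writing each scraped record.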
    def zlzp(self):
        html = requests.get(self.url, headers=self.headers, params=self.data)
        soup = BeautifulSoup(html.text, 'html.parser')
        # f = codecs.open('./zhilian.html', 'w', 'utf-8')
        # f.write(html.text)
        # f.close()
        newlist = soup.find('div', {'class': 'newlist_list_content'})
        tables = newlist.find_all('table', {'class': 'newlist'})
        line = 1
        # tables[0] is the column-header table of the result list, so start at 1
        for i in range(1, len(tables)):
            table = tables[i]
            link = table.find('a')['href']
            link = str(link)
            position, company, salary, address, date = self.get_info(link)
            print position, company, salary, address, date
            rb = open_workbook(self.filename)
            wb = copy(rb)
            jobdata = [position.decode('utf-8'), company.decode('utf-8'),
                       salary.decode('utf-8'), address.decode('utf-8'),
                       date.decode('utf-8')]
            sheet = wb.get_sheet(0)
            for j in range(len(jobdata)):
                sheet.write(line, j, jobdata[j])
            line += 1
            os.remove(self.filename)
            wb.save(self.filename)
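    # NOTE: re-opening, copying, deleting, and re-saving the .xls for every row
    # is quadratic in file I/O; collecting all records first and saving the
    # workbook once after the loop would do the same job with a single write.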
    def get_info(self, link):
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
        }
        html = requests.get(link, headers=header)
        soup = BeautifulSoup(html.text, 'html.parser')
        try:
            tfb = soup.find('div', {'class': 'top-fixed-box'})
            position = tfb.find('h1').text.encode('utf-8')  # job title
            company = tfb.find('h2').text.encode('utf-8')   # company
            tpl = soup.find('div', {'class': 'terminalpage-left'})
            tuc = tpl.find('ul', {'class': 'terminal-ul clearfix'})
            lis = tuc.find_all('li')
            salary = lis[0].find('strong').text.encode('utf-8')   # salary
            address = lis[1].find('strong').text.encode('utf-8')  # location
            date = lis[2].find('strong').text.encode('utf-8')     # posting date
            return position, company, salary, address, date
        except Exception as e:
            # caveat: on a parse failure this falls through and returns None,
            # so the tuple unpack in zlzp() would then raise a TypeError
            print e
if __name__ == '__main__':
    spider = Spider()
    spider.zlzp()
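As written, the script only fetches the first page of results ('p': 1). Below is a minimal multi-page sketch, assuming the 2018-era searchresult.ashx endpoint and page markup still behave as above (both may have changed since); it would replace the __main__ block, and the per-page file names are illustrative:

#!/usr/bin/python
# encoding: utf-8
# Hypothetical multi-page driver for the Spider class above. Each Spider()
# call recreates ./zlzp.xls with a fresh header row, so every page is
# crawled into its own file and renamed afterwards.
import os

if __name__ == '__main__':
    for page in range(1, 4):                  # result pages 1-3
        spider = Spider()                     # recreates ./zlzp.xls
        spider.data['p'] = page               # 'p' selects the result page
        spider.zlzp()
        os.rename(spider.filename, './zlzp_p%d.xls' % page)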