A simple crawler that scrapes Python job listings from 51job.
# -*- coding: UTF-8 -*-
"""POST a job-search query to 51job and write the result rows to B.txt.

Each output line is: title*company*location*salary*date (joined with '*').

Fixes over the original paste:
- ``request.request`` -> ``request.Request`` (correct class name).
- ``if j = 0`` -> equality comparison (assignment is a SyntaxError here).
- Missing closing parentheses on ``urlopen(...)`` and ``f.write(...)``.
- Invalid identifier ``return _`` -> ``response``.
- ``i.find("")`` (empty tag name) -> ``i.find("a")``, the job-title link.
- File handle managed with ``with`` so it is closed even on error.
"""
from urllib import parse, request

from bs4 import BeautifulSoup
import pymysql  # NOTE(review): unused in this snippet; presumably for later DB storage — confirm

url = "http://search.51job.com/jobsearch/search_result.php"

req = request.Request(url)
req.add_header(
    "User-Agent",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
)
req.add_header("Origin", "http://search.51job.com")

# Search form fields; the 51job endpoint expects GBK-encoded POST data.
post_data = parse.urlencode([
    ("fromJs", "1"),
    ("jobarea", "040000"),
    ("keyword", "python"),
    ("keywordtype", "2"),
    ("lang", "c"),
    ("stype", "2"),
    ("postchannel", "0000"),
    ("fromType", "1"),
    ("confirmdate", "9"),
])
print(post_data)

response = request.urlopen(req, data=post_data.encode("gbk"))
# gb18030 is a superset of GBK, so it decodes the GBK response safely.
content = response.read().decode("gb18030")
soup = BeautifulSoup(content, "html.parser")

# NOTE(review): info_set is printed but never populated in this snippet — confirm intent.
info_set = set()

with open("B.txt", "w") as out:
    table = soup.find("div", class_="dw_table")
    # The first div.el is the table header row — skip it.
    for row in table.find_all("div", class_="el")[1:]:
        line = (
            row.find("a").get_text().strip() + "*"
            + row.find("span", class_="t2").string + "*"
            + row.find("span", class_="t3").string + "*"
            + row.find("span", class_="t4").string + "*"
            + row.find("span", class_="t5").string + "\n"
        )
        out.write(str(line))
        print("downloaded")

print(info_set)
# TODO: pagination not implemented (the original note flags it as the hard part).
View Code