# -*- coding: utf-8 -*-
# @Time: 2018/3/1 16:38
# @Author: HT
# @Email: [Email protected]
# @File: 51job.py
# @Software: Pycharm
"""Crawl 51job search-result pages and append the job listings to a text file.

Workflow (see ``main``): download the first result page, read the total page
count from its pager, build one URL per page, then extract and store every
listing (job title, company, address, wages, date) found on each page.

NOTE(review): the regular expressions and the start URL were reconstructed
from a whitespace/case-damaged copy of this script -- confirm them against
the live site markup before relying on the output.
"""
import io
import re
import sys
from contextlib import closing

if sys.version_info[0] >= 3:
    from urllib.request import urlopen
else:  # Python 2 exposes urlopen directly on the urllib module
    from urllib import urlopen

# Statistics: running total of crawled entries (incremented by print_items).
i = 0

# File that data_to_txt appends to.
OUTPUT_FILE = u"51job North Canton Deep Python.txt"

# One match per listing row; the five groups are, in order:
# job title, company, address, wages, date.
JOB_PATTERN = re.compile(
    r'class="t1">.*?<a target="_blank" title="(.*?)".*?'
    r'<span class="t2"><a target="_blank" title="(.*?)".*?'
    r'<span class="t3">(.*?)</span>.*?'
    r'<span class="t4">(.*?)</span>.*?'
    r'<span class="t5">(.*?)</span>',
    re.S,
)

# Captures the pager text (e.g. "a total of 5 pages") that sits right before
# the "jump to page" input box of the first result page.
PAGE_COUNT_PATTERN = re.compile(
    r'<span class="td">(.*?)</span>'
    r'<input id="jump_page" class="mytxt" type="text" value="1"/>',
    re.S,
)


def url_input(url):
    """Download *url* and return its body decoded as GBK text.

    The response object is closed as soon as the body has been read.
    """
    with closing(urlopen(url)) as response:
        return response.read().decode('gbk')


def find_data(html):
    """Return every job listing found in *html*.

    Each item is a 5-tuple of strings:
    ``(job, company, address, wages, date)``.
    An empty list is returned when nothing matches.
    """
    return JOB_PATTERN.findall(html)


def find_all_page(html):
    """Return the total number of result pages as a string of digits.

    The pager text looks like "a total of 5 pages"; every non-digit
    character is stripped so that only the number remains.

    Raises:
        ValueError: if the pager element cannot be found in *html*.
    """
    page_all = PAGE_COUNT_PATTERN.findall(html)
    if not page_all:
        raise ValueError("page-count element not found in HTML")
    # \D (non-digit) is removed -- removing \d would discard the number itself.
    return re.sub(r"\D", "", page_all[0])


def data_to_txt(str):
    """Append the text *str* to OUTPUT_FILE, encoded as UTF-8.

    The parameter name shadows the ``str`` builtin; it is kept unchanged so
    existing callers are unaffected.
    """
    with io.open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        f.write(str)


def print_items(data_items):
    """Number, store and print every listing in *data_items*.

    *data_items* is an iterable of 5-item sequences as produced by
    ``find_data``.  The module-level counter ``i`` is incremented once per
    listing, so numbering continues across calls.
    """
    global i
    for data in data_items:
        job, company, address, wages, date = data[:5]
        i = i + 1
        str1 = ("[" + str(i) + "] " + job + "--" + company + "--"
                + address + "--" + wages + "--" + date + "\n")
        data_to_txt(str1)
        print(str1)


def urlformat(urlstart):
    """Turn the first-page URL into a template with ``{}`` as the page number.

    Only the first ``1.html`` is replaced, and the dot is matched literally.
    """
    return re.sub(r'1\.html', '{}.html', urlstart, count=1)


def get_page_html(page_num, urlstart):
    """Return the URLs of result pages 1..*page_num*.

    Pages are numbered from 1 (the start URL itself is page ``1.html``), so
    the range runs from 1 to *page_num* inclusive.
    """
    template = urlformat(urlstart)
    return [template.format(page) for page in range(1, page_num + 1)]


def main():
    """Crawl every result page of the configured search and save the listings."""
    # Alternative searches (swap in for ``urlstart`` below).
    # python
    # urlstart = 'http://search.51job.com/list/010000,000000,0000,00,9,99,Python%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # Embedded
    # urlstart = 'http://search.51job.com/list/010000,000000,0000,00,9,99,%25E5%25B5%258C%25E5%2585%25A5%25E5%25BC%258F%25E5%25BC%2580%25E5%258F%2591,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # Cloud Computing
    # urlstart = 'http://search.51job.com/list/010000,000000,0000,00,9,99,%25E4%25BA%2591%25E8%25AE%25A1%25E7%25AE%2597,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # Machine Learning
    # urlstart = 'http://search.51job.com/list/010000,000000,0000,00,9,99,%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # Artificial Intelligence
    # urlstart = 'http://search.51job.com/list/010000,000000,0000,00,9,99,%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # Automatic Driving
    # urlstart = 'http://search.51job.com/list/010000,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E9%25A9%25BE%25E9%25A9%25B6,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='

    # North Canton Deep Python
    # NOTE(review): URL reconstructed from a damaged copy -- verify it in a browser.
    urlstart = (
        'http://search.51job.com/list/'
        '010000%252C040000%252C020000%252C030200,000000,0000,00,9,99,python,2,1.html'
        '?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
        '&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0'
        '&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00'
        '&from=&welfare='
    )

    html = url_input(urlstart)
    all_page_num = int(find_all_page(html))
    print("+++++++++++++++++%s++++++++++++++++++++" % (all_page_num))

    for url in get_page_html(all_page_num, urlstart):
        html = url_input(url)
        data_items = find_data(html)
        print_items(data_items)


if __name__ == '__main__':
    main()
View Code
/span>
Crawling 51job job listings with Python