Crawling Zhaopin recruitment listings with the Scrapy framework and storing the extracted job information.
1. In the terminal, create a new spider inside the existing JobSpider project:
scrapy genspider zlzp baidu.com
2. Start parsing data
1) Roughly plan the functions that will be required.
2) Function 1 redirects to function 2 using `yield scrapy.Request(url, callback, meta, dont_filter)`.
# -*- coding: utf-8 -*-
import scrapy

from ..items import JobspiderItem


class ZlzpSpider(scrapy.Spider):
    """Spider for Zhaopin (sou.zhaopin.com) job search results.

    Crawls the search-result pages for the keywords python, php and html
    across five cities and yields one ``JobspiderItem`` per job row,
    following pagination until the last page.
    """

    name = 'zlzp'
    allowed_domains = ['zhaopin.com']
    # `jl` is the URL-encoded city filter
    # (Beijing+Shanghai+Guangzhou+Shenzhen+Wuhan),
    # `kw` is the search keyword, `p` is the page number.
    start_urls = [
        'http://sou.zhaopin.com/jobs/searchresult.ashx?'
        'jl=%E5%8C%97%E4%BA%AC%2B%E4%B8%8A%E6%B5%B7%2B%E5%B9%BF%E5%B7%9E'
        '%2B%E6%B7%B1%E5%9C%B3%2B%E6%AD%A6%E6%B1%89&kw=python&p=1&isadv=0',
        'http://sou.zhaopin.com/jobs/searchresult.ashx?'
        'jl=%E5%8C%97%E4%BA%AC%2B%E4%B8%8A%E6%B5%B7%2B%E5%B9%BF%E5%B7%9E'
        '%2B%E6%B7%B1%E5%9C%B3%2B%E6%AD%A6%E6%B1%89&kw=php&p=1&isadv=0',
        'http://sou.zhaopin.com/jobs/searchresult.ashx?'
        'jl=%E5%8C%97%E4%BA%AC%2B%E4%B8%8A%E6%B5%B7%2B%E5%B9%BF%E5%B7%9E'
        '%2B%E6%B7%B1%E5%9C%B3%2B%E6%AD%A6%E6%B1%89&kw=html&p=1&isadv=0',
    ]

    def parse(self, response):
        """Entry point: forward each start-url response to the row parser.

        ``dont_filter=True`` is required because the request re-uses
        ``response.url``, which the duplicate filter would otherwise drop.

        :param response: response for one of ``start_urls``
        :return: yields a :class:`scrapy.Request` for ``parse_job_info``
        """
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_job_info,
            meta={},
            dont_filter=True,
        )

    def parse_job_info(self, response):
        """Parse every job row on one search-result page.

        :param response: the search-result page response
        :return: yields one ``JobspiderItem`` per row, then a request
            that hands the page to :meth:`parse_next_page`
        """
        # NOTE: <tbody> is inserted by the browser when rendering; it is
        # absent from the raw HTML, so it must not appear in the xpath.
        zl_table_list = response.xpath(
            "//div[@id='newlist_list_content_table']/table[@class='newlist']")
        # The first table is the column-header row — skip it.
        for zl_table in zl_table_list[1:]:
            # Locate cells by class attribute instead of positional index:
            # rows do not always contain five <td> elements, so indexing
            # like td[5] can raise an index-out-of-bounds error.
            # //text() collects all text nodes inside the tag; extract()
            # converts the selector list into a list of strings.
            td1 = zl_table.xpath(
                "tr/td[@class='zwmc']/div/a//text()").extract()
            # strip() clears whitespace only at both ends of each fragment.
            td1 = list(map(str.strip, td1))
            job_name = "".join(td1).replace(",", "/")
            # extract_first(default) returns the first match as text, or
            # the given default when nothing matched.
            fan_kui_lv = zl_table.xpath(
                "tr/td[@class='fk_lv']/span/text()"
            ).extract_first('no feedback rate').strip()
            job_company_name = zl_table.xpath(
                "tr/td[@class='gsmc']/a[1]/text()"
            ).extract_first('no company name').strip()
            job_salary = zl_table.xpath(
                "tr/td[@class='zwyx']/text()"
            ).extract_first('negotiable').strip()
            job_place = zl_table.xpath(
                "tr/td[@class='gzdd']/text()"
            ).extract_first('no workspace').strip()
            print(job_name, fan_kui_lv, job_company_name, job_salary,
                  job_place)

            item = JobspiderItem()
            item['job_name'] = job_name
            item['job_company_name'] = job_company_name
            item['job_place'] = job_place
            item['job_salary'] = job_salary
            item['job_time'] = "no time"
            item['job_type'] = "Zhaopin recruitment"
            item['fan_kui_lv'] = fan_kui_lv
            yield item

        # After all rows are handled, look for the next result page.
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            meta={},
            dont_filter=True,
        )

    def parse_next_page(self, response):
        """Follow the pagination link, if any.

        :param response: the current search-result page
        :return: yields a request for the next page, or nothing when the
            last page has been reached
        """
        # BUG FIX: the original passed extract_first('no next'), whose
        # truthy string default made `if next_page:` always true, so the
        # spider would request the literal URL 'no next' on the last page.
        # With no default, extract_first() returns None and the `if`
        # correctly terminates pagination.
        next_page = response.xpath(
            "//a[text()='Next page']/@href").extract_first()
        if next_page:
            yield scrapy.Request(
                url=next_page,
                callback=self.parse_job_info,
                meta={},
                dont_filter=True,
            )
3. The other files in the JobSpider project can be reused directly; no extra configuration is needed.
4. The running result is as follows: