A complete Lagou.com crawler in Python


This builds on the earlier Jobbole (伯樂線上) crawler, so code shared with that project is not repeated here.
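The spider below imports get_md5 from ArticleSpider.utils.common, which hashes a URL into a fixed-length key used as the item's url_object_id. Since that shared module is not reproduced in this post, the following is only a minimal sketch of what it typically looks like, not the original code:

# ArticleSpider/utils/common.py -- minimal sketch; the shared module is not
# shown in the original post, so treat this as an assumption.
import hashlib

def get_md5(url):
    # Hash a URL into a fixed-length hex digest, used as url_object_id
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()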

In lagou.py:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ArticleSpider.utils.common import get_md5
from selenium import webdriver
import time
import pickle
from ArticleSpider.items import LagouJobItemLoader, LagouJobItem
from datetime import datetime


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']
    # headers = {
    #     "HOST": "www.lagou.com",
    #     "Referer": 'https://www.lagou.com',
    #     'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36"
    # }

    rules = (
        Rule(LinkExtractor(allow=r'gongsi/j/\d+.html'), follow=True),
        Rule(LinkExtractor(allow=r'zhaopin/.*'), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True),
    )

    def parse_job(self, response):
        # Parse a Lagou job-detail page
        item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")
        item_loader.add_css("tags", '.position-label li::text')
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())

        job_item = item_loader.load_item()
        return job_item

    def start_requests(self):
        # Log in through a real browser so Lagou sets the session cookies
        browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe")
        browser.get("https://passport.lagou.com/login/login.html?service=https%3a%2f%2fwww.lagou.com%2f")
        browser.find_element_by_xpath("/html/body/section/div[1]/div[2]/form/div[1]/input").send_keys("account")  # fill in a valid Lagou account
        browser.find_element_by_xpath("/html/body/section/div[1]/div[2]/form/div[2]/input").send_keys("password")  # fill in the Lagou password
        print(browser.page_source)
        browser.find_element_by_xpath("/html/body/section/div[1]/div[2]/form/div[5]").click()
        time.sleep(10)
        Cookies = browser.get_cookies()
        # print(Cookies)
        cookie_dict = {}
        for cookie in Cookies:
            # Persist each cookie to disk so later runs could skip the browser login
            f = open('C:/Users/Dell/scrapytest/Scripts/ArticleSpider' + cookie['name'] + '.lagou', 'wb')
            pickle.dump(cookie, f)
            f.close()
            cookie_dict[cookie['name']] = cookie['value']
        browser.close()
        return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]
        # return [scrapy.Request(url=self.start_urls[0], headers=self.headers, dont_filter=True, cookies=cookie_dict)]
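Note that start_requests pickles each cookie to disk but never reads the files back, so every run logs in through the browser again. A small helper along these lines (load_lagou_cookies is hypothetical, not part of the original post) could rebuild the cookie dict from the dumped files and skip the Selenium login on later runs:

# Hypothetical helper, not in the original post: rebuild the cookie dict
# from the files pickled in start_requests above.
import os
import pickle

def load_lagou_cookies(cookie_dir="C:/Users/Dell/scrapytest/Scripts"):
    # start_requests writes files named "ArticleSpider<cookie name>.lagou"
    # into this directory; each file holds one pickled cookie dict.
    cookie_dict = {}
    for filename in os.listdir(cookie_dir):
        if filename.endswith(".lagou"):
            with open(os.path.join(cookie_dir, filename), "rb") as f:
                cookie = pickle.load(f)
                cookie_dict[cookie["name"]] = cookie["value"]
    return cookie_dict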

In main.py:

from scrapy.cmdline import execute
import sys
import os

# Make the project root importable when launching from an IDE
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# execute(["scrapy", "crawl", "jobbole"])
# execute(["scrapy", "crawl", "zhihu"])
execute(["scrapy", "crawl", "lagou"])

In items.py:

# The imports below are shared with the earlier Jobbole project and were not
# repeated in the original post; they are listed here so the file runs standalone.
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from w3lib.html import remove_tags
from ArticleSpider.settings import SQL_DATETIME_FORMAT  # datetime format string, assumed defined in the shared settings


def remove_splash(value):
    # Strip the slash from fields such as the job city
    return value.replace("/", "")


def handle_jobaddr(value):
    # Drop the "查看地圖" ("view map") link text and collapse the address lines
    addr_list = value.split("\n")
    addr_list = [item.strip() for item in addr_list if item.strip() != "查看地圖"]
    return "".join(addr_list)


class LagouJobItemLoader(ItemLoader):
    # Custom ItemLoader: take the first extracted value by default
    default_output_processor = TakeFirst()


class LagouJobItem(scrapy.Item):
    # Job posting fields scraped from Lagou
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    work_years = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    degree_need = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field(
        input_processor=MapCompose(remove_tags, handle_jobaddr),
    )
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    tags = scrapy.Field(
        input_processor=Join(",")
    )
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into lagou_job(title, url, url_object_id, salary, job_city, work_years, degree_need,
            job_type, publish_time, job_advantage, job_desc, job_addr, company_name, company_url,
            tags, crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_desc=VALUES(job_desc)
        """
        params = (
            self["title"], self["url"], self["url_object_id"], self["salary"], self["job_city"],
            self["work_years"], self["degree_need"], self["job_type"],
            self["publish_time"], self["job_advantage"], self["job_desc"],
            self["job_addr"], self["company_name"], self["company_url"],
            self["tags"], self["crawl_time"].strftime(SQL_DATETIME_FORMAT),  # fixed: the 15th slot must be tags, not a second job_addr
        )
        return insert_sql, params
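As a quick illustration (not part of the original post), this is how the two processors transform raw extracted text:

# Illustrative only: expected behaviour of the processors defined above
print(remove_splash("上海/"))                        # -> "上海"
print(handle_jobaddr("上海 - 徐匯區\n查看地圖\n"))    # -> "上海 - 徐匯區"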

The database design:
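The original post showed the table design as a screenshot, which has not survived. A schema consistent with get_insert_sql above would look roughly like this; the column names come from the insert statement, while the types, lengths, and key choice are assumptions:

-- Sketch reconstructed from get_insert_sql; types and lengths are assumptions.
-- ON DUPLICATE KEY UPDATE requires a unique key, hence the primary key on url_object_id.
CREATE TABLE lagou_job (
    title          VARCHAR(255) NOT NULL,
    url            VARCHAR(300) NOT NULL,
    url_object_id  VARCHAR(50)  NOT NULL,   -- md5 of url
    salary         VARCHAR(20),
    job_city       VARCHAR(20),
    work_years     VARCHAR(100),
    degree_need    VARCHAR(30),
    job_type       VARCHAR(20),
    publish_time   VARCHAR(20),
    job_advantage  VARCHAR(1000),
    job_desc       LONGTEXT,
    job_addr       VARCHAR(50),
    company_name   VARCHAR(100),
    company_url    VARCHAR(300),
    tags           VARCHAR(100),
    crawl_time     DATETIME,
    PRIMARY KEY (url_object_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;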

