I wanted to analyze nationwide intern recruiting in the Internet industry, so I crawled Zhaopin, collected 15,467 records, and imported them into MySQL.
First, the spider:
import scrapy
from scrapy.http import Request
from lxml import etree
from zhaopinzhilian.items import ZhaopinzhilianItem


class RecuritSpider(scrapy.Spider):
    name = 'recurit'
    allowed_domains = ['zhaopin.com']
    # start_urls = ['http://www.zhaopin.com/']
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
    }
    # search-result URL for nationwide Internet-industry intern positions;
    # the page number is appended to the trailing "p=" parameter
    base_url = ("http://sou.zhaopin.com/jobs/searchresult.ashx?bj=5006000&sj=299"
                "&in=210500%3b160400%3b160000%3b160500%3b160200%3b300100%3b160100%3b160600"
                "&jl=%e9%80%89%e6%8b%a9%e5%9c%b0%e5%8c%ba&sb=2&sm=0&isfilter=0&fl=489"
                "&isadv=0&sg=2b24ff0c4e924139b8749ea5a59d2dbb&p=")

    def start_requests(self):
        return [Request(self.base_url + "1",
                        callback=self.parse,
                        headers=self.header,
                        dont_filter=True)]

    def parse(self, response):
        try:
            item = ZhaopinzhilianItem()
            data = response.text
            res = etree.HTML(data)
            # every "newlist" table on the result page is one job posting
            table_list = res.xpath('//table[@class="newlist"]')
            for table in table_list:
                item["link"] = table.xpath('.//td[@class="zwmc"]//a[1]//@href')
                for j in range(0, len(item["link"])):
                    surl = item["link"][j]
                    print(surl)
                    yield Request(surl, callback=self.next)
            # queue the remaining result pages
            last_page = 90   # placeholder; adjust to the actual number of result pages
            for i in range(2, last_page):
                url = self.base_url + str(i)
                yield Request(url, callback=self.parse)
        except Exception as e:
            print(e)

    def next(self, response):
        try:
            # conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root",
            #                        db="zhilian", charset="utf8")
            # cursor = conn.cursor()
            item = ZhaopinzhilianItem()
            item["zwmc"] = response.xpath("//div[@class='inner-left fl']/h1/text()").extract()
            item["gsmc"] = response.xpath("//div[@class='inner-left fl']/h2/a[@target='_blank']/text()").extract()
            res = etree.HTML(response.text)
            item["gsgm"] = res.xpath("/html/body/div[6]/div[2]/div[1]/ul/li[1]/strong/text()")
            zwyx = res.xpath("/html/body/div[6]/div[1]/ul/li[1]/strong/text()")
            item["zwyx"] = [zwyx[0].replace(u'元/月\xa0', u"")]   # strip the "yuan/month" suffix
            # print(item["zwyx"])
            item["gzdd"] = res.xpath("/html/body/div[6]/div[1]/ul/li[2]/strong/a/text()")
            zprs = res.xpath("/html/body/div[6]/div[1]/ul/li[7]/strong/text()")
            item["zprs"] = [zprs[0].replace(u'人', u"")]          # strip the "people" suffix
            item["minxueli"] = res.xpath("/html/body/div[6]/div[1]/ul/li[6]/strong/text()")
            # the direct MySQL insert is kept commented out; the actual insert
            # happens in the pipeline below
            # sql = ("INSERT INTO zhaopin (zwmc, gsmc, zwyx, zprs, gzdd, gsgm, minxueli) "
            #        "VALUES (%s, %s, %s, %s, %s, %s, %s);")
            # params = (item["zwmc"][0], item["gsmc"][0], item["zwyx"][0], item["zprs"][0],
            #           item["gzdd"][0], item["gsgm"][0], item["minxueli"][0])
            # cursor.execute(sql, params)
            # conn.commit()
            # cursor.close()
            # conn.close()
            yield item
        except Exception as e:
            print(e)
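The spider and the pipeline below both fill in fields of ZhaopinzhilianItem, so items.py has to declare them. A minimal sketch, assuming exactly the field names used above:

import scrapy


class ZhaopinzhilianItem(scrapy.Item):
    link = scrapy.Field()       # detail-page URLs collected from the result list
    zwmc = scrapy.Field()       # job title
    gsmc = scrapy.Field()       # company name
    zwyx = scrapy.Field()       # monthly salary
    gzdd = scrapy.Field()       # work location
    gsgm = scrapy.Field()       # company size
    minxueli = scrapy.Field()   # minimum education
    zprs = scrapy.Field()       # number of openings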
Then, in pipelines.py, the items are written into the database:
import pymysql


class ZhaopinzhilianPipeline(object):
    def process_item(self, item, spider):
        conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root",
                               db="zhilian", charset="utf8")
        cursor = conn.cursor()
        for i in range(0, len(item["zwmc"])):
            zwmc = item["zwmc"][i]
            gsmc = item["gsmc"][i]
            zwyx = item["zwyx"][i]
            gzdd = item["gzdd"][i]
            gsgm = item["gsgm"][i]
            minxueli = item["minxueli"][i]
            zprs = item["zprs"][i]
            sql = ("INSERT INTO zhaopin (zwmc, gsmc, zwyx, zprs, gzdd, gsgm, minxueli) "
                   "VALUES (%s, %s, %s, %s, %s, %s, %s);")
            params = (zwmc, gsmc, zwyx, zprs, gzdd, gsgm, minxueli)
            cursor.execute(sql, params)
            conn.commit()
        cursor.close()
        conn.close()
        return item
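The INSERT assumes the zhilian database and a zhaopin table already exist. A minimal one-off setup sketch, reusing the connection settings from the pipeline; the column names come from the INSERT statement, while the column types and lengths are assumptions:

import pymysql

# create the table the pipeline inserts into (run once, assuming db "zhilian" exists)
conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root",
                       db="zhilian", charset="utf8")
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS zhaopin (
            zwmc     VARCHAR(100),  -- job title
            gsmc     VARCHAR(100),  -- company name
            zwyx     VARCHAR(50),   -- monthly salary, kept as text
            zprs     VARCHAR(20),   -- number of openings
            gzdd     VARCHAR(50),   -- work location
            gsgm     VARCHAR(50),   -- company size
            minxueli VARCHAR(20)    -- minimum education
        ) DEFAULT CHARSET = utf8
    """)
conn.commit()
conn.close()

Since process_item opens and closes a connection for every item, moving the connect into the pipeline's open_spider hook and the close into close_spider would also avoid reconnecting 15,467 times.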
Finally, remember to enable the pipeline in settings.py:
ITEM_PIPELINES = {
    'zhaopinzhilian.pipelines.ZhaopinzhilianPipeline': 300,   # any priority from 0-1000 works
}
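With the pipeline enabled, the crawl can be started from the project root using the spider name defined above:

scrapy crawl recurit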