This article analyzes an approach to crawling Sogou's WeChat public account articles with Python and converting them into permanent links. It is shared here as a reference for anyone doing similar work.
This article focuses on the overall idea; adapt the code to your own needs.
Search Sogou for public accounts and articles
Obtain permanent links through the WeChat public platform (mp.weixin.qq.com)
Python + the Scrapy framework
MySQL database for storing and reading public accounts (see the sketch after this list)
Fetch the day's information rankings from Sogou
Crawl public accounts for a user-specified keyword via Scrapy
Obtain cookie information by logging in to the public platform
Because simulated login to the public platform has not yet been solved, you must log in manually and capture the cookie information in real time
With that cookie in hand, the permanent-link conversion can be performed
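The original post does not show the storage layer, so here is a minimal sketch of the MySQL store/read step using pymysql; the database, table, and column names are assumptions, not from the original code.

import pymysql

# Minimal sketch: store and read crawled public account records.
# Database/table/column names ("sougou", "gongzhonghao") are hypothetical.
conn = pymysql.connect(host="localhost", user="root", password="secret",
                       database="sougou", charset="utf8mb4")

def save_account(name, url_link):
    # Insert one crawled record
    with conn.cursor() as cur:
        cur.execute("INSERT INTO gongzhonghao (name, url_link) VALUES (%s, %s)",
                    (name, url_link))
    conn.commit()

def load_accounts():
    # Read back all stored public accounts
    with conn.cursor() as cur:
        cur.execute("SELECT name, url_link FROM gongzhonghao")
        return cur.fetchall()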
Code section
def parse(self, response):
    item = SougouItem()
    item["title"] = response.xpath('//title/text()').extract_first()
    print("**", item["title"], "**")
    name = input("---------- Please enter the keyword to search for: ")
    print(name)
    url = "http://weixin.sogou.com/weixin?query=" + name + "&type=2&page=1&ie=utf8"
    yield scrapy.Request(url=url, callback=self.parse_two, meta={"name": name})
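One detail worth noting: the code above splices the raw input into the URL, but if the keyword contains Chinese characters it should be percent-encoded first. A small sketch using the standard library:

from urllib.parse import quote

name = input("Please enter the keyword to search for: ")
# Percent-encode the keyword so non-ASCII characters survive the query string
url = "http://weixin.sogou.com/weixin?query=" + quote(name) + "&type=2&page=1&ie=utf8"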
If requests are sent too quickly, Sogou flags the traffic and requires a CAPTCHA.
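Besides solving the CAPTCHA when it appears, you can reduce how often it is triggered by throttling the spider. A minimal sketch of the relevant Scrapy settings; the specific values are assumptions, not from the original post.

# settings.py -- slow the spider down to reduce CAPTCHA challenges
DOWNLOAD_DELAY = 3                 # seconds between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True    # jitter the delay (0.5x to 1.5x)
AUTOTHROTTLE_ENABLED = True        # adapt the delay to server response times
CONCURRENT_REQUESTS_PER_DOMAIN = 1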
# Assumes module-level imports: re, json, time, requests, scrapy,
# "from lxml import etree", and the SougouItem item class.
def parse_two(self, response):
    print(response.url)
    name = response.meta["name"]
    resp = response.xpath('//ul[@class="news-list"]/li')
    s = 1
    # Check whether the URL indicates that a CAPTCHA is required
    res = re.search("from", response.url)
    if res:
        # CAPTCHA verification required: download the image, ask the user to solve it
        print(response.url)
        img = response.xpath('//img/@src').extract()
        print(img)
        url_img = "http://weixin.sogou.com/antispider/" + img[1]
        print(url_img)
        url_img = requests.get(url_img).content
        with open("urli.jpg", "wb") as f:
            f.write(url_img)
        img = input("Please enter the CAPTCHA: ")
        print(img)
        url = response.url
        r = re.search(r"from=(.*)", url).group(1)
        print(r)
        postdata = {"c": img, "r": r, "v": "5"}
        url = "http://weixin.sogou.com/antispider/thank.php"
        yield scrapy.FormRequest(url=url, formdata=postdata,
                                 callback=self.parse_two, meta={"name": name})
    else:
        # No CAPTCHA required: walk the search result list
        for i, res in enumerate(resp, start=1):
            item = SougouItem()
            item["url"] = res.xpath('.//p[1]/a/@href').extract_first()
            item["name"] = name
            print("article %d" % i)
            # Convert to a permanent link via the public platform API.
            # The token and Cookie below come from a manual login and must be
            # replaced with your own values.
            headers = {
                "Host": "mp.weixin.qq.com",
                "Connection": "keep-alive",
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "X-Requested-With": "XMLHttpRequest",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                              "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
                "Referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=938949250&lang=zh_CN",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.9",
                # Paste your own cookie string, captured after manually logging in
                # (the value printed in the original post is session-specific)
                "Cookie": "noticeLoginFlag=1; ..."
            }
            respon = requests.get(url=item["url"]).content
            gongzhongh = etree.HTML(respon).xpath('//a[@id="post-user"]/text()')[0]
            # times = etree.HTML(respon).xpath('//*[@id="post-date"]/text()')[0]
            title_one = etree.HTML(respon).xpath('//*[@id="activity-name"]/text()')[0].split()[0]
            print(gongzhongh, title_one)
            item["tit"] = title_one
            item["gongzhongh"] = gongzhongh
            # item["times"] = times
            # Look up the account's fakeid by name
            url = ("https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz"
                   "&token=938949250&lang=zh_CN&f=json&ajax=1&query=" + gongzhongh +
                   "&begin=0&count=5")
            resp = requests.get(url=url, headers=headers).content
            print(resp)
            faskeids = json.loads(resp.decode("utf-8"))
            try:
                list_fask = faskeids["list"]
            except Exception:
                print("**********[INFO]: request failed, the login has expired, please log in again *************")
                return
            for fask in list_fask:
                fakeid = fask["fakeid"]
                nickname = fask["nickname"]
                if nickname == item["gongzhongh"]:
                    # Query the account's article list for the matching title
                    url = ("https://mp.weixin.qq.com/cgi-bin/appmsg?token=938949250"
                           "&f=json&ajax=1&action=list_ex&begin=0&count=5&query=" +
                           item["tit"] + "&fakeid=" + fakeid + "&type=9")
                    resp = requests.get(url=url, headers=headers).content
                    app_msg_list = json.loads(resp.decode("utf-8"))["app_msg_list"]
                    # Take the first result; a commented-out variant in the
                    # original draft looped over app_msg_list, matched
                    # app["title"] against item["tit"], and formatted
                    # app["update_time"] into item["times"] before yielding.
                    app = app_msg_list[0]
                    item["aid"] = app["aid"]
                    item["appmsgid"] = app["appmsgid"]
                    item["cover"] = app["cover"]
                    item["digest"] = app["digest"]
                    item["url_link"] = app["link"]
                    item["tit"] = app["title"]
                    print(item)
                    time.sleep(10)  # slow down to avoid the anti-spider checks
    # Follow the "next page" link if one exists
    # (note: s resets each call; carrying the page number in meta would be more robust)
    if response.xpath('//a[@class="np"]'):
        s += 1
        url = "http://weixin.sogou.com/weixin?query=" + name + "&type=2&page=" + str(s)
        yield scrapy.Request(url=url, callback=self.parse_two, meta={"name": name})
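Because the login is manual, the cookie is usually copied from the browser's developer tools as one long header string. Here is a small helper sketch (the function name is ours, not from the original post) for turning that string into a dict that requests can consume:

def cookie_string_to_dict(cookie_str):
    """Split a raw 'k=v; k2=v2' cookie header copied from the browser
    into a dict usable with requests' cookies= parameter."""
    cookies = {}
    for pair in cookie_str.split(";"):
        if "=" in pair:
            k, v = pair.strip().split("=", 1)
            cookies[k] = v
    return cookies

# Usage: paste the string captured after manually logging in to mp.weixin.qq.com
# cookies = cookie_string_to_dict("noticeLoginFlag=1; ua_id=...")
# requests.get(url, headers=headers, cookies=cookies)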