#!/usr/bin/python
# -*- coding: utf-8 -*-
# Crawl Jiayuan (世纪佳缘).
# This site is genuinely annoying. I started with the Scrapy framework, but being new to it I had
# no way to handle pages rendered by JS, so I fell back to a plain hand-written crawler.
# With JS-rendered data, the page source may contain no data at all: the JS fetches it with an
# asynchronous request and then displays it, so to crawl this kind of data you only need to find
# the URL that the JS request is sent to.
# The request the JS sends may be a POST (as in this example) or a GET (e.g. the Douban movie
# chart), so first check which kind of request it is.
import sys
import json
# from lxml import etree
import urllib
import urllib2

reload(sys)
sys.setdefaultencoding("utf-8")  # set the default encoding; needed to turn unicode escapes into Chinese
                                 # characters in the output file, used together with decode("unicode_escape")


class Shiji():
    def __init__(self):
        self.page = 1                         # page number to start from
        self.filename = open("sj.json", "w")  # open the output file for writing

    def parse_page(self):
        url = "http://search.jiayuan.com/v2/search_v2.php?"  # first half of the url; the rest is sent as a POST request
        headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
        formdata = {
            "sex": "f",
            "key": "",
            "stc": "1:11,2:20.25,3:160.170,23:1",
            "sn": "default",
            "sv": "1",
            "p": self.page,  # p is the only field that changes between the async JS requests; it is the current page number
            "f": "select",
            "listStyle": "bigPhoto",
            "pri_uid": "170633820",
            "jsversion": "v5"
        }
        data = urllib.urlencode(formdata)
        request = urllib2.Request(url, data=data, headers=headers)
        response = urllib2.urlopen(request)
        # print response.info().get('Content-Encoding')  # prints how the server compressed the response; None if uncompressed
        js = response.read()  # response.read() is JSON, but wrapped in "##jiayser##" at the start and "##jiayser##//" at the end, so clean it up below
        # print type(js)  # str
        js = js.replace("##jiayser##", "").replace("//", "")  # replace() -- a remarkably handy string method
        # print js
        content = json.loads(js)  # dict (a big dict whose only keys are isLogin, count, pageTotal and userInfo; userInfo holds everyone's data)
        self.parse_person(content['userInfo'])  # hand the userInfo part of the big dict -- everyone's profile -- to parse_person
        # print type(content['userInfo'])

    def parse_person(self, userinfo):
        for i in range(len(userinfo)):
            form = {"nickname": userinfo[i]['nickname'],
                    "age": userinfo[i]['age'],
                    "inner monologue": userinfo[i]['shortnote']}  # put the fields we want to crawl into a dict
            # print form
            # print type(form)
            # dict and list objects cannot be written to a file directly: convert the dict to a JSON string
            # with json.dumps(form) before writing. decode("unicode_escape") turns the unicode escapes into
            # Chinese characters, but on its own it is not enough -- it also needs the "import sys ..." setup
            # above. Quite a hassle.
            form = json.dumps(form).decode("unicode_escape")
            self.filename.write(form + "\n")  # write one record per line
        if self.page < 10:  # any limit works here, as long as it does not go past the number of pages available
            self.page += 1
            self.parse_page()  # after incrementing the page number, call parse_page() again to crawl the next page


if __name__ == "__main__":
    sj = Shiji()
    sj.parse_page()  # call parse_page() to start crawling
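Since the script above is Python 2 (urllib2, reload(sys)/setdefaultencoding), here is, for comparison, a rough Python 3 sketch of the same idea using the requests library. It is untested; the endpoint, form fields, and the "##jiayser##" wrapper are copied from the script above and may well have changed on the live site since this was written.

```python
# Rough Python 3 / requests version of the same crawl -- an untested sketch.
# The endpoint, form fields and "##jiayser##" wrapper come from the original
# script and may have changed on the live site since then.
import json
import requests

URL = "http://search.jiayuan.com/v2/search_v2.php?"
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}


def fetch_page(page):
    """POST the same form the page's JS sends and return the parsed JSON dict."""
    formdata = {
        "sex": "f",
        "key": "",
        "stc": "1:11,2:20.25,3:160.170,23:1",
        "sn": "default",
        "sv": "1",
        "p": page,              # only p changes from page to page
        "f": "select",
        "listStyle": "bigPhoto",
        "pri_uid": "170633820",
        "jsversion": "v5",
    }
    resp = requests.post(URL, data=formdata, headers=HEADERS)
    resp.encoding = resp.apparent_encoding  # in case the server omits the charset
    # Same cleanup as the original script: the JSON is wrapped in "##jiayser## ... ##jiayser##//".
    raw = resp.text.replace("##jiayser##", "").replace("//", "")
    return json.loads(raw)


if __name__ == "__main__":
    content = fetch_page(1)
    for person in content["userInfo"]:
        print(person["nickname"], person["age"], person["shortnote"])
```

The point is the same as in the original script: once you know the XHR endpoint and its form fields, a "JS-rendered" page reduces to an ordinary HTTP request.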
References:
The "'ascii' codec can't encode characters in position ..." problem: https://www.cnblogs.com/technologylife/p/6071787.html and http://blog.sina.com.cn/s/blog_64a3795a01018vyp.html
The TypeError raised when writing a dict to a file: http://blog.csdn.net/guoweish/article/details/47106263
Also a post on undoing edits in vim on Ubuntu: http://blog.sina.com.cn/s/blog_7e9efc570101ays3.html

Takeaways: a decent haul this time. I worked through a lot of bugs I had never seen before, and it was my first time crawling data from a JS-rendered page. Worth remembering:
(1) how to find the data source of a JS-rendered page (F12 -> Network -> XHR, then check whether the request is a POST or a GET);
(2) the string's powerful replace() method;
(3) how to write a dict to a file (convert it to a JSON string first, then write it; see the sketch below);
(4) how to convert unicode escapes to Chinese characters (the import sys setup plus decode("unicode_escape")).
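As a footnote to points (3) and (4): on Python 3 the reload(sys)/setdefaultencoding workaround no longer exists, and json.dumps(..., ensure_ascii=False) together with a UTF-8 file handle writes the Chinese characters directly. A minimal sketch with a made-up sample record:

```python
# Minimal Python 3 illustration of points (3) and (4) above.
import json

# A made-up record standing in for one entry of userInfo.
form = {"nickname": "示例", "age": 25, "inner monologue": "这是一段中文独白"}

with open("sj.json", "w", encoding="utf-8") as f:
    # A dict cannot be written to a file directly, so convert it to a JSON string first.
    # ensure_ascii=False keeps the Chinese characters readable instead of \uXXXX escapes.
    f.write(json.dumps(form, ensure_ascii=False) + "\n")
```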
Python crawling Jiayuan (世纪佳缘): scraping pages rendered by JS