#
#author: Wuhao
#
# Crawl the images from the specified page range. To crawl every picture of a
# given category the overall structure stays the same, but the request URL
# must be analysed separately.
#
import os
import re
import urllib.parse
import urllib.request

# Browser-like request headers so Baidu serves the JSON result payload
# instead of rejecting the scraper.  Cookie/Referer values come from a
# captured browser session and may need refreshing — TODO confirm they
# are still accepted before a long crawl.
HEADER = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/56.0.2924.87 Safari/537.36'),
    'Accept': 'text/plain, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
    'Referer': ('https://image.baidu.com/search/index?tn=baiduimage&ipn=r'
                '&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1'
                '&fmq=1490169358952_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0'
                '&width=&height=&face=0&istype=2&ie=utf-8'
                '&word=%E7%BE%8E%E5%A5%B3'),
}

# Endpoint discovered by inspecting the search page's XHR traffic; it
# returns JSON with 30 results per request ({pagenum} is the result offset).
URL_TEMPLATE = (
    'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj'
    '&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8'
    '&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={word}&s=&se=&tab=&width='
    '&height=&face=0&istype=2&qc=&nc=1&fr=&cg=girl&pn={pagenum}&rn=30'
    '&gsm=1e00000000001e&1490169411926='
)

# One consistent save directory.  (The original created one folder but
# downloaded into a different one, so every urlretrieve call failed.)
SAVE_DIR = r'f:\myproject\mycrawlpic\beauty'
PAGE_SIZE = 30          # results returned per request (rn=30 in the URL)
MAX_PICTURES = 30 * 100  # crawl roughly the first 3000 results

# Compiled once and hoisted out of the per-page loop.
THUMB_RE = re.compile(r'thumbURL.*?\.jpg')


def extract_thumb_urls(html):
    """Return every thumbnail .jpg URL embedded in a Baidu JSON response.

    Each raw regex match still carries the 'thumbURL":"' JSON prefix,
    which is stripped before the list is returned.
    """
    return [match.replace('thumbURL":"', '') for match in THUMB_RE.findall(html)]


def fetch_page(url):
    """GET one results page; return the decoded body, or None on failure.

    Catches only network errors and bad encodings (the original used a
    bare ``except``) and lets the caller skip just this one page.
    """
    request = urllib.request.Request(url, headers=HEADER)
    try:
        with urllib.request.urlopen(request) as response:
            return response.read().decode('utf-8')
    except (OSError, UnicodeDecodeError):
        print('Something wrong!')
        return None


def main():
    """Crawl MAX_PICTURES Baidu image-search results and save thumbnails."""
    # keyword = input('Please enter search keywords:')
    keyword = urllib.parse.quote('Beauty', 'utf-8')  # URL-encode the query
    os.makedirs(SAVE_DIR, exist_ok=True)
    saved = 0  # running index used to name the downloaded files
    # Walk the result offsets 30, 60, ... one page (30 results) at a time.
    for offset in range(PAGE_SIZE, MAX_PICTURES + 1, PAGE_SIZE):
        print('-------------now page =' + str(offset))
        html = fetch_page(URL_TEMPLATE.format(word=keyword, pagenum=str(offset)))
        if html is None:
            # A failed page no longer poisons the rest of the crawl: the
            # original set a sticky error flag that skipped every later page.
            continue
        # Append ('a') instead of 'w' so earlier pages' URLs are not lost.
        with open('testpic1.txt', 'a') as log:
            for thumb in extract_thumb_urls(html):
                print(thumb)
                log.write(thumb + '\n')
                try:
                    urllib.request.urlretrieve(
                        thumb,
                        os.path.join(SAVE_DIR, 'pic{num}.jpg'.format(num=saved)))
                except OSError:
                    continue  # one dead thumbnail should not stop the run
                saved += 1


if __name__ == '__main__':
    main()
# Python crawler: crawl Baidu image-search results.