#Author: Mini
#! /usr/bin/env python
import html
import re
import time
import urllib.error
import urllib.parse
import urllib.request
def use_proxy(proxy_addr, url, *, error_delay=1):
    """Fetch *url* through an HTTP proxy and return the raw response body.

    A browser-like ``User-Agent`` header is sent with the request. The
    opener built for the proxy is also installed as the process-wide
    default opener of ``urllib.request``.

    Args:
        proxy_addr: Proxy address as ``"host:port"``; applied to ``http``
            URLs only.
        url: Absolute URL to download.
        error_delay: Seconds to sleep after a failed request before
            returning, so that callers looping over many URLs back off.

    Returns:
        The response body as ``bytes``, or ``None`` when the request
        failed with ``urllib.error.URLError`` (including ``HTTPError``).

    Raises:
        ValueError: If *url* has no scheme (raised by ``Request``).
    """
    try:
        req = urllib.request.Request(url)
        req.add_header(
            "User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) "
            "Gecko/20100101 Firefox/56.0",
        )
        # The mapping key must be the bare scheme name for the handler to
        # pick the proxy up.
        proxy = urllib.request.ProxyHandler({"http": proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req) as response:
            return response.read()
    except urllib.error.URLError as err:
        # HTTPError carries both attributes; a plain URLError only "reason".
        if hasattr(err, "code"):
            print(err.code)
        if hasattr(err, "reason"):
            print(err.reason)
        time.sleep(error_delay)
    return None
# Search keyword and the proxy every request is routed through.
key = "Galaxy Macau"
proxy = "127.0.0.1:8888"

# Result pages FIRST_PAGE..LAST_PAGE (inclusive) are crawled.
FIRST_PAGE = 1
LAST_PAGE = 99

# Directory prefix that downloaded articles are written under.
OUTPUT_DIR = "e:/m/"

# Captures the href of every result link; compiled once, reused per page.
ARTICLE_LINK_RE = re.compile(r'<a target="_blank" href="(.*?)"', re.S)


def build_search_url(keyword, page):
    """Return the Sogou WeChat search URL for *keyword* at result *page*.

    The keyword is percent-encoded exactly once (spaces become ``+``), so
    calling this for successive pages never double-encodes it.
    """
    query = urllib.parse.quote_plus(keyword)
    return (
        "http://weixin.sogou.com/weixin?oq=&query=" + query
        + "&_sug_type_=1&sut=0&lkt=0%2C0%2C0&s_from=input&ri=0&_sug_=n"
        + "&type=2&sst0=1507173318682&page=" + str(page)
        + "&ie=utf8&p=40040108&dp=1&w=01015002&dr=1"
    )


def extract_article_urls(page_text):
    """Return the link targets found in a result page, HTML-unescaped.

    The hrefs appear inside HTML attributes, so ``&amp;`` (and any other
    entity) is converted back to the literal character.
    """
    return [html.unescape(href) for href in ARTICLE_LINK_RE.findall(page_text)]


def main():
    """Crawl each result page for ``key`` and save every linked article.

    Progress and failures are reported on stdout. A page that cannot be
    fetched or contains no links is skipped; an article that cannot be
    fetched or written is reported and the crawl continues.
    """
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        page_data = use_proxy(proxy, build_search_url(key, page))
        # use_proxy returns None on failure; treat that as an empty page.
        page_text = page_data.decode("utf-8", errors="replace") if page_data else ""
        print(len(page_text))

        article_urls = extract_article_urls(page_text)
        if not article_urls:
            print(str(page) + "\t\tfail!")
            continue

        for index, article_url in enumerate(article_urls):
            label = str(page) + "\tweb\t\t" + str(index)
            article = use_proxy(proxy, article_url)
            if article is None:
                # Nothing was downloaded, so do not create an empty file.
                print(label + "\tessay\tfail!")
                continue

            # Tab characters are not allowed in Windows file names, so the
            # file name uses underscores as separators.
            path = OUTPUT_DIR + "{}_web_{}_essay.html".format(page, index)
            try:
                with open(path, "wb") as handle:
                    handle.write(article)
            except OSError as err:
                print(err)
                print(label + "\tessay\tfail!")
            else:
                print(label + "\tessay\tsuccess!")


if __name__ == "__main__":
    main()
# Web crawling (PLUS5): crawling WeChat