# -*- coding: utf-8 -*-
"""Crawl a JD.com search (keyword: water heater, brand: Haier) and store
the first N result pages into a local HTML file, then parse out the links."""
import time
import re  # kept from the original file; unused in this chunk

import requests
from bs4 import BeautifulSoup  # fixed: the class is BeautifulSoup, not 'beautifulsoup'

# JD search URL; the page number is appended per request. JD numbers result
# pages 1, 3, 5, ... — hence the step of 2 in getpage() below.
BASE_URL = (
    "http://search.jd.com/search?keyword=%e7%83%ad%e6%b0%b4%e5%99%a8"
    "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2"
    "&wq=%e7%83%ad%e6%b0%b4%e5%99%a8"
    "&ev=exbrand_%e6%b5%b7%e5%b0%94%ef%bc%88haier%ef%bc%89%5e"
    "&stock=1&page="
)

# The file that stores the downloaded pages.
PAGES_FILE = "pages.html"


def gethtmltext(url):
    """Fetch *url* and return its decoded text, or "" on any failure.

    r.apparent_encoding is used because JD pages may default to GBK;
    decoding with the detected encoding avoids mojibake when parsing.
    """
    try:
        # fixed: the module is 'requests' (lowercase), not 'Requests'
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text  # fixed: original had the syntax error 'retur n R.text'
    except requests.RequestException:  # narrowed from a bare 'except:'
        return ""


def getpage(n):
    """Download the first *n* search-result pages and write them to
    PAGES_FILE (UTF-8), sleeping 0.5 s between requests to be polite."""
    # 'with' guarantees the file is closed even if a request raises.
    # The original's fo.seek(2) (commented "point to end of file") did NOT
    # seek to EOF — that would be seek(0, 2) — and was a no-op right before
    # close, so it is dropped.
    with open(PAGES_FILE, "w", encoding="utf-8") as fo:
        for i in range(1, 2 * n + 1, 2):  # JD result pages: 1, 3, 5, ...
            time.sleep(0.5)  # throttle so the site is not hammered
            fo.write(gethtmltext(BASE_URL + str(i)))


if __name__ == "__main__":
    # NOTE(review): the original call's argument was garbled ('getpage (+)');
    # 3 pages is assumed here — confirm against the original intent.
    getpage(3)
    with open(PAGES_FILE, "rt", encoding="utf-8") as fo:
        html = fo.read()
    soup = BeautifulSoup(html, "html.parser")
    print(soup.find_all("a"))  # fixed: method is find_all, not Find_all
Crawls a shopping site (JD.com) for one category of items and stores multiple result pages in a local file for later parsing.