Python One-Day Training 05 — Auto-Clicker for CSDN Articles (python05)
Function: automatically obtain the list of articles on a CSDN blog and repeatedly request each one to add clicks (page views).
Source code
"""Fetch the list of article URLs from a CSDN blog homepage and "brush"
(repeatedly request) each article to inflate its view count.

NOTE(review): artificially inflating view counts likely violates CSDN's
terms of service; the behaviour is kept because it is the script's
stated purpose.
"""
import random
import re
import time
import urllib.error
import urllib.request

from bs4 import BeautifulSoup

# Relative links to individual articles: the author's id followed by an
# 8-character article id.  Raw string so backslash escapes can never
# mangle the pattern; the original had a stray space before the id.
ARTICLE_RE = re.compile(r'/a359680405/article/details/........')

URL_BASE = "http://blog.csdn.net"            # prefix to rebuild absolute URLs
HOME_URL = "http://blog.csdn.net/a359680405"  # blog homepage to scrape
BRUSH_MAX = 200                               # max refreshes per article


def collect_article_urls(opener):
    """Return the deduplicated list of absolute article URLs found on the homepage."""
    html = opener.open(HOME_URL).read().decode('utf-8')
    found = ARTICLE_RE.findall(html)
    print(found)
    # The same link appears several times on the page; set() deduplicates.
    return [URL_BASE + path for path in set(found)]


def brush_page(opener, page, times):
    """Request *page* *times* times, printing the article title on each hit.

    HTTP/URL errors are reported and followed by a short back-off rather
    than aborting the whole run (best-effort by design).
    """
    for j in range(times):
        try:
            content = opener.open(page).read().decode('utf-8')
            # Explicit parser avoids bs4's "no parser was explicitly
            # specified" warning and installation-dependent behaviour.
            soup = BeautifulSoup(content, 'html.parser')
            title = str(soup.title.string)
            title = title[0:title.find('-')]  # strip the " - CSDN" suffix
            print(str(j), title)
        except urllib.error.HTTPError:        # subclass of URLError: catch first
            print('urllib.error.HTTPError')
            time.sleep(1)                     # back off after an error
        except urllib.error.URLError:
            print('urllib.error.URLError')
            time.sleep(1)                     # back off after an error
        time.sleep(0.1)  # polite pause so the server does not reject us


def main():
    # build_opener() lets the script masquerade as a browser by sending
    # a user-agent header with each request.
    opener = urllib.request.build_opener()
    opener.addheaders = [('user-agent', 'mozilla/100')]

    pages = collect_article_urls(opener)
    print('the pages to be refreshed are:')
    for index, page in enumerate(pages):
        print(str(index), page)

    for page in pages:
        # Randomize the hit count per page so traffic looks less uniform.
        brush_page(opener, page, random.randint(0, BRUSH_MAX))


if __name__ == "__main__":
    main()
Zookeeper