Crawling Baidu Cloud resources with Python
import urllib.request
import re
import random


def get_source(key):
    print('Please wait, crawling....')

    headers = [{'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36'},
               {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0'},
               {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/7.0)'}]

    header = random.choice(headers)  # randomly select a header

    keyword = key.encode('utf-8')
    keyword = urllib.request.quote(keyword)  # URL-encode the keyword

    # The key step: run a search for some resource by hand, then analyze the URL of the results page
    url = 'http://www.wangpansou.cn/s.php?wp=0&ty=gn&op=gn&q=' + keyword + '&q=' + keyword
    req = urllib.request.Request(url, headers=header)

    html = urllib.request.urlopen(req)

    # encoding type, taken from the Content-Type header
    head_type = html.headers['Content-Type'].split('=')[-1]

    status = html.getcode()  # get the status code; continue only if the request succeeded
    if status == 200:
        html = html.read()
        html = html.decode(head_type)  # decode according to the site's declared encoding

        # regular-expression match for the pagination links
        pattern = re.compile(r'<a href="(.+)"><div class="cse-search-result_paging_num" tabindex="\d{1,3}">\d{1,3}</div></a>')
        content = pattern.findall(html)

        url_list = []
        url_head = 'http://www.wangpansou.cn/'
        for i in content:
            i = url_head + i  # the match captures only part of the URL, so prepend the host to get a complete link
            if i not in url_list:  # deduplicate; the page really does list some links twice
                url_list.append(i)  # collect the URLs of all pages of search results

        count = 1  # counter
        for each_url in url_list:
            header = random.choice(headers)  # pick a random 'head' for each request so the server does not block us
            request1 = urllib.request.Request(each_url, headers=header)
            html2 = urllib.request.urlopen(request1)

            status = html2.getcode()  # get the status code
            if status == 200:
                html2 = html2.read()
                html2 = html2.decode(head_type)

                pattern1 = re.compile(r'<a class=".+" href="(.+)" rel.+')
                content1 = pattern1.findall(html2)

                pattern2 = re.compile(r'<div id=".+" class="cse-search-result_content_item_mid">\s+(.+)')
                content2 = pattern2.findall(html2)

                for i in range(0, len(content2)):
                    print(str(count) + ': ' + content2[i] + '\n' + content1[i])
                    print()
                    count += 1

        print('%d resources were found in total; crawling finished!' % (count - 1))  # count ends one past the last resource


if __name__ == '__main__':
    get_source(input('Enter the name of the resource to search: '))
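The script above quotes the keyword and parses the charset by hand, and a failed request raises an uncaught exception. Below is a minimal hardened sketch of the same rotating-header fetch idea; the helper name fetch, the 10-second timeout, and the UTF-8 fallback are illustrative choices of mine, not part of the original script.

import random
import urllib.error
import urllib.parse
import urllib.request

HEADERS = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/7.0)'},
]

def fetch(url):
    # Rotate the User-Agent on every request, as the script above does.
    req = urllib.request.Request(url, headers=random.choice(HEADERS))
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            # Prefer the charset declared in Content-Type; fall back to UTF-8.
            charset = resp.headers.get_content_charset() or 'utf-8'
            return resp.read().decode(charset, errors='replace')
    except urllib.error.URLError as err:
        print('request failed:', err)
        return None

# urlencode handles the quoting that the script does by hand; a list of
# tuples reproduces the duplicated q parameter from the original URL.
kw = 'some resource'
query = urllib.parse.urlencode([('wp', 0), ('ty', 'gn'), ('op', 'gn'), ('q', kw), ('q', kw)])
page = fetch('http://www.wangpansou.cn/s.php?' + query)

Returning None on failure lets the caller skip a bad page and keep crawling the remaining result pages instead of dying mid-run.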
"""
Note:
A. This is really a secondary search: it goes through a network-disk search site rather than querying Baidu Cloud directly. If you are looking for resources, you could also just search that site yourself, page by page.
The only advantage of this script is that it fetches all the result pages in one run, instead of clicking through them one page at a time.
B. The code is ugly, but it records the learning process: make it work first, then worry about making it clean.
"""