# coding=utf-8
# __author__ = Christopher
from urllib import request, parse
from bs4 import BeautifulSoup
import re
import os
# from time import strftime


class Spider:
    def __init__(self):
        self.header = {}
        self.url = 'http://www.baidu.com/s?wd='
        self.page = 0
        self.word = 'inurl:action'  # set the default search term here
        # matches http/https/ftp URLs so real addresses can be pulled out of a page
        self.hrefre = re.compile(
            r'(http|ftp|https)://[\w\-_]+(\.[\w\-_]+)+([\w\-.,@?^=%&:/~+#]*[\w\-@?^=%&/~+#])')

    def encodingwd(self, wd):
        # percent-encode the search term for use in the query string
        return parse.quote(wd)

    def struct_url(self, search_word, page=0):
        # build the Baidu search URL; an empty search_word keeps the default term
        url = self.url
        if search_word != '':
            self.word = search_word
        if page != 0:
            # Baidu's pn parameter counts results, so page k maps to pn=k0
            url = url + self.encodingwd(self.word) + '&pn=' + str(page) + '0'
        else:
            url = url + self.encodingwd(self.word)
        return url

    def spider(self, page, search_word):
        # fetch one result page and collect the redirect link shown for each hit
        self.header['User-Agent'] = ('Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) '
                                     'Gecko/20100101 Firefox/47.1')
        auth_url_lis = []
        baidu_search_url = self.struct_url(search_word, page)
        req = request.Request(baidu_search_url, headers=self.header)
        rsp = request.urlopen(req)
        html = rsp.read()
        soup = BeautifulSoup(html, 'html.parser')
        for url in soup.find_all('a', {'class': 'c-showurl'}):
            auth_url_lis.append(url.get('href'))
        # time.sleep(8)
        return auth_url_lis

    def extract_href(self, waitforre):
        # pull hrefs out of raw HTML with the URL regex and prefix the Baidu base URL
        hrefre = self.hrefre
        result = []
        base_url = 'http://www.baidu.com/'
        for i in re.findall(hrefre, waitforre):
            result.append(base_url + i[2])
        return result
    def auth_url(self, crypt_url):
        # follow a Baidu redirect link and record the resolved (real) URL
        result = []
        try:
            req = request.Request(crypt_url, headers=self.header)
            url = request.urlopen(req)
            result.append(url.geturl())
        except Exception:
            print("[*]can't get auth_url! %s" % str(crypt_url))
        return result


def main():
    # write results to auth_url.txt in the current working directory
    path = os.path.join(os.getcwd(), 'auth_url.txt')
    spi_ob = Spider()
    file = open(path, 'a')
    print('[#]version 0.3\n[#]__author__=christopherlam\n[#]qq:770304694')
    search_word = str(input('[*]Please enter search keywords (may be left blank): '))
    subscript_page = int(input('[*]Please enter the first page number (0 is the first page): '))
    superscript_page = int(input('[*]Please enter the last page number: '))
    print('[*]spider is running ...')
    for k in range(subscript_page, superscript_page):
        auth_url_lis = spi_ob.spider(page=k, search_word=search_word)
        while auth_url_lis:
            url_result = spi_ob.auth_url(auth_url_lis.pop())
            while url_result:
                file.write(url_result.pop() + '\n')
    file.close()
    print('[*]success. Quit ...')


if __name__ == '__main__':
    main()