"Baidu URL Collector"

Source: Internet
Author: User
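
The script below queries Baidu for a search keyword (the default is the dork inurl:action), collects the result links shown on each page (the <a class="c-showurl"> anchors), follows Baidu's redirect to recover the real target URL, and appends the collected URLs to auth_url.txt in the current working directory.
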
# coding=utf-8
# __author__ = Christopher
from urllib import request, parse
from bs4 import BeautifulSoup
import re
import os
# from time import strftime


class Spider:
    def __init__(self):
        self.header = {}
        self.url = 'http://www.baidu.com/s?wd='
        self.page = 0
        self.word = 'inurl:action'  # set the default search keyword (dork) here
        self.hrefre = re.compile(
            r'(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])')

    def encodingwd(self, wd):
        # URL-encode the keyword so it can be appended to the query string
        return parse.quote(wd)

    def struct_url(self, search_word, page=0):
        # Build the Baidu search URL for the given keyword and page index
        url = self.url
        if search_word != '':
            self.word = search_word
        if page != 0:
            # Baidu paginates with &pn=<offset>; appending '0' turns the page
            # index into an offset of 10 results per page
            url = url + self.encodingwd(self.word) + '&pn=' + str(page) + '0'
        else:
            url = url + self.encodingwd(self.word)
        return url

    def spider(self, page, search_word):
        self.header['User-Agent'] = ('Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) '
                                     'Gecko/20100101 Firefox/47.1')
        auth_url_lis = []
        baidu_search_url = self.struct_url(search_word, page)
        req = request.Request(baidu_search_url, headers=self.header)
        rsp = request.urlopen(req)
        html = rsp.read()
        soup = BeautifulSoup(html, 'html.parser')
        # Baidu wraps each displayed result link in an <a class="c-showurl"> tag
        for url in soup.find_all('a', {'class': 'c-showurl'}):
            auth_url_lis.append(url.get('href'))
        # time.sleep(8)
        return auth_url_lis

    def extract_href(self, waitforre):
        # Extract raw links from page text with the URL regex (not used by main)
        result = []
        base_url = 'http://www.baidu.com/'
        for i in re.findall(self.hrefre, waitforre):
            result.append(base_url + i[2])
        return result

    def auth_url(self, crypt_url):
        # Follow Baidu's redirect link to recover the real target URL
        result = []
        try:
            req = request.Request(crypt_url, headers=self.header)
            url = request.urlopen(req)
            result.append(url.geturl())
        except Exception:
            print("[*]can't get auth_url! %s" % str(crypt_url))
        return result


def main():
    path = os.path.join(os.getcwd(), 'auth_url.txt')  # output file in the current directory
    spi_ob = Spider()
    file = open(path, 'a')
    print('[#]version 0.3\n[#]__author__=ChristopherLam\n[#]qq:770304694', end='\n')
    search_word = input('[*]Please enter the search keyword (leave empty for the default): ')
    subscript_page = int(input('[*]Please enter the lower page number (0 is the first page): '))
    superscript_page = int(input('[*]Please enter the upper page number: '))
    print('[*]Spider is running ...')
    for k in range(subscript_page, superscript_page):
        auth_url_lis = spi_ob.spider(page=k, search_word=search_word)
        while auth_url_lis:
            url_result = spi_ob.auth_url(auth_url_lis.pop())
            while url_result:
                file.write(url_result.pop() + '\n')
    file.close()
    print('[*]Success. Quit ...')


if __name__ == '__main__':
    main()
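
If you want to reuse the Spider class from another script instead of the interactive prompts in main(), a minimal sketch along the lines below could work; the keyword 'inurl:login.action' and the file name 'first_page_urls.txt' are illustrative assumptions, not values from the original script.

# Illustrative, non-interactive use of the Spider class defined above.
# The keyword and output file name are example values only.
def collect_first_page():
    spi = Spider()
    links = spi.spider(page=0, search_word='inurl:login.action')
    with open('first_page_urls.txt', 'a') as f:
        for link in links:
            for real_url in spi.auth_url(link):  # resolve the Baidu redirect
                f.write(real_url + '\n')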

