# -*- coding: utf-8 -*-
"""Crawl the Douban "novel" tag listing and save the results to a text file.

For every listing page the crawler extracts each book's title, description
and (when present) rating from the page's ``<dd>`` elements and rewrites the
output file with everything collected so far.

The module runs unchanged on Python 2.7 and Python 3: ``urllib.request``
exposes the same names (``ProxyHandler``, ``build_opener``, ``Request``,
``urlopen``, ``URLError``) that this script uses from ``urllib2``.
"""
import io
from contextlib import closing

try:  # Python 2
    import urllib2
except ImportError:  # Python 3
    import urllib.request as urllib2

from bs4 import BeautifulSoup


class Dbxs(object):
    """Crawler for the Douban novel listing.

    Attributes:
        pageindex: value of the ``start`` query parameter for the next page.
        enable: loop flag; set to False when crawling should stop.
        file: the handle most recently used by ``writedata`` (closed once
            ``writedata`` returns), or None before the first write.
        content: accumulated ``[title, info]`` / ``[title, info, rating]``
            records for all pages fetched so far.
    """

    # Listing URL; the percent-encoded path segment is the tag name.
    BASE_URL = 'https://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book'
    # NOTE(review): only the 'http' scheme is mapped here while BASE_URL is
    # https, so this proxy may never be applied to the request - confirm.
    PROXY = {'http': '113.118.170.230:808'}
    # Request headers sent to make the request look like a browser.
    HEADERS = {'user-agent': 'mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0)'}
    OUTPUT_FILE = "Bdxs.txt"
    # Amount added to ``pageindex`` after each page.
    PAGE_STEP = 15

    def __init__(self):
        self.pageindex = 0
        self.enable = True
        self.file = None
        self.content = []

    def getpage(self, pageIndex):
        """Fetch one listing page and return its raw HTML.

        Args:
            pageIndex: value for the ``start`` query parameter.

        Returns:
            The response body as returned by ``read()``, or None when the
            request fails with ``URLError`` (the error code and/or reason
            are printed in that case).
        """
        enable_proxy = True
        proxy_handler = urllib2.ProxyHandler(self.PROXY)
        null_proxy_handler = urllib2.ProxyHandler({})
        handler = proxy_handler if enable_proxy else null_proxy_handler
        url = self.BASE_URL + "?start=" + str(pageIndex)

        try:
            urllib2.install_opener(urllib2.build_opener(handler))
            request = urllib2.Request(url, headers=self.HEADERS)
            # closing() releases the connection on both Python 2 and 3.
            with closing(urllib2.urlopen(request)) as response:
                return response.read()
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            return None

    def getcontent(self, pageIndex, content):
        """Parse one listing page and append its book records to ``content``.

        Args:
            pageIndex: value for the ``start`` query parameter.
            content: list that receives the records; it is modified in place.

        Returns:
            The same ``content`` list. ``self.enable`` is set to False when
            the page could not be fetched or contains no ``<dd>`` element.
        """
        pagecode = self.getpage(pageIndex)
        if pagecode is None:
            # The fetch failed (already reported by getpage); stop instead
            # of handing None to the parser.
            self.enable = False
            return content

        soup = BeautifulSoup(pagecode, 'html.parser')
        # The book information lives in the page's <dd> elements.
        entries = soup.find_all('dd')
        if not entries:
            # A page without <dd> elements means there is nothing left.
            print(u"All pages are finished loading")
            self.enable = False
            return content

        for item in entries:
            title_tag = item.find(class_='title')
            desc_tag = item.find(class_='desc')
            if title_tag is None or desc_tag is None:
                # Not a book entry; skip it rather than crash.
                continue
            title = title_tag.get_text()
            info = desc_tag.get_text().strip()

            # Some books have no rating; those get a two-element record.
            rate_tag = item.find(class_='rating_nums')
            rates = rate_tag.get_text() if rate_tag is not None else u""
            if rates:
                content.append([title, info, rates])
            else:
                content.append([title, info])

        return content

    def writedata(self, content):
        """Write every record in ``content`` to the output file.

        The file is truncated on each call, so ``content`` must hold all
        records collected so far, not only those of the latest page.

        Args:
            content: list of ``[title, info]`` or ``[title, info, rating]``
                text records.
        """
        # Open once, outside the loop, and let the context manager close it.
        with io.open(self.OUTPUT_FILE, "w", encoding="utf-8") as out:
            self.file = out
            for item in content:
                out.write(item[0] + u"\n")
                out.write(item[1] + u"\n")
                if len(item) == 3:
                    out.write(u"Score:" + item[2] + u"\n")
                out.write(u"========================================\n\n")

    def start(self):
        """Crawl page after page until ``self.enable`` becomes False."""
        x = 1
        while self.enable:
            content = self.getcontent(self.pageindex, self.content)
            if self.enable:
                print("writing page%s ..." % x)
                self.writedata(content)
                self.pageindex += self.PAGE_STEP
                x += 1


if __name__ == "__main__":
    dbxs = Dbxs()
    dbxs.start()
I do not yet fully understand every part of this code, for example how the next page is added to the file after each page of novels has been written. I will keep improving it later.
Python crawler: using BeautifulSoup to crawl Douban novels (Part 3) - writing the novel information to a file