# Getting started with Python crawlers: download a novel from
# www.jingcaiyuedu.com chapter by chapter and save it into "<title>.txt".
#
# Steps:
#   1. Download the novel's homepage (the chapter-list page).
#   2. Extract every chapter's URL and title from the list.
#   3. Download each chapter page, strip markup, and append it to the file.
import re

import requests

# 1. Download the novel homepage.
novel_url = 'http://www.jingcaiyuedu.com/book/15205/list.html'
response = requests.get(novel_url)
# Explicitly set the character encoding so .text decodes correctly.
response.encoding = 'utf-8'
html = response.text  # decoded page source as a string

# 2. Extract the chapter URLs (non-greedy matches).
# The novel title lives in the page's "keywords" meta tag.
title = re.findall(r'<meta name="keywords" content="(.*?)"', html)[0]

# The page contains two <dl id="list"> blocks; index [1] takes the second,
# which holds the full chapter list.  re.S lets ".*?" span newlines in the
# raw HTML (without it the pattern cannot match a multi-line block).
dl = re.findall(r'<dl id="list">.*?</dl>', html, re.S)[1]

# Each chapter is an anchor tag: capture its (possibly relative) href
# and its visible title text.
chapter_info_list = re.findall(r'<a.*?href="(.*?)".*?>(.*?)</a>', dl)

# Data persistence: write everything into "<title>.txt".
# "with" guarantees the handle is closed even if a download raises —
# the original script opened the file and never closed it.
with open('%s.txt' % title, 'w', encoding='utf-8') as fb:
    # 3. Iterate through each chapter and extract its content.
    for chapter_url, chapter_title in chapter_info_list:
        # Handle relative URLs by prefixing the site root.
        if 'http' not in chapter_url:
            chapter_url = 'http://www.jingcaiyuedu.com%s' % chapter_url

        # Download the chapter page.
        chapter_response = requests.get(chapter_url)
        chapter_response.encoding = 'utf-8'
        chapter_html = chapter_response.text

        # The chapter body sits between the a1()/a2() <script> markers;
        # re.S again because the body spans many lines.
        chapter_content = re.findall(
            r'<script>a1\(\);</script>(.*?)<script>a2\(\);</script>',
            chapter_html, re.S)[0]

        # Clean up the data: remove layout entities and line-break tags.
        # NOTE(review): the first replacement target was garbled in the
        # source; it is presumably the '&nbsp;' HTML entity (rendered as a
        # space by the extraction) — confirm against the live page markup.
        chapter_content = chapter_content.replace('&nbsp;', '')
        chapter_content = chapter_content.replace('<br/>', '')
        chapter_content = chapter_content.replace('<br>', '')
        chapter_content = chapter_content.replace(' ', '')

        # Write the chapter title and content, one per line.
        fb.write(chapter_title)
        fb.write('\n')
        fb.write(chapter_content)
        fb.write('\n')
        # Progress indicator.
        print(chapter_url)
# Getting Started with Python crawlers