Then I wrote a tool that uses BeautifulSoup to scrape the site, which made me realize how powerful BeautifulSoup is.
Starting from a novel's index page, it fetches every chapter of the novel and merges them into a single complete file. It is not smart, though: the code needs corresponding changes for each different site.
#!/usr/bin/env python
"""Download a complete web novel chapter-by-chapter and merge it into one file.

Usage: python <script> <novel-index-url>

One worker thread is started per chapter (capped at 20 concurrent threads);
each chapter is written to ./tmp/<n> and the fragments are merged at the end.
The scraping selectors used here (``div.booktext`` for the index page,
``#ReadText`` for the chapter body) are site-specific and must be adapted
for other sites.
"""
import os
import sys
import re
import time
import chardet
import urllib.request as ur
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from threading import Thread


class Download(Thread):
    """Worker thread that downloads a single chapter into its own temp file."""

    def __init__(self, filepath, info):
        Thread.__init__(self)
        self.filepath = filepath
        # info is a (chapter-url, chapter-title) pair produced by index()
        (self.link, self.chapter) = info

    def run(self):
        print('To start the download:' + self.chapter)
        section(self.filepath, self.chapter, self.link)
        print('complete the Download:' + self.chapter)


def getData(url):
    """Fetch *url* and decode it using the chardet-detected encoding.

    NOTE: kept from the original author but unused -- BeautifulSoup's own
    encoding detection made this helper redundant.
    """
    response = ur.urlopen(url, timeout=10)
    html = response.read()
    charsets = chardet.detect(html)['encoding']
    return html.decode(charsets)


def merge(tmpfiles, targetfile):
    """Append each chapter fragment to *targetfile*, deleting fragments as it goes."""
    with open(targetfile, 'a+') as wfile:
        for tmpfile in tmpfiles:
            # fix: the original leaked the source file handle (open().read())
            with open(tmpfile, 'r') as rfile:
                wfile.write(rfile.read())
            os.remove(tmpfile)


def content(link):
    """Return the novel text of one chapter page (site-specific selector)."""
    html = ur.urlopen(link, timeout=10)
    # explicit parser avoids bs4's "no parser specified" warning
    soup = BeautifulSoup(html, 'html.parser')
    # BeautifulSoup converts &nbsp; to a space and <br/> to a special
    # character; turn the spaces back into line breaks.
    return soup.find(id='ReadText').p.span.text.replace(' ', '\n')


def section(filepath, chapter, link):
    """Download one chapter into *filepath*, retrying until it succeeds."""
    while True:  # best-effort retry: network hiccups just trigger a re-request
        try:
            with open(filepath, 'w') as nfile:
                nfile.write(chapter + '\n' + content(link) + '\n')
            break
        except Exception:
            pass


def index(url):
    """Scrape the index page.

    Returns a ``(title, entries)`` tuple where *entries* is a list of
    ``(chapter_url, chapter_title)`` pairs.  Returning the title fixes the
    original version, which assigned ``title`` to a local variable here but
    read it from novel() -- a NameError.
    """
    while True:  # retry the request until it succeeds
        try:
            html = ur.urlopen(url, timeout=10)
            # BeautifulSoup can auto-detect encodings but identifies GBK
            # pages as gb2312, which may lose part of the page, so force it.
            soup = BeautifulSoup(html, 'html.parser', from_encoding='GBK')
            break
        except Exception:
            pass
    indexdiv = soup.find(name='div', attrs={'class': 'booktext'})
    title = indexdiv.text
    entries = []
    # skip the first <ul>: chapter lists start at the second one on this site
    for ul in indexdiv.find_all('ul')[1:]:
        anchors = [li.a for li in ul.find_all('li') if li]
        entries += [(urljoin(url, a.get('href')), a.text) for a in anchors if a]
    return title, entries


def novel(url):
    """Download every chapter of the novel indexed at *url*, then merge them."""
    tmpfiles = []
    tasks = []
    try:
        title, indexs = index(url)
        tmpdir = os.path.join(os.getcwd(), 'tmp')
        if not os.path.exists(tmpdir):  # temp dir holding chapter fragments
            os.mkdir(tmpdir)
        for i, info in enumerate(indexs):
            tmpfile = os.path.join(tmpdir, str(i))
            tmpfiles.append(tmpfile)
            task = Download(tmpfile, info)  # download the chapter on a new thread
            task.daemon = True  # fix: setDaemon() is deprecated/removed in Py3
            task.start()
            tasks.append(task)
            # keep at most 20 live threads; too many can crash the program
            if len(tasks) >= 20:
                while any(t.is_alive() for t in tasks):  # fix: isAlive() removed
                    done = i + 1 - len([t for t in tasks if t.is_alive()])
                    print('progress: {}/{}'.format(done, len(indexs)))
                    time.sleep(2)
                tasks = []
        # wait for the final, possibly partial, batch of threads
        while any(t.is_alive() for t in tasks):
            done = len(indexs) - len([t for t in tasks if t.is_alive()])
            print('progress: {}/{}'.format(done, len(indexs)))
            time.sleep(2)
        print('progress: {}/{}'.format(len(indexs), len(indexs)))
        print('Start Integration ...')
        merge(tmpfiles, os.path.join(os.getcwd(), title + '.txt'))
        print('Download Successful! ')
    except Exception as ex:
        print(ex)
        print('Download failed! ')
        sys.exit()


def main(argv):
    """Entry point: ``argv[0]`` is the novel index URL.

    If the download is interrupted with Ctrl-C, the chapters fetched so far
    are still merged into an "incomplete" document.
    """
    try:
        novel(argv[0])
    except KeyboardInterrupt:
        tmpdir = os.path.join(os.getcwd(), 'tmp')
        if not os.path.exists(tmpdir):
            print('Download failed! ')
            sys.exit()
        tmpfiles = [os.path.join(tmpdir, tfile)
                    for tfile in os.listdir(tmpdir)
                    if os.path.isfile(os.path.join(tmpdir, tfile))]
        print('start to integrate incomplete downloads ...')
        target = os.path.join(os.getcwd(), 'Incomplete document. txt')
        try:
            merge(tmpfiles, target)
            if os.path.exists(target):
                print('some chapters download successfully! ')
            else:
                print('Download failed! ')
        except Exception:
            print('Download failed! ')
            sys.exit()
        os.rmdir(tmpdir)
    # remove the (now empty) temp dir left over from a successful run
    tmpdir = os.path.join(os.getcwd(), 'tmp')
    if os.path.exists(tmpdir):
        os.rmdir(tmpdir)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        main(sys.argv[1:])
    # example index site: http://www.lueqiu.com/
Summary:
Python 3 code that uses BeautifulSoup 4 to scrape the full text of a novel from a website.