import re
import time
import urllib.request
from bs4 import BeautifulSoup

# Keep the listing-page URL in its own variable so the outer loop can
# re-fetch it; `url` is overwritten with chapter URLs further down.
starturl = input("first page URL:")


def gethtml(url):
    # Fetch a page and return it as a parsed BeautifulSoup object.
    page = urllib.request.urlopen(url)
    html = page.read().decode('utf-8')           # html is the page source as a string
    soup = BeautifulSoup(html, 'html.parser')
    return soup


def getbookurl(soup):
    # Collect the first-chapter link of every book listed on the page.
    firsturl2 = []
    bookurl = soup.find_all("h4")
    # The capture pattern was partially lost in the original listing; this reconstruction
    # pulls the protocol-relative book URL out of each <h4> block.
    bookurl1 = re.findall(r'href="(.*?)" target="_blank"', str(bookurl))
    for i in range(len(bookurl1)):
        bookurl = "http:" + bookurl1[i]
        soup1 = gethtml(bookurl)                  # open each book page
        time.sleep(0.2)
        firsturl = soup1.find_all("a", {"class": "red-btn J-getJumpUrl"})
        firsturl1 = re.findall(
            r'data-firstchapterjumpurl=".*?" href="(.*?)" id="readBtn">', str(firsturl))
        if not firsturl1 or firsturl1[0] == '':   # guard against books with no readable link
            continue
        firsturl2.append(firsturl1[0])
    return firsturl2


def getcontent(soup, load):
    # Extract the chapter body and append it to the text file at `load`.
    content = soup.find_all("div", {"class": "read-content j_readContent"})
    content1 = re.compile(r'<p>([\s\S]*?)</p>')
    content2 = content1.findall(str(content))
    content3 = re.sub(r'</?\w+[^>]*>', '', content2[0])   # strip leftover tags
    content4 = content3.replace('。', '。\n\n')            # new paragraph after each full stop

    contentname = re.compile(r'')   # chapter-title pattern missing in the original listing
    contentname1 = contentname.findall(str(soup))           # get the chapter name

    book = "-" * 60 + contentname1[0] + "-" * 60 + "\n\n\n" + content4
    with open(load, 'a') as f:
        f.write(book)


def nextcontent(soup):
    # Return the absolute URL behind the "next chapter" button.
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    step = re.compile(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">')
    content1 = step.findall(str(content))
    if content1 == []:
        # The last free chapter carries a different data-eid on its "next" link.
        step1 = re.compile(r'<a data-eid="qd_R118" href="(.*?)" id="j_chapterNext">')
        content2 = step1.findall(str(content))
        return "http:" + content2[0]
    else:
        return "http:" + content1[0]


def panduan(soup):
    # "panduan" = judge: return the matches so the caller can tell whether
    # another free chapter follows the current one.
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    step = re.compile(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">')
    content1 = step.findall(str(content))
    return content1

# -------------------------------------------------------------------------

while 1 == 1:
    soup2 = gethtml(starturl)
    firsturl2 = getbookurl(soup2)

    for j in range(len(firsturl2)):
        url = "http:" + firsturl2[j]
        soup1 = gethtml(url)
        bookname = re.findall(r'', str(soup1))   # book-title pattern missing in the original listing
        load = "D:/88/%s.txt" % bookname[0]
        i = 0
        while 1 == 1:
            soup = gethtml(url)
            getcontent(soup, load)
            url = nextcontent(soup)
            content1 = panduan(soup)
            i += 1
            print("Chapter %d download complete" % i)
            if content1 == []:
                break
            time.sleep(0.2)
        print("------------- Book %d download complete ---------" % int(j + 1))
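The script calls urllib.request.urlopen on every page with no request headers and no timeout, which a busy site like Qidian will often throttle or reject. Below is a minimal sketch of a hardened fetch helper; the gethtml_safe name, the User-Agent string, and the retry count are illustrative assumptions, not part of the original script. It could be swapped in wherever gethtml is called.

import time
import urllib.error
import urllib.request
from bs4 import BeautifulSoup

def gethtml_safe(url, retries=3, delay=1.0):
    # Fetch `url` with a browser-like User-Agent, a timeout, and a few retries.
    req = urllib.request.Request(
        url,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
    )
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(req, timeout=10) as page:
                html = page.read().decode('utf-8')
            return BeautifulSoup(html, 'html.parser')
        except (urllib.error.URLError, UnicodeDecodeError):
            if attempt == retries - 1:
                raise          # give up after the last attempt
            time.sleep(delay)  # back off briefly before retrying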
Learning!!!
Python: crawling all the novels on one Qidian page