Back at it this evening, continuing to learn web crawlers. Keep in mind that many websites can't be scraped by a typical beginner, so let's start with a simple one — read on:
import urllib.request

# bs4 and lxml are third-party packages; install them into the interpreter
# first (in PyCharm: Settings -> Project Interpreter -> add package).
from bs4 import BeautifulSoup
import lxml  # parser backend used by BeautifulSoup(html, "lxml")
def gethtml(url, headers):
    """Fetch *url* with the given request *headers*.

    :param url: absolute URL to request.
    :param headers: dict of HTTP request headers (e.g. a User-Agent).
    :return: the raw response body as bytes (not decoded).
    """
    req = urllib.request.Request(url=url, headers=headers)
    # `with` closes the connection when done (the original leaked it).
    with urllib.request.urlopen(req) as res:
        html = res.read()
    return html
def savetxt(path, html):
    """Write the raw *html* bytes to *path*.

    Binary mode ('wb') because gethtml() returns undecoded bytes; `with`
    guarantees the file is closed (the original never closed the handle).
    """
    with open(path, 'wb') as f:
        f.write(html)
def prasehtml(currenturl, headers, path):
    """Crawl chapter pages starting at *currenturl*, saving each chapter's
    title and body text to ``path\\<chapter>.txt``, following the
    "next chapter" link until it disappears or the chapter cap is hit.

    :param currenturl: URL of the first chapter page.
    :param headers: request-header dict passed through to gethtml().
    :param path: target directory (no trailing backslash) for the .txt files.
    """
    chapter = 0
    flag = 1
    while flag:
        chapter = chapter + 1
        # Cap how many chapters are downloaded so the disk doesn't fill up.
        # NOTE(review): the numeric limit was lost in the garbled source
        # ("If chapter >=:"); 10 is a placeholder — adjust as needed.
        if chapter >= 10:
            flag = 0  # stop after this iteration
        html = gethtml(currenturl, headers)
        savepath = path + "\\" + str(chapter) + ".txt"
        # "lxml" names the parser, not the markup type (easy to get wrong).
        soup = BeautifulSoup(html, "lxml")
        nametext = soup.find('h3', attrs={'class': 'j_chapterName'})
        contenttext = soup.find('div', attrs={'class': 'read-content j_readContent'})
        result = nametext.get_text() + '\n' + contenttext.get_text()
        # Replace the full-width indent space with a newline so paragraphs
        # stay separated. NOTE(review): the character being replaced was
        # garbled in the source; \u3000 is the best guess — confirm.
        result = result.replace('\u3000', '\n')
        # Open the output once with an explicit encoding and close it via
        # `with` (the original opened the same file twice and leaked both).
        with open(savepath, "w", encoding="utf-8") as f:
            f.write(result)
        nextpage = soup.find('a', attrs={'id': 'j_chapterNext'})
        # Original tested the builtin `next` here ("If Next:") — a bug;
        # we must test whether the link element was actually found.
        if nextpage:
            # The site's hrefs are protocol-relative ("//..."), so prepend
            # the scheme.
            currenturl = "http:" + nextpage['href']
        else:
            currenturl = None
            flag = 0
def main():
    """Entry point: crawl the novel starting at its first chapter into D:\\novel."""
    url = "https://www.readnovel.com/chapter/22160402000540402/107513768840595159"
    # A desktop-browser User-Agent so the site serves the normal page.
    # You can copy your own from the browser (F12 -> Network -> refresh).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
    # NOTE(review): the target directory must already exist.
    path = "D:\\novel"
    prasehtml(url, headers, path)
# Only run the crawler when executed as a script, not when imported.
if __name__ == "__main__":
    main()
That's a Python beginner's take on a simple novel crawler.