Python crawler: scraping Qiushibaike ("Embarrassing Stories Encyclopedia")

I found a tutorial for this online, but it was written with regular expressions and would not run, so I switched the extraction to XPath, rewrote the logic, and ran it on a thread pool. At this point it is mostly my own code.
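To see why the switch to XPath matters, here is a toy comparison (my own illustration, not from the tutorial; the markup is made up):

import re
from lxml import etree

page = '<div class="author clearfix">someone</div>'

# A regex is welded to the exact byte layout of the markup;
# an extra attribute or changed whitespace breaks it silently.
author_re = re.search(r'<div class="author clearfix">(.*?)</div>', page).group(1)

# XPath queries the parsed tree instead, so it survives formatting
# changes as long as the class attribute stays the same.
author_xp = etree.HTML(page).xpath('//div[@class="author clearfix"]/text()')[0]

assert author_re == author_xp == 'someone'

The full script follows. It targets Python 2 (it relies on reload(sys) and sys.setdefaultencoding) and needs requests and lxml installed.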
#!/usr/bin/python
# -*- coding: utf-8 -*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import sys

# Force the default encoding to UTF-8 (Python 2 idiom)
reload(sys)
sys.setdefaultencoding('utf-8')

# Output function: append one scraped item to the shared file
def towrite(contentdict):
    f.writelines(u'Author: ' + contentdict['author'] + '\n')
    f.writelines(u'Content: ' + contentdict['content'] + '\n')
    f.writelines(u'Funny: ' + contentdict['vote'] + '\n')
    f.writelines(u'Comments: ' + contentdict['span'] + '\n')

def spider(url):
    # Fetch the page source
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    html = requests.get(url, headers=headers)
    # Parse the page and select every joke block
    selector = etree.HTML(html.text)
    content_field = selector.xpath('//*[@id="content-left"]/div[@class="article block untagged mb15"]')
    item = {}
    for i in range(len(content_field)):
        # Author
        author_f = content_field[i].xpath('div[@class="author clearfix"]')[0]
        author = author_f.xpath('string(.)').replace('\n', '').replace(' ', '')
        # Content
        content_f = content_field[i].xpath('div[@class="content"]/text()')
        content = ''
        for n in range(len(content_f)):
            content_temp = content_f[n].replace('\n', '').replace(' ', '').replace('\t', '')
            content += str(content_temp)
        # Funny (vote count)
        vote = ''
        vote_temp = content_field[i].xpath('div[@class="stats"]/span[@class="stats-vote"]/i/text()')[0]
        vote += str(vote_temp)
        # Comment count; when a post has no comments the <i> node is absent
        span = ''
        span_temp_l = content_field[i].xpath('div[@class="stats"]/span[@class="stats-comments"]/a/i/text()')
        if len(span_temp_l) > 0:
            span_temp = span_temp_l[0]
        else:
            span_temp = '0'
        span += str(span_temp)
        item['author'] = author
        item['content'] = content
        item['vote'] = vote
        item['span'] = span
        towrite(item)

if __name__ == '__main__':
    pool = ThreadPool(4)
    f = open('content.txt', 'a')
    url = []
    for i in range(1, 36):
        newpage = 'http://www.qiushibaike.com/hot/page/' + str(i)
        url.append(newpage)
    results = pool.map(spider, url)
    pool.close()
    pool.join()
    f.close()
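One caveat about the thread pool: all four worker threads call towrite() and append to the same shared file object f, so lines belonging to different pages can interleave in content.txt. If that matters, guarding the writes with a lock is enough; a minimal sketch (the lock is my addition, not part of the original script):

import threading

write_lock = threading.Lock()

def towrite(contentdict):
    # Serialize writes so one item's four lines stay together
    with write_lock:
        f.writelines(u'Author: ' + contentdict['author'] + '\n')
        f.writelines(u'Content: ' + contentdict['content'] + '\n')
        f.writelines(u'Funny: ' + contentdict['vote'] + '\n')
        f.writelines(u'Comments: ' + contentdict['span'] + '\n')

Dropping this in as a replacement for the towrite() above leaves the rest of the script unchanged.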