import re
import time

import pymongo
import requests
from lxml import etree
# MongoDB handles: scraped records are written to the `musictop` collection
# of the `mydb` database on the server at localhost:27017.
client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
musictop = mydb['musictop']

# Browser-like User-Agent header sent with every HTTP request made below.
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; '
                  'Win64; x64; Trident/5.0)',
}
def get_url_music(url):
    """Fetch one chart listing page and scrape every album it links to.

    Args:
        url: Address of a listing page of the chart.

    Every ``href`` found on an ``<a class="nbg">`` anchor of the page is
    handed to ``get_music_info``. Network errors raised by ``requests``
    propagate to the caller.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    html = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(html.text)
    # NOTE(review): XPath string comparison is case-sensitive; the class
    # value is written in lowercase here - confirm against the live markup.
    music_hrefs = selector.xpath('//a[@class="nbg"]/@href')
    for music_href in music_hrefs:
        get_music_info(music_href)
def _first_or_default(values, default='Unknown'):
    """Return the first entry of *values* with surrounding whitespace removed.

    Falls back to *default* when *values* is empty, so a field that is absent
    from the page does not abort the scrape with an ``IndexError``.
    """
    return values[0].strip() if values else default


def get_music_info(url):
    """Scrape one album detail page and store the result in MongoDB.

    Args:
        url: Address of the album detail page.

    The title and score are read with XPath; performer, genre, release time
    and publisher are pulled out of the raw HTML with regular expressions.
    Any of those fields that cannot be found is recorded as ``'Unknown'``.
    A page without a title is reported and skipped instead of being stored.
    The record is printed and then inserted into the ``musictop`` collection.
    Network errors raised by ``requests`` propagate to the caller.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    html = requests.get(url, headers=headers, timeout=10)
    page = html.text
    selector = etree.HTML(page)

    names = selector.xpath('//*[@id="wrapper"]/h1/span/text()')
    if not names:
        # Without a title this is not a usable album page (e.g. an error or
        # block page), so storing a record made only of defaults is pointless.
        print('Skipping {}: no album title found (HTTP {})'.format(
            url, html.status_code))
        return
    name = names[0]

    # NOTE(review): the field labels in these patterns are English in this
    # copy of the script; they must match the label text that the scraped
    # page actually uses - confirm against the live HTML.
    author = _first_or_default(
        re.findall(r'Performer:.*?>(.*?)</a>', page, re.S))
    style = _first_or_default(
        re.findall(r'<span class="pl">\s*Genre:</span>(.*?)<br\s*/?>', page, re.S))
    # Not called `time`, which would shadow the imported `time` module.
    release_time = _first_or_default(
        re.findall(r'Release:</span>(.*?)<br\s*/?>', page, re.S))
    publisher = _first_or_default(
        re.findall(r'publisher:.*?>(.*?)</a>', page, re.S))
    # NOTE(review): element id written in lowercase - confirm against the
    # live markup, since XPath string comparison is case-sensitive.
    score = _first_or_default(
        selector.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()'))

    print(name, author, style, release_time, publisher, score)
    info = {
        'name': name,
        'author': author,
        'style': style,
        'time': release_time,
        'publisher': publisher,
        'score': score,
    }
    musictop.insert_one(info)
if __name__ == '__main__':
    # The chart is paged through the `start` query parameter; request the
    # offsets 0, 25, ..., 225.
    urls = ['https://music.douban.com/top250?start={}'.format(i)
            for i in range(0, 250, 25)]
    for url in urls:
        get_url_music(url)
        # Pause between listing pages to avoid hammering the server.
        time.sleep(2)
# Known issue: the database installation repeatedly failed, so this code could not be debugged.
# Purpose: crawl the Douban Music Top 250 chart and store each album's details (including release time and publisher) in a MongoDB database.