Scraping website information with Python


Tags: Python crawler learning

Two small examples follow. The first pulls the song list from the Kugou homepage with urllib2 and BeautifulSoup; the second, shoufu(), collects and filters every link on the Sohu homepage with requests, then saves any page that mentions "籃球" (basketball). Both are Python 2 scripts.

# coding: utf-8
import urllib2
import os
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 hack so unicode writes don't raise UnicodeEncodeError
from bs4 import BeautifulSoup

heads = {}
heads['User-Agent'] = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36')
request = urllib2.Request("http://www.kugou.com", headers=heads)  # build a GET request for the Kugou homepage
result = urllib2.urlopen(request)  # send the request
soup = BeautifulSoup(result.read(), 'html.parser')  # parse the response into a searchable tree

s = []  # will hold the li tags of the song list
for i in soup.find_all("div"):  # walk every div tag
    if i.get("id") == "SongtabContent":  # keep only the div whose id is SongtabContent
        s = i.find_all("li")  # collect all li tags inside it

with open(u"C:/downloads/lw/a.txt", "w") as f:  # open the output file
    for i in s:  # one li per song
        f.write(u"Song name: %s " % i.a.select(".songName")[0].text)  # text of the .songName element
        f.write(u"Song link: %s " % i.a.get("href"))  # href attribute of the anchor
        f.write(u"Song duration: %s" % i.a.select(".songTime")[0].text)  # text of the .songTime element
        f.write(os.linesep)

def shoufu():
    import requests
    import re
    resq = requests.get("http://www.sohu.com")  # request the Sohu homepage
    print resq.text[:100]  # print the first 100 characters of the response
    links = re.findall(r'href="(.*?)"', resq.text)  # extract every href value
    print len(links)
    valid_link = []  # links worth following
    invalid_link = []  # links to discard
    for link in links:
        if re.search(r"(\.jpg)|(\.jpeg)|(\.gif)|(\.ico)|(\.png)|(\.js)|(\.css)$", link.strip()):
            # static resources (images, scripts, stylesheets) are discarded
            print 6, link  # "6" is just a debug tag for this branch
            invalid_link.append(link.strip())
            continue  # skip straight to the next link
        elif link.strip() == "" or link.strip() == "#" or link.strip() == "/":
            # empty or placeholder links are discarded
            invalid_link.append(link)
            continue
        elif link.strip().startswith("//"):
            # protocol-relative links: prepend the scheme
            valid_link.append("http:" + link.strip())
            continue
        elif link.strip().count("javascript") >= 1 or link.strip().count("mailto:") >= 1:
            # javascript: handlers and mailto: links are discarded
            invalid_link.append(link.strip())
            continue
        elif re.match(r"/\w+", link):
            # site-relative links: prepend the site root taken from the response URL
            root = re.match(r"http://.*?/", resq.url.strip())
            if root:
                valid_link.append(root.group().rstrip("/") + link.strip())  # rstrip avoids a double slash
            else:
                valid_link.append(re.match(r"http://.*", resq.url.strip()).group() + link.strip())
            continue
        else:
            valid_link.append(link.strip())  # everything left is treated as an absolute link
    print len(valid_link)
    print len(invalid_link)

    file_num = 1  # counter used to name the saved pages
    for link in list(set(valid_link)):  # deduplicate, then visit each valid link
        resq = requests.get(link, verify=True)  # fetch with certificate verification enabled
        if u"籃球" in resq.text:  # keep only pages that mention "籃球" (basketball)
            print link
            if u'meta charset="utf-8"' in resq.text:  # the page declares UTF-8
                with open(r"c:\downloads\lw\%d.html" % file_num, "w") as fp:
                    fp.write(resq.text.strip().encode("utf-8"))  # save as UTF-8
            else:
                with open(r"c:\downloads\lw\%d.html" % file_num, "w") as fp:
                    fp.write(resq.text.strip().encode("gbk"))  # otherwise assume GBK
            file_num += 1
    print "Done!"
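The first script is Python 2 throughout: urllib2, print statements, and the reload(sys)/sys.setdefaultencoding hack all disappeared in Python 3. A minimal Python 3 sketch of the same Kugou scrape, assuming the page still exposes a div with id SongtabContent containing li > a elements with .songName and .songTime children:

# Python 3 version of the Kugou example above (an assumption: the page
# layout is unchanged). An explicit file encoding replaces setdefaultencoding.
import urllib.request
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)"}
req = urllib.request.Request("http://www.kugou.com", headers=headers)
with urllib.request.urlopen(req) as resp:
    soup = BeautifulSoup(resp.read(), "html.parser")

tab = soup.find("div", id="SongtabContent")  # find() replaces the manual div loop
if tab is not None:
    with open("a.txt", "w", encoding="utf-8") as f:
        for li in tab.find_all("li"):
            f.write("Song name: %s " % li.a.select(".songName")[0].text)
            f.write("Song link: %s " % li.a.get("href"))
            f.write("Song duration: %s\n" % li.a.select(".songTime")[0].text)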
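In shoufu(), the "//" and "/path" branches rebuild absolute URLs by hand, which is where the double-slash pitfall noted above comes from. The standard library's urllib.parse.urljoin handles protocol-relative, site-relative, and already-absolute links uniformly; here is a sketch under that substitution (the helper name classify_links is mine, not from the original):

# Hypothetical helper: the same filtering as shoufu(), with urljoin doing
# the URL normalization instead of the manual "//" and "/path" branches.
import re
from urllib.parse import urljoin

RESOURCE_RE = re.compile(r"\.(jpg|jpeg|gif|ico|png|js|css)$")

def classify_links(base_url, links):
    valid, invalid = [], []
    for link in (l.strip() for l in links):
        if (not link or link in ("#", "/")
                or RESOURCE_RE.search(link)
                or link.startswith(("javascript:", "mailto:"))):
            invalid.append(link)  # resources, placeholders, js/mailto links
        else:
            valid.append(urljoin(base_url, link))  # resolves //host/x, /x, and x
    return valid, invalid

# usage, mirroring shoufu():
#   valid, invalid = classify_links(resq.url, re.findall(r'href="(.*?)"', resq.text))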
