Checking Baidu-indexed pages for dead links
A friend said his site had broken down and he wanted to know how many of its indexed pages are now dead links, so I thought through a process for checking. The indexed-page count you get from a site: query is of course not precise, but there is no better source available; the true index exists only inside the search engine's own database...
Check the status codes of the indexed pages. The process: fetch the indexed URLs from Baidu's site: results > resolve each result's real URL > fetch that URL's status code.
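To make the three steps concrete, here is a minimal sketch for a single search-result link. A sketch only: the href value is a hypothetical placeholder, and it relies on the requests library's HEAD requests not following redirects by default.

#coding:utf-8
import requests

# href taken from a .c-showurl node; this value is a hypothetical placeholder
baidu_href = 'https://www.baidu.com/link?url=XXXX'
resp = requests.head(baidu_href, timeout=10)     # HEAD; requests does not follow the redirect here
real_url = resp.headers['location']              # Baidu answers 302 with the real URL in Location
print requests.get(real_url, timeout=10).status_code, real_url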
The full script runs fairly slowly, though; I'm not sure whether the slow part is the BeautifulSoup parsing or the step that reads the Location header to resolve each real URL. The script:
#coding:utf-8
import urllib2, requests
from bs4 import BeautifulSoup as bs

domain = 'www.123.com'   # domain to query
page_num = 10 * 10       # the first factor is the number of result pages to crawl

def gethtml(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        # 'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'BDUSS=ng4UFVyUUpWU2hUR2R3b3hKamtpaE9ocW40LTFZcGdWeDBjbXkzdE83eDJQSE5YQVFBQUFBJCQAAAAAAAAAAAEAAADD3IYSamFjazE1NDUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHavS1d2r0tXa; ispeed_lsm=2; PSTM=1465195705; BIDUPSID=2274339847BBF9B1E97DA3ECE6469761; H_WISE_SIDS=102907_106764_106364_101556_100121_102478_102628_106368_103569_106502_106349_106665_106589_104341_106323_104000_104613_104638_106071_106599_106795; BAIDUID=D94A8DE66CF701AB5C3332B1BF883DDC:FG=1; BDSFRCVID=UEusJeC62m80hjJRoxzDhboaBeKaL6vTH6aIa6lTlb9Zx-72yRF7EG0PfOlQpYD-d1GyogKK3gOTH4jP; H_BDCLCKID_SF=fR-foIPbtKvSq5rvKbOEhPCX-fvQh4JXHD7yWCvG3455OR5Jj65Ve58JM46N2bvE3IbaWbjP5lvH8KQC3MA--fF_jxvn2PD8yj-L_KoXLqLbsq0x0-jchh_QWt8LKToxMCOMahkb5h7xOKbF056jK4JKjH0qt5cP; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a02157232777; BD_HOME=1; BD_UPN=12314353; sug=3; sugstore=1; ORIGIN=0; bdime=0; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; H_PS_645EC=a5cfUippkbo0uQPU%2F4QbUFVCqXu4W9g5gr5yrxTnJT10%2FElVEvJBbeyjWJq8QUHgepjd; BD_CK_SAM=1; BDSVRTM=323; H_PS_PSSID=1434_20317_12896_20076_19860_17001_15506_11866; __bsi=16130066511508055252_00_0_I_R_326_0303_C02F_N_I_I_0',
        # 'Host': 'www.baidu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
    }
    req = urllib2.Request(url=url, headers=headers)
    html = urllib2.urlopen(req, timeout=30).read()
    return html

def status(url):
    # return the HTTP status code of a URL
    return requests.get(url).status_code

status_file = open('url_status.txt', 'a+')
for i in range(10, page_num, 10):   # note: starts at pn=10, i.e. the second results page
    url = 'https://www.baidu.com/s?wd=site%3A' + domain + '&pn=' + str(i)
    html = gethtml(url)
    soup = bs(html, "lxml")
    for link in soup.select('.c-showurl'):
        urls = link.get('href')
        header = requests.head(urls).headers
        header_url = header['location']   # resolve the real URL from the 302 Location header
        code = status(header_url)         # fetch the status code once and reuse it
        if code == 404:
            print code, header_url        # print status code and real URL
            status_file.write(str(code) + ' ' + header_url + '\n')   # append to the output file
status_file.close()
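Since most of the wall-clock time is likely spent in the serial HEAD/GET round trips rather than in parsing, one way to speed this up would be to run the per-URL checks in a thread pool. A minimal sketch, assuming the .c-showurl hrefs have already been collected into a list; the pool size of 10 and the function names are my own choices, not part of the original script.

#coding:utf-8
import requests
from multiprocessing.dummy import Pool   # thread-based Pool, available on Python 2 and 3

def resolve_and_check(baidu_href):
    # HEAD the Baidu redirect link, read the real URL from Location,
    # then GET it once to obtain its status code.
    try:
        real_url = requests.head(baidu_href, timeout=10).headers['location']
        return requests.get(real_url, timeout=10).status_code, real_url
    except Exception:
        return None, baidu_href          # network error or missing Location header

def find_dead_links(hrefs):
    pool = Pool(10)                      # 10 worker threads
    results = pool.map(resolve_and_check, hrefs)
    pool.close()
    pool.join()
    return [(code, url) for code, url in results if code == 404]

Timing the parsing and the network halves separately would also settle the question above about where the time actually goes.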
The code snippet I borrowed from:
#coding: utf-8
import sys
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup

question_word = "吃貨 程式員"
url = "http://www.baidu.com/s?wd=" + urllib.quote(question_word.decode(sys.stdin.encoding).encode('gbk'))
htmlpage = urllib2.urlopen(url).read()
soup = BeautifulSoup(htmlpage)
print len(soup.findAll("table", {"class": "result"}))
for result_table in soup.findAll("table", {"class": "result"}):
    a_click = result_table.find("a")
    print "-----標題----\n" + a_click.renderContents()    # title
    print "----連結----\n" + str(a_click.get("href"))     # link
    print "----描述----\n" + result_table.find("div", {"class": "c-abstract"}).renderContents()   # description
    print
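The borrowed snippet targets the old BeautifulSoup 3 API (the bare BeautifulSoup import, findAll, renderContents) and Baidu's old table-based result markup. For comparison, a rough port to the bs4 import used in the main script; a sketch only, since Baidu's SERP markup changes often and the .result selector here is an assumption.

#coding: utf-8
import urllib
import urllib2
from bs4 import BeautifulSoup

question_word = "吃貨 程式員"
# quote the UTF-8 bytes directly; Baidu also accepts utf-8 in the wd parameter
url = "http://www.baidu.com/s?wd=" + urllib.quote(question_word)
soup = BeautifulSoup(urllib2.urlopen(url).read(), "lxml")
results = soup.select(".result")                 # assumed container class for one organic result
print len(results)
for result in results:
    a_click = result.find("a")
    print a_click.get_text().encode('utf-8')     # title
    print a_click.get("href")                    # link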