python 爬取烏雲所有廠商名字,url,漏洞總數 並存入資料庫

來源:互聯網
上載者:User

標籤:

需要:MySQLdb 
下面是資料表結構:

 
/*
Navicat MySQL Data Transfer

Source Server         : 127.0.0.1
Source Server Version : 50509
Source Host           : 127.0.0.1:3306
Source Database       : wooyun

Target Server Type    : MYSQL
Target Server Version : 50509
File Encoding         : 65001

Date: 2015-09-24 17:38:14
*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for wooyun_vul
-- ----------------------------
-- One row per vendor: its name, its homepage URL and its vulnerability count.
-- The table charset is utf8 (not latin1) so that it matches the
-- "set names 'utf8'" issued by the loader script and can hold non-Latin
-- vendor names without corrupting them.
DROP TABLE IF EXISTS `wooyun_vul`;
CREATE TABLE `wooyun_vul` (
  `id` int(8) NOT NULL AUTO_INCREMENT,
  `corpsname` varchar(255) DEFAULT NULL,
  `corpsurl` varchar(255) DEFAULT NULL,
  `vulcount` int(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

python 指令碼:

# -*- coding: utf-8 -*-
"""Scrape the vendor list from wooyun.org and store it in MySQL.

For every listing page the script collects vendor names and vendor homepage
URLs, then visits each vendor's page to read a count from it, and finally
inserts (name, url, count) rows into the ``wooyun_vul`` table.

Python 2 only (uses ``urllib2`` and ``urllib.quote``). Requires MySQLdb.
"""
import re
import urllib
import urllib2

import MySQLdb

# Base URL of the paginated vendor listing; the page number is appended.
url = "http://wooyun.org/corps/page/"

# Browser-like User-Agent sent with every request.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36')

# Listing pages 1..PAGE_COUNT are crawled.
PAGE_COUNT = 36

# Compiled once at import time instead of on every request.
CORP_NAME_RE = re.compile(
    r'<td width="370"><a href="\/corps\/(.*?)">.*?<\/a><\/td>')
CORP_URL_RE = re.compile(r'<a rel="nofollow" href="(.*?)" target=')
VUL_COUNT_RE = re.compile(r'<p class="page">.*?(\d+).*')

INSERT_SQL = ('insert into wooyun_vul(corpsname,corpsurl,vulcount) '
              'values(%s,%s,%s)')


def _fetch(page_url):
    """Return the raw body of *page_url*, always closing the response."""
    request = urllib2.Request(page_url)
    request.add_header('User-Agent', USER_AGENT)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        response.close()


def getWooyuncorps(url):
    """Fetch one listing page and extract vendor names and homepage URLs.

    Args:
        url: Full URL of a listing page.

    Returns:
        A ``(names, urls)`` tuple of two lists of strings. The two patterns
        are matched independently, so the lists are only index-aligned if
        every vendor row on the page carries exactly one homepage link.
    """
    content = _fetch(url)
    return CORP_NAME_RE.findall(content), CORP_URL_RE.findall(content)


def getcorpscount(url):
    """Fetch one vendor page and extract the count(s) shown on it.

    Args:
        url: Full URL of a vendor page.

    Returns:
        A list of digit strings, one per ``<p class="page">`` line that
        contains a number (the first run of digits after that tag). The
        list may be empty.
    """
    return VUL_COUNT_RE.findall(_fetch(url))


corpslist = []
corpsurllist = []
for page in range(1, PAGE_COUNT + 1):
    corps, corpsUrl = getWooyuncorps(url + str(page))
    corpslist.extend(corps)
    corpsurllist.extend(corpsUrl)
# Sanity check: the two numbers should be equal.
print("%d %d" % (len(corpslist), len(corpsurllist)))

# Exactly one entry per vendor so that countlist stays index-aligned with
# corpslist: the first match is used, and None (stored as NULL) marks a
# vendor page on which no count was found.
countlist = []
for corp in corpslist:
    matches = getcorpscount(
        "http://www.wooyun.org/corps/" + urllib.quote(corp))
    countlist.append(int(matches[0]) if matches else None)

conn = MySQLdb.connect('localhost', 'root', '', 'wooyun')
try:
    cur = conn.cursor()
    cur.execute("set names 'utf8'")
    conn.commit()

    # zip() stops at the shortest list, so a name/url length mismatch
    # truncates the output instead of raising IndexError mid-run.
    for row in zip(corpslist, corpsurllist, countlist):
        print("%s %r" % (INSERT_SQL, row))
        # Scraped values are untrusted: let the driver quote them rather
        # than interpolating them into the statement text.
        cur.execute(INSERT_SQL, row)
        conn.commit()
finally:
    conn.close()
print("success")

  

python 爬取烏雲所有廠商名字,url,漏洞總數 並存入資料庫

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.