Script purpose: collect the titles and URLs of an expert blogger's articles on the 51cto website and store them in a MySQL database.
#!/usr/bin/env python
# coding: utf-8
"""Crawl the article index of a 51cto blog and store title/URL pairs in MySQL.

Index pages 1-4 of the blog are downloaded, every link whose ``href`` has the
form ``/<blog id>/<article id>`` is collected, and one row per article is
inserted into the ``blog_51cto`` table (schema at the bottom of this script).
"""
from contextlib import closing
import re

from bs4 import BeautifulSoup
import MySQLdb

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

BLOG_ROOT = 'http://yujianglei.blog.51cto.com'
INDEX_URL = BLOG_ROOT + '/all/7215578/page/'

# Article links look like "/7215578/1560485": two numeric path segments of at
# least seven digits each.
ARTICLE_HREF = re.compile(r'^/\d{7,}/\d{7,}$')

# Placeholders are filled in by the driver, so quotes or other special
# characters in a scraped title cannot break or alter the statement.
INSERT_SQL = 'INSERT INTO blog_51cto (art_name, art_url) VALUES (%s, %s)'

# Plain text type: ``unicode`` on Python 2, ``str`` on Python 3.
text_type = type(u'')

# Step 1: collect (title, url) pairs from index pages 1 to 4.
articles = []
for page in range(1, 5):
    url = INDEX_URL + str(page)
    with closing(urlopen(url)) as response:
        # The site serves GBK-encoded pages; decode once at the I/O boundary.
        html = response.read().decode('GBK')
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all(href=ARTICLE_HREF):
        title = link.string
        if title is not None:
            # Convert bs4's NavigableString into a plain text object so the
            # database driver treats it as an ordinary string parameter.
            title = text_type(title)
        # A link without a single text child has no title (None); it is
        # stored as NULL, which the column allows.
        articles.append((title, BLOG_ROOT + link['href']))

# Step 2: insert the collected rows, one transaction per row.
# NOTE(review): host and credentials are hard-coded; consider moving them to
# configuration.
# The table is utf8, so the connection charset is set to match.
db = MySQLdb.connect('192.168.115.5', 'blog', 'blog', 'blog', charset='utf8')
try:
    cursor = db.cursor()
    try:
        for art_name, art_url in articles:
            print('%s %s' % (art_name, art_url))
            try:
                cursor.execute(INSERT_SQL, (art_name, art_url))
                db.commit()
            except MySQLdb.Error as e:
                # Best effort: report the failing row, undo it, and carry on
                # with the remaining articles.
                print(e)
                db.rollback()
    finally:
        cursor.close()
finally:
    db.close()

# Example of an inserted row:
#   INSERT INTO blog_51cto (art_name, art_url)
#   VALUES ("oracle manual archive, Auto Archive, archive location, archive process",
#           "http://yujianglei.blog.51cto.com/7215578/1560485");
#
# Schema. NOTE(review): the column lengths were garbled in the original
# listing; int(11) and varchar(255) are assumptions to be confirmed.
#   CREATE DATABASE blog;
#   CREATE TABLE `blog_51cto` (
#     `id` int(11) NOT NULL AUTO_INCREMENT,
#     `art_name` varchar(255) DEFAULT NULL,
#     `art_url` varchar(255) DEFAULT NULL,
#     PRIMARY KEY (`id`)
#   ) ENGINE=InnoDB AUTO_INCREMENT=1609 DEFAULT CHARSET=utf8;
This article is from the "Do not ask for the best, only better" blog; please be sure to retain this source attribution: http://yujianglei.blog.51cto.com/7215578/1771143
Python crawler: crawl an expert 51cto blogger's articles and save them to a MySQL database