The main goal of this article is to obtain Weibo comment data, including the comment link, total comment count, user ID, user nickname, comment time, comment content, user profile link, and so on.
The implementation code looks like this:
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 8 16:39:07 2017

@author: Administrator

Crawl comments from Sina Weibo's mobile JSON API (m.weibo.cn) for one
account's recent posts and store each comment in a local MySQL table.
"""
import json
import random
import time

import pymysql.cursors
import requests
# NOTE(review): BeautifulSoup was imported in the original but never used;
# kept here only to preserve the original file's import surface.
from bs4 import BeautifulSoup  # noqa: F401


def crawlDetailPage(url, page, i):
    """Fetch one post's comment JSON and persist every comment to MySQL.

    Args:
        url:  comment-API URL for a single post
              (https://m.weibo.cn/api/comments/show?id=...).
        page: index of the timeline page the post came from (for logging).
        i:    index of the post within that timeline page (for logging).
    """
    # Read the JSON comment data for this post.
    req = requests.get(url)
    data = json.loads(req.text)

    # Total number of comments on the post.
    comment_counts = data['total_number']
    print("Number of comments for post {} on page {}: {}".format(
        i + 1, page, comment_counts))

    # Walk every comment returned for this page of the post.
    for comment in data['data']:
        user_id = comment['user']['id']
        user_name = comment['user']['screen_name']
        comment_time = comment['created_at']
        comment_text = comment['text']
        user_profile_url = comment['user']['profile_url']

        print("User {} commented at: {}".format(user_name, comment_time))
        print("Comment content: {}".format(comment_text))
        print("User detail link: {}".format(user_profile_url))

        # Database operations: open a connection, insert one row, close.
        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='123456',
                                     db='weibo',
                                     charset='utf8mb4')
        try:
            # Get a session cursor and run a parameterized INSERT
            # (placeholders keep us safe from SQL injection).
            with connection.cursor() as cursor:
                sql = ("INSERT INTO `comment` (`commentUrl`, `commentCounts`, "
                       "`userId`, `userName`, `commentTime`, `commentText`, "
                       "`userProfileUrl`) VALUES (%s, %s, %s, %s, %s, %s, %s)")
                cursor.execute(sql, (url, comment_counts, user_id, user_name,
                                     comment_time, comment_text,
                                     user_profile_url))
            # Commit the insert.
            connection.commit()
        finally:
            # Always release the connection, even if the insert fails.
            connection.close()


def crawl(url, page):
    """Fetch one timeline page of the account and crawl each post's comments.

    Args:
        url:  container/getindex API URL for one timeline page.
        page: 1-based page number (for logging only).
    """
    # Read the timeline JSON for this page.
    req = requests.get(url)
    data = json.loads(req.text)

    # 'cards' holds the page's entries; indices 2..10 are the actual posts
    # (the leading cards are profile/header entries).
    content = data['cards']
    for i in range(2, 11):
        content_id = content[i]['mblog']['id']
        comment_url = ("https://m.weibo.cn/api/comments/show?id="
                       + str(content_id))
        crawlDetailPage(comment_url, page, i)

        # Random pause between posts to avoid being rate-limited.
        t = random.randint(11, 13)
        print("Sleep time: {}s".format(t))
        time.sleep(t)


if __name__ == "__main__":
    for i in range(1, 2):
        print("Getting page {} of Weibo data:".format(i))
        # JSON endpoint for the target account's timeline
        # (uid 1939498534 is Zhihu's official Weibo account).
        url = ("https://m.weibo.cn/api/container/getindex"
               "?uid=1939498534&type=uid&value=1939498534"
               "&containerid=1076031939498534&page=" + str(i))
        crawl(url, i)
        # Longer pause between timeline pages.
        time.sleep(random.randint(31, 33))
The partial run results are shown in the following illustration:
The data in the MySQL database is stored as shown in the following illustration: