Book reviews on JD.com (Jingdong) carry a lot of information: the purchase date, the book title, the author, and the counts of positive, neutral, and negative ratings, among other fields. Taking the purchase date as an example, a crawler written with Python + MySQL is small — only about 100 lines. I have added explanatory comments in the program:
import re
import sys
import threading
import time

import MySQLdb
import win32com.client
from bs4 import BeautifulSoup
from selenium import webdriver
def mydebug():
    """Debugging helper: close the browser driver and stop the program.

    Calls ``quit()`` on the module-level ``driver`` object and then
    terminates the interpreter with exit status 0.
    """
    driver.quit()
    # sys.exit is used instead of the site-provided ``exit`` builtin, which
    # is intended for interactive sessions and is absent under ``python -S``.
    sys.exit(0)
def catchdate(s):
    """Extract the purchase-date fragments from one review page.

    Parses the HTML string ``s`` and, for every ``<div class="date-buy">``
    element, takes the ``contents`` list of the first ``<br>`` tag found
    inside it.  Divs that contain no ``<br>`` tag, and ``<br>`` tags whose
    ``contents`` list is empty, are skipped.

    Side effect: the module-level counter ``nowtimes`` is incremented by
    one on every call.

    Returns:
        A list whose items are the non-empty ``contents`` lists that were
        collected, in document order.
    """
    global nowtimes

    soup = BeautifulSoup(s)
    dates = []
    # NOTE(review): tag and class names are written in lowercase here; confirm
    # "date-buy" against the markup of the live review page.
    for div in soup.find_all("div", class_="date-buy"):
        br = div.find("br")
        if br is None:
            # No <br> inside this div, so there is nothing to extract.
            continue
        contents = br.contents
        # ``contents`` is a list, so test it for emptiness; comparing it with
        # "" is always true and would let empty lists through to the caller.
        if contents:
            dates.append(contents)

    nowtimes += 1
    return dates
def gettimes(n, t):
    """Return the current progress as a human-readable percentage string.

    Args:
        n: Amount of work completed so far.
        t: Total amount of work; must be non-zero.

    Returns:
        The string ``"Current progress is:<p>%"`` where ``<p>`` is
        ``100 * n / t`` truncated to an integer.

    Raises:
        ZeroDivisionError: If ``t`` is zero.
    """
    percent = int(100 * n / t)
    return "Current progress is:" + str(percent) + "%"
# ------------------------------ Program start ------------------------------
# Book categories: category id -> category name (not referenced below).
cate = {
    "3273": "History",
    "3279": "Psychology",
    "3276": "Political Military",
    "3275": "Chinese Classics",
    "3274": "Philosophical Religion",
    "3277": "Law",
    "3280": "Culture",
    "3281": "Social Sciences",
}

# Resume point for an interrupted crawl: the book id to restart from and the
# page offset inside that book.
# NOTE(review): on Python 2, input() evaluates whatever is typed.  The int()
# and str() conversions below make the values behave the same on Python 3,
# where input() returns a string.
num1 = input("BookID:")
num2 = int(input("pagenumber:"))
resume_bookid = str(num1).strip()

# Total number of page fetches used for the progress display:
# 17355 * 20 = 347,100.  Kept as a float so the percentage uses true division.
totaltimes = 347100.0
nowtimes = 0

# Start the browser driver.  Alternative drivers that were tried:
# driver = webdriver.PhantomJS()
# driver = webdriver.Chrome(r'C:\Python27\Scripts\chromedriver')
driver = webdriver.Ie(r'C:\Python27\Scripts\IEDriverServer')

conn = None
cursor = None
try:
    # Connect to the database that lists the review pages to crawl.
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='JD')
    except Exception as e:
        print(e)
        sys.exit()  # the ``finally`` clause below still shuts the browser down

    # Get a cursor and read every book record.
    cursor = conn.cursor()
    sql = "SELECT * FROM BookNew ORDER BY pagenumber DESC"
    cursor.execute(sql)
    alldata = cursor.fetchall()

    insert_sql = "INSERT INTO LJJ (ID, Cateid, bookid, date) VALUES (NULL, %s, %s, %s)"

    skipping = True      # True until the resume book id has been reached
    start_page = num2    # page offset; only applies to the first book crawled

    # Review pages look like http://club.jd.com/review/10178500-1-154.html
    for rec in alldata:
        # rec[0] -- bookid, rec[1] -- cateid, rec[2] -- pagenumber
        if skipping and rec[0] != resume_bookid:
            continue
        skipping = False

        for p in range(start_page, rec[2]):
            # range() is zero-based while the page number in the URL is
            # built from p + 1.
            page = p + 1
            link = "http://club.jd.com/review/" + rec[0] + "-1-" + str(page) + ".html"

            # Fetch the page.
            driver.get(link)
            html = driver.page_source

            # Extract the purchase dates.
            buydate = catchdate(html)

            # Write them to the database.
            for z in buydate:
                if not z:
                    # Nothing was extracted for this entry.
                    continue
                # NOTE(review): per the record layout above, rec[0] is the
                # book id and rec[1] the category id, yet rec[0] is stored in
                # the Cateid column and rec[1] in bookid.  The original value
                # order is preserved here; confirm against the table schema.
                try:
                    # Parameterised so scraped text cannot alter the statement.
                    cursor.execute(insert_sql, (rec[0], rec[1], z[0]))
                except Exception as e:
                    # Best effort: report the failed row and keep crawling.
                    print(e)
            conn.commit()
            print(gettimes(nowtimes, totaltimes))

        # Every book after the first one is crawled from its first page, even
        # when the first book's page range turned out to be empty.
        start_page = 0
        num2 = 0
finally:
    # Always release the browser and the database handles, including when
    # the crawl stops because of an error.
    try:
        driver.quit()
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()