Beautiful Soup is a Python library for parsing HTML and XML. It builds a parse tree from a document, copes well with irregular markup, and provides simple, consistent operations for navigating, searching, and modifying that tree.
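As a minimal illustration of those three operations (the HTML string here is made up for this example):

from bs4 import BeautifulSoup

html = '<div class="item"><h2><a href="/news/123.html">Old title</a></h2></div>'
soup = BeautifulSoup(html)
link = soup.find('a')         # search the parse tree
print link['href']            # navigate: read an attribute -> /news/123.html
link.string = 'New title'     # modify the tree in place
print soup.h2.get_text()      # -> New title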
Use the urllib2 and bs4 modules to crawl the page data shown in the figure (title, content, stock name, stock ID, release time, and onlooker count) and save it to MongoDB.
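Each crawled item ends up as one document in MongoDB. The field names below are the ones used in the code; the values are only illustrative:

import datetime

doc = {
    '_id': '20140214',       # html page name
    'title': u'...',
    'content': u'...',
    'stock_name': [u'...'],  # MongoDB stores these list fields as arrays
    'stock_id': ['600000'],
    'update_time': datetime.datetime(2014, 2, 14, 8, 0),  # stored as UTC
    'onlooker': 42,
}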
Example:
The code is as follows:
# -*- coding: utf-8 -*-
import time
from bs4 import BeautifulSoup
import urllib2
import pymongo
import re
import datetime

def update():
    datas = {}
    # connect to MongoDB
    connection = pymongo.Connection('192.168.1.2', 27017)
    # create or connect to the test_hq database
    db = connection.test_hq
    for i in soup.find_all("div", class_="item"):
        # use the html page name as the _id
        datas['_id'] = str(i.h2.a['href']).split('/')[-1].split('.')[0]
        # get the title
        datas['title'] = i.h2.get_text()
        # get the url of the article page
        url2 = i.h2.a['href']
        html2 = urllib2.urlopen(url2)
        html_doc2 = html2.read()
        soup2 = BeautifulSoup(html_doc2)
        # get the article content from the description meta tag
        datas['content'] = soup2.find(attrs={"name": "description"})['content']
        stock_name = []
        stock_id = []
        # get the names of the affected stocks; the names are collected in a
        # list, and MongoDB supports inserting arrays
        for name in re.findall(u"[\u4e00-\u9fa5]+", i.find(class_="stocks").get_text()):
            stock_name.append(name)
        datas['stock_name'] = stock_name
        # get the ids of the affected stocks
        for id in re.findall(r"\d+", i.find(class_="stocks").get_text()):
            stock_id.append(id)
        datas['stock_id'] = stock_id
        # get the release time and convert it to MongoDB's time format
        # (pymongo stores datetimes in UTC, hence the -8 hours)
        datas['update_time'] = datetime.datetime.strptime(
            re.search(r"\w+.*\w+", i.find(class_="fl date").span.get_text()).group(),
            '%Y-%m-%d %H:%M') - datetime.timedelta(hours=8)
        # get the number of onlookers
        datas['onlooker'] = int(re.search(r"\d+", i.find(class_="icons ic-wg").get_text()).group())
        # insert into the database
        db.test.save(datas)

def get_data():
    # get the html page name of the newest item, used to decide
    # whether there is an update
    title = str(soup.h2.a['href']).split('/')[-1].split('.')[0]
    with open('update.txt', 'r') as f:
        last_title = f.readline()
    if title == last_title:
        print 'currently no update', title
    else:
        with open('update.txt', 'w') as f:
            f.write(title)
        update()

if __name__ == '__main__':
    while True:
        url = 'http://www.ipython.me/qingbao/'
        html = urllib2.urlopen(url)
        html_doc = html.read()
        soup = BeautifulSoup(html_doc)
        get_data()
        # refresh every 30 seconds
        time.sleep(30)
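To check that the documents were saved, you can read a few back with the same (older) pymongo API that the script uses; the host, database, and collection names are taken from the code above:

import pymongo

connection = pymongo.Connection('192.168.1.2', 27017)
db = connection.test_hq
for doc in db.test.find().limit(3):
    print doc['_id'], doc['title'], doc['stock_name']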