Scrape the category information on the left side of the Dianping ("public reviews") home page.
The goal is to store the Chinese category names in MongoDB and to push each category link into a Redis database.
Because every record receives its own ID when it is stored in MongoDB, we save that ID together with the link in Redis, so each link can be matched to its record without error.
# -*- coding: utf-8 -*-
# Scrape the category tree from the Dianping home page.
#
# Category names are stored in MongoDB (database "dianping", collection
# "classification").  For each second-level category that has a link, the
# string "<mongo id>,<url>" is pushed onto the Redis list "classurl", so the
# link can later be matched to its MongoDB record.
import re
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import redis

client = pymongo.MongoClient(host="127.0.0.1")
db = client.dianping  # database name: dianping
collection = db.classification  # collection name: classification

r = redis.Redis(host='127.0.0.1', port=6379, db=0)

# Request headers: present a browser User-Agent to the site.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                  'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}
REQ_TIMEOUT = 5  # seconds to wait for the HTTP response


def secclassfind(selector, classid):
    """Store every second-level category found under *selector*.

    Each ``<a>`` inside ``div.sec-items`` is inserted into MongoDB with
    ``pid`` set to *classid*; if the anchor has an ``href``, the pair
    ``"<new id>,<href>"`` is pushed onto the Redis list ``classurl``.

    Args:
        selector: lxml tree parsed from one first-level ``<li>`` element.
        classid: MongoDB ``_id`` of the parent (first-level) category.
    """
    for anchor in selector.xpath('//div[@class="sec-items"]/a'):
        url = anchor.get('href')
        title = anchor.text
        # Keep the child's id in its own variable: overwriting ``classid``
        # would make later siblings point at the previous sibling instead
        # of the real parent.
        child_id = collection.insert_one(
            {'classname': title, 'pid': classid}
        ).inserted_id
        if url:
            # Join id and link so a consumer can map the URL to its record.
            r.lpush('classurl', '%s,%s' % (child_id, url))


def public(url):
    """Fetch *url* and store its first- and second-level categories.

    Every ``li.first-item`` element is parsed on its own; each
    ``a.index-title`` text inside it is inserted into MongoDB as a top-level
    category (``pid`` is ``None``) and its sub-categories are stored through
    ``secclassfind``.

    Args:
        url: Address of the page to scrape.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    req = Request(url=url, headers=HEADERS)
    # ``with`` closes the HTTP response even if reading or decoding fails.
    with urlopen(req, timeout=REQ_TIMEOUT) as resp:
        html = resp.read().decode("utf-8")

    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all(name='li', class_="first-item"):
        # Re-parse only this <li> so the XPath queries below are limited to
        # the current first-level category.
        selector = etree.HTML(str(link))
        for title in selector.xpath('//a[@class="index-title"]/text()'):
            print(title)
            class_id = collection.insert_one(
                {'classname': title, 'pid': None}
            ).inserted_id
            secclassfind(selector, class_id)
            print('---------')
        print('-----------------------------')


# NOTE: as in the original script, the crawl starts as soon as this module
# is executed or imported.
public('http://www.dianping.com/')
Python: crawling Dianping ("public reviews") and writing the results to MongoDB and Redis