I want to build a news text recognition and classification project, so I first wrote a crawler for Baidu News.
Environment: Windows 7 (32-bit), Python 3.4, and several third-party libraries.
Functionality: on a schedule, crawl Baidu News by category, collecting each item's headline, category, and body text; store everything automatically in a MySQL database; and send a reminder email to my own mailbox after each run.
Defects: because the news comes from different sources with different page encodings, a small amount of mojibake appears; and no automatic deduplication is done in the database (it is not hard to add manually, so I didn't pursue it — see the sketch below).
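One simple way to add that deduplication, sketched below on the assumption that the News table from Step 1 exists: put a UNIQUE index on the title column and switch the insert to MySQL's INSERT IGNORE, so headlines that are already stored are silently skipped.

# Deduplication sketch (not part of the original project):
# a UNIQUE index on title plus INSERT IGNORE makes MySQL skip
# rows whose headline is already in the table.
import MySQLdb

db = MySQLdb.connect(host="localhost", user="root", passwd='your password',
                     db="test", use_unicode=True, charset="utf8")
cursor = db.cursor()

# One-time setup: enforce uniqueness of headlines
cursor.execute("ALTER TABLE News ADD UNIQUE INDEX idx_title (title)")

# INSERT IGNORE silently drops rows that would violate the unique index
sql = "INSERT IGNORE INTO News SET class=%s, title=%s, text=%s"
cursor.execute(sql, ('Tech', 'Some headline', 'Some body text'))
db.commit()
db.close()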
STEP 1: creat_dbtable.py — connect to the database and create the table (this can also be done directly in MySQL).
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 6 23:31:33 2016
@author: Administrator
"""
# Database creation operations
import MySQLdb

# Open the database connection
db = MySQLdb.connect(host="localhost", user="root", passwd='your password',
                     db="test", use_unicode=True, charset="utf8")
cursor = db.cursor()

# If the table already exists, drop it first
cursor.execute("DROP TABLE IF EXISTS News")

# SQL statement that creates the data table
# (the title column length was garbled in the original; 100 is an assumption)
sql = """CREATE TABLE News (
    class VARCHAR(10) NOT NULL,
    title VARCHAR(100),
    text VARCHAR(15000)
)"""
cursor.execute(sql)

# Close the database connection
db.close()
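Note that creat_dbtable.py drops and recreates the table on every run, which wipes any previously stored news. If you would rather keep existing rows across runs, one alternative (a sketch, not the original author's code) is CREATE TABLE IF NOT EXISTS:

# Alternative: create the table only when it does not exist yet,
# so re-running the setup script preserves previously stored rows.
cursor.execute("""CREATE TABLE IF NOT EXISTS News (
    class VARCHAR(10) NOT NULL,
    title VARCHAR(100),
    text VARCHAR(15000)
)""")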
In MySQL you can see that the table has been generated:
Step 2: To keep track of how each crawl went, write send_email.py to send a notification email; this module is called from the main spider file.
Note: to send mail to your own mailbox, you first need to enable the SMTP service on that mailbox and obtain an authorization password; there are plenty of tutorials for this online, and I may add the details later.
# coding: utf-8
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib

# Format "name <addr>" pairs so non-ASCII display names are encoded correctly
def _format_addr(s):
    name, addr = parseaddr(s)
    return formataddr((Header(name, 'utf-8').encode(), addr))

def send_ms(T):
    from_addr = "[email protected]"  # sender address (redacted in the original)
    password = 'your-password'      # SMTP authorization password
    to_addr = '[email protected]'   # recipient address (redacted in the original)
    smtp_server = 'smtp.qq.com'
    msg = MIMEText(T, 'plain', 'utf-8')
    msg['From'] = _format_addr('anyone')
    msg['To'] = _format_addr('Echo')
    msg['Subject'] = Header('The new report', 'utf-8').encode()
    server = smtplib.SMTP_SSL(smtp_server, 465, timeout=10)
    server.set_debuglevel(0)
    server.login(from_addr, password)
    server.sendmail(from_addr, [to_addr], msg.as_string())
    server.quit()

# send_ms(T)
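To verify the mail setup independently of the crawler, the module can be smoke-tested from an interactive session (a sketch; it assumes the sender address and authorization password above have been filled in):

# Quick standalone test of send_email.py
import send_email
send_email.send_ms('test message from the crawler setup')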
Step 3: Create spider.py to implement the main crawling logic.
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 6 21:24:27 2016
@author: Administrator
"""
import re
import time
import requests
import numpy as np               # imported in the original but not used below
import send_email
from bs4 import BeautifulSoup
from collections import Counter  # imported in the original but not used below
import MySQLdb

start = time.time()

# Open the database connection
db = MySQLdb.connect(host="localhost", user="root", passwd='password',
                     db="test", use_unicode=True, charset="utf8")
cursor = db.cursor()

headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"}

# Fetch the Baidu News home page (head_data)
def get_head_data():
    head_url = 'http://internet.baidu.com/'
    data = requests.get(head_url, headers=headers)
    data.encoding = 'gbk'
    # print(data.status_code)
    head_data = data.text
    return head_data

# Get the title and href of each news category
def get_class(head_data):
    title_href = {}
    pa = re.compile(r'<a href="(http.*?\.com/).*?>.*?(\w+)</a></li>')
    ma = re.findall(pa, head_data)[1:-7]
    ma = list(set(ma))[:-1]
    # print(len(ma))
    for i in range(len(ma)):
        key = ma[i][1]
        value = ma[i][0]
        title_href[key] = value
    # print(title_href)
    return title_href

# Extract the headline links on each category page (class_data)
def get_class_data(class_url):
    class_data = requests.get(class_url, headers=headers)
    pa = re.compile(r'charset=(.*?)">')
    charset = re.findall(pa, class_data.text)[0]
    class_data.encoding = charset
    # class_data.encoding = 'gbk'
    class_data = class_data.text
    soup = BeautifulSoup(class_data, 'lxml')
    data = soup.findAll('a', {'target': '_blank'})
    class_data = {}
    for i in range(len(data)):
        title = data[i].get_text()
        href = data[i].get('href')
        if len(title) > 10:
            if 'Download' not in title:  # skip download links (keyword translated from the Chinese original)
                class_data[title] = href
    return class_data

# Get the body text of each news item (a rough scrape)
def get_news_text(href):
    try:
        data = requests.get(href, headers=headers)
        # data.encoding = 'gbk'
        pa = re.compile(r'charset=(.*?)">')
        charset = re.findall(pa, data.text)[0]
        data.encoding = charset
        data = BeautifulSoup(data.text, 'lxml').get_text()
        # Strip ASCII letters, digits, and punctuation, keeping the CJK text
        text = re.sub(r"[A-Za-z0-9\[\`\~\!\@\#\$\^\"\-\+\_\&\n\t\*\(\)\=\|\{\}\'\:\;\,\]\.\<\>\/\?\%]", "", data)
    except:
        # print('get news text fail...')
        text = None
    return text

head_data = get_head_data()
title_href = get_class(head_data)
count = 0
for class_title, class_href in dict(title_href).items():
    print(class_title)
    # try:
    class_data = get_class_data(class_href)
    # except:
    #     print('get class data fail...')
    #     pass
    for news_title, news_url in class_data.items():
        # print(news_title)
        text = get_news_text(news_url)
        sql = """INSERT INTO News SET class=%s, title=%s, text=%s"""
        try:
            cursor.execute(sql, (class_title, news_title, text))
            db.commit()
            count += 1
        except:
            # print('save fail...')
            pass

db.close()
end = time.time()
total_time = end - start
t1 = 'This fetch took %s seconds' % str(total_time)
t2 = ' & fetched %s news items in total' % str(count)
T = t1 + t2
# print(t1, t2)
send_email.send_ms(T)
Database storage results:
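The stored data can also be spot-checked from Python; here is a small sketch (reusing the connection parameters from spider.py) that counts the stored news per category:

# Count stored news items per category to verify the crawl results
import MySQLdb

db = MySQLdb.connect(host="localhost", user="root", passwd='password',
                     db="test", use_unicode=True, charset="utf8")
cursor = db.cursor()
cursor.execute("SELECT class, COUNT(*) FROM News GROUP BY class")
for class_title, n in cursor.fetchall():
    print(class_title, n)
db.close()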
Email Details:
REMARK: For Windows scheduled tasks, refer to this tutorial.
Here is the setup and operation of my own scheduled task:
Windows executes the Baidu news crawler on a schedule.
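For reference, such a task can also be registered from the command line with Windows' built-in schtasks tool; the task name, script path, and time below are placeholders, not my actual settings:

rem Run the spider every day at 09:00 (name, path, and time are placeholders)
schtasks /Create /SC DAILY /TN "BaiduNewsSpider" /TR "python C:\path\to\spider.py" /ST 09:00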