This is a data-collection (web-scraping) program written in Python. After about half a year of writing PHP data-collection code, I decided to redo the same process in Python: the overall flow is similar, but implementing it in Python is also a chance to learn the language along the way.
What are the differences between Python and PHP, and what advantages does Python have?
My impression after these few days of study is that Python's range of application is broader and more flexible, while PHP focuses on the web backend.
First of all, Python and PHP differ in many small ways; the comments in the code below point out the differences as they come up.
# -*- coding: utf-8 -*-
# Imports.
# urllib does the page fetching below; in PHP one would typically use
# file_get_contents(url) or a cURL wrapper for the same job.
import urllib
# Unlike PHP, Python needs an explicit import for most functionality;
# PHP has similar loading, but much of it happens in the ini/initialization.
import re
import os
import sys
import hashlib
import MySQLdb

# Module-level handles for the DB connection and its cursor.
# They are populated by connnect_db() and shared by the functions below.
# (The original used bare `global` statements here, which are no-ops at
# module level; initializing to None makes the intent explicit.)
conne = None
cur = None
def connnect_db():
    """Open the MySQL connection and cursor used by the rest of the script.

    Stores the connection in the module-level global ``conne`` and the
    cursor in ``cur`` so that save_link() and the main loop can use them.
    """
    global conne
    global cur
    conne = MySQLdb.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='xxxx',
        db='lk_xxxx',
        # Specify the character set here, otherwise inserted Chinese text
        # comes out garbled.
        charset='utf8',
        # PHP picks the socket path up from its configuration; Python
        # presumably has a config mechanism too, but specifying it
        # directly here works.
        unix_socket='/tmp/mysql.sock',
    )
    # The "cursor" is a handle (roughly a resource/locator object) used
    # below to execute statements and fetch rows.
    cur = conne.cursor()
def get_md5(src):
    """Return the hex MD5 digest of *src*.

    Compared with PHP's one-call md5() function, Python's hashlib is a
    little more verbose. Accepts either bytes or text; text is encoded
    as UTF-8 first, so the function works on both Python 2 and 3.
    """
    if not isinstance(src, bytes):
        src = src.encode('utf-8')
    return hashlib.md5(src).hexdigest()
# Save one link record.
def save_link(url, title):
    """Insert (domain, url, title) into lk_link unless the URL is already stored.

    The URL is keyed by its MD5 hash so duplicate links are skipped.
    Uses the module-level ``cur``/``conne`` set up by connnect_db().
    """
    link_md5 = get_md5(url)
    # Parameterized query instead of string concatenation: url/title come
    # from scraped pages, so concatenated SQL would be injectable.
    sql = "SELECT * FROM lk_link WHERE link_md5 = %s"
    print(sql)
    cur.execute(sql, (link_md5,))
    result = cur.fetchone()
    print(result)
    if result is None:
        # Strip the scheme and any path to keep only the domain
        # (similar to PHP's trim()/substr() combination).
        domain = url.replace('http://', '')
        domain = domain.replace('https://', '')
        domain = domain.strip('/')
        domain = domain + '/'
        endpos = domain.find('/')
        # Slicing instead of a substr()-style function call.
        domain = domain[0:endpos]
        # Truncate the title: in UTF-8 a CJK character takes 3 bytes,
        # hence the 3 * 48 character budget before re-encoding.
        title = title.decode('utf8')[0:3 * 48].encode('utf8')
        insert_sql = ("INSERT INTO `lk_homepage`.`lk_link` "
                      "(`domain`, `link`, `title`, `link_md5`) "
                      "VALUES (%s, %s, %s, %s)")
        cur.execute(insert_sql, (domain, url, title, link_md5))
        # Commit after each statement; this matches raw MySQL usage,
        # whereas PHP drivers often wrap/auto-commit this for you.
        conne.commit()
# Fetch the contents of one web page, with a local file cache.
def get_url(url):
    """Return the body of *url*, caching each fetched page in a local .txt file.

    The cache filename is the URL with every character outside [a-z0-9]
    replaced by '_' — re.sub plays the role of PHP's preg_replace here.
    (NOTE(review): the original pattern '[^a-z|0-9]{1}' also preserves a
    literal '|' in URLs; kept as-is to preserve behavior.)
    """
    cache_name = re.sub('[^a-z|0-9]{1}', '_', url)
    print(cache_name)
    filename = cache_name + '.txt'
    if os.path.exists(filename):
        # Cache hit: read the page from disk instead of refetching it.
        with open(filename, 'r') as f:
            content = f.read()
        print('read cache ' + url + '.txt')
    else:
        # Cache miss: download the page (open and read in one go) and
        # store it for next time; open(..., 'w') creates the file.
        content = urllib.urlopen(url).read()
        with open(filename, 'w') as f:
            f.write(content)
        print('new get ' + filename + '.txt')
    return content
base_url = 'http://xxxx_page.html'

connnect_db()
# Crawl listing pages 1..123.
for i in range(1, 124):
    if i == 1:
        # Page 1 of the listing has no "page" suffix in its URL.
        url = base_url.replace('_page', '')
    else:
        url = base_url.replace('page', str(i))
    print('crawl, current page %d ...' % i)
    content = get_url(url)
    # Pull every detail-page link out of the listing page.
    # (NOTE(review): both regexes below were reconstructed from a garbled
    # source; verify the class names against the actual page markup.)
    tmp_reg = r'righttxthead"><a href="(.*?)"'
    aft = re.findall(tmp_reg, content)
    for url_1 in aft:
        # Descend into each detail page.
        content1 = get_url('http://xxxx' + url_1)
        # [\s\S]*? matches across newlines, non-greedily. ".+" would be
        # greedy, ".*?" is not — PHP marks greediness with the /U modifier
        # instead.
        tmp_reg1 = r'<p class="Webintro">([\s\S]*?)</p><ul><a href="([\s\S]*?)"'
        # findall is similar to PHP's preg_match_all, but the result
        # structure differs: a list of tuples, one tuple per match.
        aft_1 = re.findall(tmp_reg1, content1)
        # If the matches contain Chinese, printing the whole list shows
        # escaped/garbled text; loop over the elements to print them
        # readably — another difference from PHP.
        for match in aft_1:
            tit = match[0]
            url_web = match[1]
            save_link(url_web, tit)
    # (Python has no i++ operator; the range() loop replaces the original
    # manual while/i += 1 counter.)
print('over!')
cur.close()
conne.close()
The above are my notes from writing a crawler directly in Python, looking up the necessary Python knowledge as I went. Learning by doing in this reverse order has some advantages, but it should be followed up with systematic study.
Learning notes: writing a data-collection example in Python.