import re
import string
import urllib2
import webbrowser

import requests
from BeautifulSoup import BeautifulSoup
# import MySQLdb
def gethtml(page):
    """Fetch one page of Sina news search results for the keyword.

    The query string is the GB2312-encoded keyword (%BD%F0%D0%E3%CF%CD),
    sorted by time; `page` selects the result page (int or str).
    Returns the raw HTML text of the response.
    """
    page = str(page)
    url = ("http://search.sina.com.cn/?q=%BD%F0%D0%E3%CF%CD"
           "&range=all&c=news&sort=time&page=" + page)
    return requests.get(url).text
def getpage():
    """Return the total number of result pages for the keyword.

    Fetches the first search page, pulls the hit count out of the
    'l_v2' div (text shaped like "... news 1,234 article ..."),
    strips the thousands separators, and divides by 20 results/page.
    """
    html = requests.get("http://search.sina.com.cn/?range=all&c=news"
                        "&q=%BD%F0%D0%E3%CF%CD&from=home").text
    soup = BeautifulSoup(''.join(html))
    a = soup('div', {'class': 'l_v2'})
    # Hit count sits between the words "news" and "article" and
    # contains comma thousands separators, e.g. "1,234".
    digits = str(a).split("news")[1].split("article")[0].split(",")
    count = "".join(digits)  # join instead of a char-by-char += loop
    # int() replaces the deprecated string.atoi(); // keeps the
    # Python 2 integer floor-division behaviour of the original.
    return int(count) // 20
Def getcontents (HTML): # Get the specified news content
Soup = beautifulsoup (''. Join (HTML ))
Rs = Re. Compile ("fgray_time ")
Html = soup. findall ('span ', attrs = {'class': Rs })
Rs = Re. Compile ("Box-result Clearfix ")
Contents = soup. findall ('div ', attrs = {'class': Rs })
For C in HTML:
Length = Len (C. Text. Split (''))
If length = 3:
Source = C. Text. Split ('') [0] # news source
Time = C. Text. Split ('') [1] +'' + C. Text. Split ('') [2] # News posting time
Print Source
Print time
Else:
Time = C. Text # News posting time
Source = ''# news source
Print time
For I in contents:
Title = I. h2.a. Text # News Title
Content = I. p. Text # OVERVIEW
# Print html
If _ name __= = "_ main __":
Count = getpage ()
Print 111
For I in range (count ):
Print getcontents (gethtml (I ))
Print 222
# Example: crawling Sina news results by keyword.