#! /Usr/ENV Python #-*-coding: UTF-8-*-import urllib import urllib2 import random import requestsimport OS, sys import mysqldbfrom sgmllib import sgmlparser from beautifulsoup import beautifulsoupimport renum = 0def main (): Try: conn = mysqldb. connect (host = 'localhost', user = 'root', passwd = '000000', DB = 'googlemarket', charset = "utf8") Conn. query ("set names utf8") failed t exception, E: Print esys. exit () cursor = Conn. curs Or () Category = ['personalization', 'transport', 'Sports ', 'health _ and_fitness', 'app _ wallpaper ', 'comics', 'Medical ', 'business', 'books _ and_reference ', 'weate', 'entertainment', 'Media _ and_video ', 'app _ widgets', 'tools', 'Photography ', 'producification', 'education', 'news _ and_magazines ', 'travel _ and_local', 'lifestyle', 'social', 'Finance ', 'shopping ', 'libraries _ and_demo ', 'communication', 'music _ and_audio ', 'game'] F Or k in range (): t = "https://play.google.com/store/apps/category/" + category [k] html = requests. get (t) preresult = html. contentsoup = beautifulsoup (preresult) Result = soup. prettern ("UTF-8") pattern = Re. compile ('<a class = "title" href = "(. + ?) "Title ') dataresult = Re. findall (pattern, result) dataresult = List (SET (dataresult) for I in dataresult: url = "https://play.google.com" + iPrint URL # url = "https://play.google.com/store/apps/details? Id = com. androidesk & HL = zh_cnhttps % 3A % 2f % 2fplay.google.com % 2 fstore % 2 fapps % 2 fdetails % 3fid % 3dcom. androidesk "html = requests. get (URL) preresult = html. contentsoup = beautifulsoup (preresult) Result = soup. prettern ("UTF-8") # Name: pattern = Re. compile ('<Div class = "document-title" itemprop = "name"> [\ s] *? <Div> ([\ s] *?) </Div> ') data0 = Re. findall (pattern, result) For items in data0: print items # manufacturer pattern = Re. compile ('itemprop = "name"> ([\ s] *?) </A> ') data1 = Re. findall (pattern, result) Make = data1 [0]. split ("\ n") print make [8] # version pattern = Re. 
compile ('itemprop = "softwareversion"> ([\ s] *?) </Div> ') data2 = Re. findall (pattern, result) print data2 [0] # Update time pattern = Re. compile ('itemprop = "datepublished"> ([\ s] *?) </Div> ') data3 = Re. findall (pattern, result) print data3 [0] # file size pattern = Re. compile ('itemprop = "filesize"> ([\ s] *?) </Div> ') data4 = Re. findall (pattern, result) print data4 [0] # supports firmware pattern = Re. compile ('itemprop = "operatingsystems"> ([\ s] *?) </Div> ') data5 = Re. findall (pattern, result) print data5 [0] # description pattern = Re. compile ('itemprop = "Description"> [\ s] *? <Div> ([\ s] *?) </Div> ') data6 = Re. findall (pattern, result) For items in data6: Print re. sub ('[<br/> <p> </P>]', '', items) SQL =" insert into address (name, version, developer, pubtime, filesize, support, introduction) values (% s, % s) "for items in data6: If (data5 ): # values = (data0 [0], data1 [0], data2 [0], data3 [0], data4 [0], data5 [0], re. sub ('<br/>', '', items) # else: # values = (data0 [0], data1 [0], data2 [0], data3 [0], data4 [0], 'nul L', re. sub ('<br/>', '', items) # print values # print SQL % values#cursor.exe cute (SQL, values) # Conn. commit () pattern = Re. compile (' ') Data = Re. findall (pattern, result) Global numfor J in data: Print jprint type (j) headers = {'user-agent': 'mozilla/5.0 (windows; U; Windows NT 6.1; RV: 2.2) Gecko/20110201 '} temp = requests. get (J [1:-2], headers = headers) F = file ("googlemarket/" + STR (Num), "W + ") num = num + 1 print numf. write (temp. content) If _ name __= = "_ main _": Main ()
<type 'str'>
Traceback (most recent call last):
  File "crawler0729.py", line 103, in <module>
    main()
  File "crawler0729.py", line 91, in main
    temp = requests.get(j[1:-2], headers=headers)
  File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 55, in get
    return request('get', url, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 44, in request
    return session.request(method=method, url=url, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 335, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 438, in send
    r = adapter.send(request, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/adapters.py", line 327, in send
    raise ConnectionError(e)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='lh3.ggpht.com', port=443): Max retries exceeded with url: /timeout=w30 (caused by <class 'socket.error'>: [Errno 101] Network is unreachable)