# BeautifulSoup + JSON parsing
# -*- coding: utf-8 -*-
"""Fetch an autohome.com.cn series configuration page and print its JSON data.

The page embeds several JSON objects in a ``<script>`` element.  This script
locates that element, extracts one JSON payload per line and prints selected
fields of each payload.

NOTE: this is Python 2 code (``urllib2``, BeautifulSoup 3, byte-string
``decode``); it is not expected to run unchanged on Python 3.
"""
import codecs
import contextlib
import json
import urllib2

# Character detection, used to detect the actual encoding of the payloads.
import chardet
from BeautifulSoup import BeautifulSoup as bs3

# Encoding assumed when chardet cannot determine one (it reports None).
_DEFAULT_ENCODING = 'utf-8'

# Script lines shorter than this are treated as unrelated variables rather
# than JSON payloads.
_MIN_JSON_LINE_LENGTH = 100


def save_to_file(filename, content):
    """Write *content* to *filename*, truncating any existing file."""
    # The context manager closes the file even if write() raises.
    with open(filename, 'w+') as f:
        f.write(content)


def _load_json(content, label):
    """Detect the encoding of *content*, report it and parse the JSON.

    content -- encoded JSON text (byte string).
    label   -- tag printed next to the detected encoding.

    Returns a ``(parsed_json, encoding)`` tuple; *encoding* is what callers
    use to re-encode text fields before printing them.
    """
    encoding = chardet.detect(content)['encoding'] or _DEFAULT_ENCODING
    print('old_code_name[%s]=%s' % (label, encoding))
    return json.loads(content.decode(encoding)), encoding


def _print_value_groups(groups, items_key, encoding):
    """Print a two-level group listing followed by each item's values.

    groups    -- list of dicts holding 'name' and a list under *items_key*.
    items_key -- key of the nested item list ('paramitems'/'configitems').
    encoding  -- encoding applied to text fields before printing.
    """
    for group in groups:
        print('name=%s' % (group['name'].encode(encoding),))
        for item in group[items_key]:
            print('name=%s' % (item['name'].encode(encoding),))
            for value in item['valueitems']:
                print('specid=%d, value=%s'
                      % (value['specid'], value['value'].encode(encoding)))


def parse_key_link(content):
    """Print the name and link of every entry under result/items."""
    js, encoding = _load_json(content, 'key_link')
    for item in js['result']['items']:
        print('name=%s, link=%s'
              % (item['name'].encode(encoding), item['link'].encode(encoding)))


def parse_config(content):
    """Print the parameter groups found under result/paramtypeitems."""
    js, encoding = _load_json(content, 'config')
    _print_value_groups(js['result']['paramtypeitems'], 'paramitems', encoding)


def parse_option(content):
    """Print the option groups found under result/configtypeitems."""
    js, encoding = _load_json(content, 'option')
    _print_value_groups(js['result']['configtypeitems'], 'configitems', encoding)


def parse_color(content):
    """Print, per spec, every colour entry (id, name, value, picnum)."""
    js, encoding = _load_json(content, 'color')
    for spec in js['result']['specitems']:
        print('specid=%d' % (spec['specid'],))
        for color in spec['coloritems']:
            print('id=%d, name=%s, value=%s, picnum=%d'
                  % (color['id'], color['name'].encode(encoding),
                     color['value'].encode(encoding), color['picnum']))


def parse_innerColor(content):
    """Print, per spec, one line for each value of every interior colour."""
    js, encoding = _load_json(content, 'innerColor')
    for spec in js['result']['specitems']:
        print('specid=%d' % (spec['specid'],))
        for color in spec['coloritems']:
            # Unlike parse_color, each entry carries a list under 'values'.
            for value in color['values']:
                print('id=%d, name=%s, value=%s, picnum=%d'
                      % (color['id'], color['name'].encode(encoding),
                         value.encode(encoding), color['picnum']))


def parse_json_data(content):
    """Dispatch each extracted JSON payload to its parser, by position.

    content -- list of encoded JSON strings, expected in the order
               keylink, config, option, color, innercolor.

    Raises ValueError when the number of payloads does not match the number
    of parsers (an explicit check, so it is not stripped under ``-O``).
    """
    parsers = [
        ('keylink', parse_key_link),
        ('config', parse_config),
        ('option', parse_option),
        ('color', parse_color),
        ('innercolor', parse_innerColor),
    ]
    if len(content) != len(parsers):
        raise ValueError(
            'expected %d JSON payloads (%s), got %d'
            % (len(parsers), ', '.join(name for name, _ in parsers),
               len(content)))
    for (_, parser), payload in zip(parsers, content):
        parser(payload)


def _extract_json_payloads(script_text):
    """Return the JSON object found on each sufficiently long script line.

    A payload starts at the first '{' of the line; trailing whitespace and
    the statement-terminating ';' are removed.
    """
    payloads = []
    for line in script_text.splitlines():
        # Filter unnecessary (short) variables; blank lines fall out here too.
        if len(line) < _MIN_JSON_LINE_LENGTH:
            continue
        start = line.find('{')
        if start == -1:
            continue
        # Strip only an actual trailing ';' instead of blindly dropping the
        # last character, which would damage lines with trailing whitespace
        # or without a semicolon.
        payloads.append(line[start:].rstrip().rstrip(';'))
    return payloads


def parse_content(content):
    """Locate the data script in the downloaded page and parse its JSON.

    content -- raw page bytes (GB2312-encoded, per the original author).

    Raises ValueError when no script text containing 'var levelid' is found.
    """
    soup = bs3(content)
    key_text = 'var levelid'
    elem_lib = soup.find('script', text=lambda x: key_text in x)
    if elem_lib is None:
        raise ValueError('no script containing %r found in page' % (key_text,))

    # str() yields UTF-8 encoded text here (per the original author).
    str_script = str(elem_lib.string)
    # The command line uses cp936 (GBK); text in a different encoding cannot
    # be printed there, hence the re-encoding.
    strGBK = str_script.decode('utf-8').encode('gb2312')
    # Remove the HTML escape character.
    # NOTE(review): the published listing shows replace('', ''), a no-op;
    # '&nbsp;' is presumed to be the literal lost in rendering -- confirm.
    strGBK = strGBK.replace('&nbsp;', '')

    parse_json_data(_extract_json_payloads(strGBK))


def crawler_4_autohome():
    """Download the series configuration page and print its parsed data."""
    autohome_url = 'http://car.autohome.com.cn/config/series/657.html'
    # closing() guarantees the HTTP response is released after reading.
    with contextlib.closing(urllib2.urlopen(url=autohome_url)) as response:
        content = response.read()
    parse_content(content)


if __name__ == '__main__':
    crawler_4_autohome()