Parsing with BeautifulSoup + json (bs + json parsing, "bsjson")
# -*- coding=utf-8 -*-
"""Crawl an autohome.com.cn car-series config page and print the JSON
blobs embedded in its <script> block: key links, parameter config,
optional equipment, exterior colors and interior colors.

Python 2 only: relies on urllib2, BeautifulSoup 3 (bs3) and the
Python-2 str/unicode encode/decode semantics.
"""
import urllib2
from BeautifulSoup import BeautifulSoup as bs3
import json
import codecs
# chardet detects the real encoding of the fetched bytes
import chardet


def save_to_file(filename, content):
    """Write *content* to *filename*, truncating any existing file.

    'with' guarantees the handle is closed even if write() raises; the
    original assert(f) was dead code - open() raises on failure, it
    never returns a falsy handle.
    """
    with open(filename, 'w+') as f:
        f.write(content)


def _detect_and_load(content, tag):
    """Detect the byte encoding of *content*, log it under *tag*, and
    return (encoding, parsed JSON object)."""
    old_code_name = chardet.detect(content)['encoding']
    print('old_code_name[%s]=%s' % (tag, old_code_name))
    return old_code_name, json.loads(content.decode(old_code_name))


def parse_key_link(content):
    """Print name/link pairs from the keyLink JSON blob."""
    enc, js = _detect_and_load(content, 'key_link')
    for item in js['result']['items']:
        print('name=%s, link=%s' % (item['name'].encode(enc),
                                    item['link'].encode(enc)))


def _parse_two_level(content, tag, type_key, item_key):
    """Shared walker for the config/option blobs - both have the shape
    result -> <type_key> list -> <item_key> list -> 'valueitems'."""
    enc, js = _detect_and_load(content, tag)
    for type_item in js['result'][type_key]:
        print('name=%s' % (type_item['name'].encode(enc),))
        for item in type_item[item_key]:
            print(' name=%s' % (item['name'].encode(enc),))
            for value in item['valueitems']:
                print(' specid=%d,value=%s' % (value['specid'],
                                               value['value'].encode(enc)))


def parse_config(content):
    """Print the parameter table (result.paramtypeitems)."""
    _parse_two_level(content, 'config', 'paramtypeitems', 'paramitems')


def parse_option(content):
    """Print the optional-equipment table (result.configtypeitems)."""
    _parse_two_level(content, 'option', 'configtypeitems', 'configitems')


def parse_color(content):
    """Print exterior colors per spec id (result.specitems)."""
    enc, js = _detect_and_load(content, 'color')
    for spec in js['result']['specitems']:
        print('specid=%d' % (spec['specid'],))
        for color in spec['coloritems']:
            print(' id=%d,name=%s,value=%s,picnum=%d' %
                  (color['id'], color['name'].encode(enc),
                   color['value'].encode(enc), color['picnum']))


def parse_innerColor(content):
    """Print interior colors per spec id; unlike parse_color, each
    color item carries a list under 'values' instead of one 'value'."""
    enc, js = _detect_and_load(content, 'innerColor')
    for spec in js['result']['specitems']:
        print('specid=%d' % (spec['specid'],))
        for color in spec['coloritems']:
            for value in color['values']:
                print(' id=%d,name=%s,value=%s,picnum=%d' %
                      (color['id'], color['name'].encode(enc),
                       value.encode(enc), color['picnum']))


def parse_json_data(content):
    """Dispatch each extracted JSON blob to its parser, in page order:
    keyLink, config, option, color, innerColor.

    Raises ValueError when the page did not yield exactly one blob per
    parser (explicit check - assert is stripped under python -O).
    """
    parse_list = [parse_key_link, parse_config, parse_option,
                  parse_color, parse_innerColor]
    if len(content) != len(parse_list):
        raise ValueError('expected %d JSON blobs, got %d'
                         % (len(parse_list), len(content)))
    for parse, data in zip(parse_list, content):
        parse(data)


def parse_content(content):
    """Extract the embedded JSON blobs from the page HTML (GB2312
    bytes) and hand them to parse_json_data."""
    soup = bs3(content)
    key_text = 'var levelId'
    # The data lives in the <script> block declaring 'var levelId'.
    # (lambda x: ... replaces the Python-2-only lambda(x): form.)
    elem_lib = soup.find('script', text=lambda x: key_text in x)
    # elem_lib.string is utf-8 text
    str_script = str(elem_lib.string)
    # The Windows console is cp936/GBK; re-encode so prints display.
    strGBK = str_script.decode('utf-8').encode('gb2312')
    # Strip HTML non-breaking spaces.
    # NOTE(review): the pasted source shows a bare space in this
    # literal; the original may have been '&nbsp;' - confirm.
    strGBK = strGBK.replace(' ', '')
    list_data = []
    for line in strGBK.splitlines():
        if line.isspace():
            continue
        # skip short var declarations that carry no JSON payload
        if len(line) < 100:
            continue
        idx = line.find('{')
        if idx == -1:
            continue
        # slice from the first '{' and drop the trailing ';'
        list_data.append(line[idx:-1])
    parse_json_data(list_data)


def crawler_4_autohome():
    """Fetch the series-657 config page and parse it.

    NOTE(review): the original comment called the payload utf-8 while
    parse_content treats it as GB2312 - confirm the server's charset.
    """
    autohome_url = 'http://car.autohome.com.cn/config/series/657.html'
    content = urllib2.urlopen(url=autohome_url).read()
    parse_content(content)


if __name__ == '__main__':
    crawler_4_autohome()
Source code: http://download.csdn.net/detail/davidsu33/8447189