bs+json解析,bsjson

來源:互聯網
上載者:User

bs+json解析,bsjson

# -*- coding: utf-8 -*-
"""Fetch an Autohome series configuration page and print its embedded JSON.

The page embeds several JSON blobs as JavaScript variable assignments inside
one <script> element.  This script extracts those blobs and prints selected
fields from each of them.

Targets Python 2 (urllib2, BeautifulSoup 3, byte-oriented ``str``).
"""
import codecs
import json
import urllib2
from contextlib import closing

import chardet  # character detection, used to find the real encoding
from BeautifulSoup import BeautifulSoup as bs3

# Lines of the script shorter than this are treated as variables we do not
# need and are skipped (threshold chosen by the original author).
_MIN_JSON_LINE_LENGTH = 100


def save_to_file(filename, content):
    """Write ``content`` to ``filename``, truncating any existing file."""
    with open(filename, 'w+') as f:
        f.write(content)


def _load_json(content, label):
    """Detect the encoding of ``content``, report it, and parse it as JSON.

    Args:
        content: encoded JSON text (byte string).
        label: tag printed inside ``old_code_name[...]`` to identify the blob.

    Returns:
        A ``(parsed_json, encoding_name)`` tuple.

    Raises:
        ValueError: if chardet cannot determine an encoding, or the decoded
            text is not valid JSON (raised by ``json.loads``).
    """
    old_code_name = chardet.detect(content)['encoding']
    print('old_code_name[%s]=%s' % (label, old_code_name))
    if old_code_name is None:
        # chardet reports None when it cannot guess; decoding with None
        # would otherwise fail with an unhelpful TypeError.
        raise ValueError('cannot detect the encoding of the %s data' % (label,))
    return json.loads(content.decode(old_code_name)), old_code_name


def _print_value_tree(js, encoding, groups_key, items_key):
    """Print the group -> item -> value hierarchy stored under ``js['result']``.

    Args:
        js: parsed JSON document.
        encoding: encoding used to turn text back into printable bytes.
        groups_key: key under ``result`` holding the list of groups.
        items_key: key in each group holding the list of items; every item
            carries its values under ``valueitems``.
    """
    for group in js['result'][groups_key]:
        print('name=%s' % (group['name'].encode(encoding),))
        for item in group[items_key]:
            print('  name=%s' % (item['name'].encode(encoding),))
            for value in item['valueitems']:
                print('    specid=%d,value=%s'
                      % (value['specid'], value['value'].encode(encoding)))


def parse_key_link(content):
    """Print the name and link of every entry in ``result.items``."""
    js, encoding = _load_json(content, 'key_link')
    for item in js['result']['items']:
        print('name=%s, link=%s'
              % (item['name'].encode(encoding), item['link'].encode(encoding)))


def parse_config(content):
    """Print the ``paramtypeitems`` / ``paramitems`` / ``valueitems`` tree."""
    js, encoding = _load_json(content, 'config')
    _print_value_tree(js, encoding, 'paramtypeitems', 'paramitems')


def parse_option(content):
    """Print the ``configtypeitems`` / ``configitems`` / ``valueitems`` tree."""
    js, encoding = _load_json(content, 'option')
    _print_value_tree(js, encoding, 'configtypeitems', 'configitems')


def parse_color(content):
    """Print each spec id followed by its colour entries."""
    js, encoding = _load_json(content, 'color')
    for spec in js['result']['specitems']:
        print('specid=%d' % (spec['specid'],))
        for color in spec['coloritems']:
            print('  id=%d,name=%s,value=%s,picnum=%d'
                  % (color['id'],
                     color['name'].encode(encoding),
                     color['value'].encode(encoding),
                     color['picnum']))


def parse_innerColor(content):
    """Print each spec id followed by one line per inner-colour value.

    Unlike ``parse_color``, every colour entry holds a list under ``values``;
    one line is printed for each element of that list.
    """
    js, encoding = _load_json(content, 'innerColor')
    for spec in js['result']['specitems']:
        print('specid=%d' % (spec['specid'],))
        for color in spec['coloritems']:
            for value in color['values']:
                print('  id=%d,name=%s,value=%s,picnum=%d'
                      % (color['id'],
                         color['name'].encode(encoding),
                         value.encode(encoding),
                         color['picnum']))


def parse_json_data(content):
    """Dispatch each JSON blob to its parser, matched by position.

    Args:
        content: sequence of encoded JSON strings in the order keyLink,
            config, option, color, innerColor.

    Raises:
        ValueError: if the number of blobs differs from the number of parsers.
    """
    parse_list = (parse_key_link, parse_config, parse_option,
                  parse_color, parse_innerColor)
    # Explicit check instead of assert: asserts are stripped under -O.
    if len(content) != len(parse_list):
        raise ValueError('expected %d JSON blobs, got %d'
                         % (len(parse_list), len(content)))
    for parser, blob in zip(parse_list, content):
        parser(blob)


def parse_content(content):
    """Extract the JSON assignments from the page's script and parse them.

    Args:
        content: raw HTML of the configuration page.  The original author
            notes it is GB2312-encoded -- NOTE(review): the fetch site says
            UTF-8; confirm which is right.

    Raises:
        ValueError: if no script containing ``var levelId`` is found, or the
            number of extracted blobs is not what ``parse_json_data`` expects.
    """
    soup = bs3(content)
    key_text = 'var levelId'
    elem_lib = soup.find('script', text=lambda x: key_text in x)
    if elem_lib is None:
        raise ValueError('no script containing %r found in the page'
                         % (key_text,))

    # Per the original author: str() yields UTF-8 here.
    str_script = str(elem_lib.string)
    # Per the original author: the console uses cp936 (GBK), so text in any
    # other encoding cannot be printed; re-encode before further processing.
    strGBK = str_script.decode('utf-8').encode('gb2312')
    # Remove HTML escape sequences.
    # NOTE(review): the published listing shows a plain space in this
    # literal, which appears to be a rendered '&nbsp;' entity -- confirm.
    strGBK = strGBK.replace('&nbsp;', '')

    list_data = []
    for line in strGBK.splitlines():
        # Filter out variables we do not need (this also drops blank lines).
        if len(line) < _MIN_JSON_LINE_LENGTH:
            continue
        # Locate the start of the JSON data.
        idx = line.find('{')
        if idx == -1:
            continue
        # Drop the trailing ';' (and any trailing whitespace) without
        # blindly chopping the last character.
        list_data.append(line[idx:].rstrip().rstrip(';'))
    parse_json_data(list_data)


def crawler_4_autohome():
    """Download the Autohome series 657 configuration page and parse it."""
    autohome_url = 'http://car.autohome.com.cn/config/series/657.html'
    # Original author's note: the response is UTF-8.
    # closing() is used because urllib2 responses are not context managers.
    with closing(urllib2.urlopen(url=autohome_url)) as response:
        content = response.read()
    parse_content(content)


if __name__ == '__main__':
    crawler_4_autohome()



源碼:

http://download.csdn.net/detail/davidsu33/8447189

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.