標籤:decode 指定 com pat port ubi 企業 txt content
#-*- coding:utf-8 -*-
"""Read pharmaceutical trading enterprise records from the Beijing FDA site.

Original author note: 20161125 zhangshaohua

The script reads the total record count from the listing page, walks the
listing pages, follows every ``artileId`` link to its detail page and appends
``name,address,licence`` lines to a text file.
"""
import os
import re
import sys
import urllib.parse
import urllib.request

CHARSET = 'UTF-8'
PAGE_SIZE = 20  # the listing shows 20 records per page
LIST_URL = 'http://www.bjda.gov.cn/eportal/ui?pageId=331148'
DETAIL_URL = 'http://www.bjda.gov.cn/eportal/ui?pageId=331631&artileId='
OUTPUT_FILE = 'bjda.txt'

# NOTE(review): the original script hard-codes 51 as the first page to read,
# presumably a resume point after an earlier interrupted run. Use 1 for a
# full crawl. Page numbers are assumed to be 1-based -- TODO confirm.
START_PAGE = 51

# NOTE(review): the Chinese labels below are reproduced exactly as found.
# They look like they were converted to Traditional Chinese by the page this
# script was copied from; verify them against the live HTML before relying
# on the results.
TOTAL_PATTERN = r'總記錄數:(\d{1,5}),'
ARTICLE_ID_PATTERN = 'artileId=(.*)">查看'
NAME_PATTERN = re.compile(r'企業名稱:</th>\r\n.*?<td>(.*?)</td>')
# \s{0,3} tolerates a line break between the address text and </td>.
ADDRESS_PATTERN = re.compile(r'登入位址:</th>\r\n.*?<td>(.*?)\s{0,3}</td>')
LICENSE_PATTERN = re.compile(r'許可證號:</th>\r\n.*?<td>(.*?)</td>')


def fetch_text(url, charSet):
    """Download *url* and return its body decoded with *charSet*."""
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(url) as page:
        return page.read().decode(charSet)


def getContent(url, pat, charSet):
    """Return every match of the regex *pat* in the page at *url*.

    Args:
        url: address of the page to download.
        pat: regular expression (string) applied to the decoded page.
        charSet: encoding used to decode the downloaded bytes.

    Returns:
        The list produced by ``re.findall``; empty when nothing matches.
    """
    return re.findall(re.compile(pat), fetch_text(url, charSet))


def page_count(total_records, per_page=PAGE_SIZE):
    """Return how many pages are needed to show *total_records* records."""
    # Ceiling division: a partially filled last page still counts as a page
    # (round() would drop it, e.g. 1010 records -> 50 instead of 51).
    return (total_records + per_page - 1) // per_page


def main(start_page=START_PAGE, out_path=OUTPUT_FILE):
    """Crawl the listing from *start_page* on and append records to *out_path*.

    Raises:
        RuntimeError: if the total record count cannot be found on the
            first listing page.
    """
    totals = getContent(LIST_URL, TOTAL_PATTERN, CHARSET)
    if not totals:
        raise RuntimeError('total record count not found on ' + LIST_URL)
    last_page = page_count(int(totals[0]))

    # One handle for the whole run; "with" guarantees it is closed on errors.
    with open(out_path, 'a', encoding='utf-8') as out:
        # +1 so that the last page is included.
        for page_no in range(start_page, last_page + 1):
            list_url = LIST_URL + '&currentPage=' + str(page_no)
            for article_id in getContent(list_url, ARTICLE_ID_PATTERN, CHARSET):
                # Download the detail page once and extract all three fields.
                html = fetch_text(DETAIL_URL + article_id, CHARSET)
                qymc = NAME_PATTERN.findall(html)      # enterprise name
                zcdz = ADDRESS_PATTERN.findall(html)   # address
                xkzh = LICENSE_PATTERN.findall(html)   # licence number
                print(qymc, zcdz, xkzh)
                if not (qymc and zcdz and xkzh):
                    # Report and skip instead of dying with IndexError.
                    print('skipped artileId=' + article_id
                          + ': missing field', file=sys.stderr)
                    continue
                fields = (qymc[0].strip(), zcdz[0].strip(), xkzh[0].strip())
                out.write(','.join(fields) + '\n')
                # Keep finished records on disk if a later request fails.
                out.flush()

    print('藥品零售企業讀取完成!')


if __name__ == '__main__':
    main()
經歷了讀取HDA的練習,此次讀取BJ的資料開始比較順暢。在讀取第996條資料時出錯,再次出現換行造成的問題;
多次試錯後用‘\s{0,3}’成功解決.
Regex要繼續學習,才能不斷進步,遇到“坑”時才能順利通過!
python3讀取BJDA藥品經營企業資料