Fetching the latest administrative division codes of China from stats.gov.cn
Without further ado, here is the code:
import datetime
import io
import re

try:  # Python 2
    import urlparse
except ImportError:  # Python 3: same API lives in urllib.parse
    import urllib.parse as urlparse

import requests
import lxml
from lxml import etree

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Article pages are named like "t20140116_501070.html"; the eight digits
# after the leading "t" are used to label the output file.
_ARTICLE_DATE_RE = re.compile(r'^t(\d{8})_')


def _fetch_html_tree(url, referer=None):
    """Download *url* and return it parsed as an lxml HTML element tree."""
    headers = {'Referer': referer} if referer else None
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # response.content is bytes, so it must be wrapped in a bytes buffer;
    # lxml's HTML parser works out the character encoding on its own.
    return etree.parse(io.BytesIO(response.content), etree.HTMLParser())


def _parent_code(code):
    """Return the code of the division that contains *code*.

    Codes ending in '0000' are top-level (province-level) and have no
    parent, which is represented by an empty string.  Codes ending in '00'
    belong to the '<first two chars>0000' division; every other code belongs
    to the '<first four chars>00' division.
    """
    if code.endswith('0000'):
        return ''
    if code.endswith('00'):
        return code[:2] + '0000'
    return code[:4] + '00'


def _snapshot_label(page_name):
    """Return the label used in the output file name for *page_name*.

    The label is the 8-digit date embedded in the article page name, or
    today's date (YYYY-MM-DD) when the name does not follow that pattern.
    """
    match = _ARTICLE_DATE_RE.match(page_name)
    if match:
        return match.group(1)
    return datetime.datetime.now().strftime('%Y-%m-%d')


def get_latest_url(index_url):
    """Return the absolute URL of the first article linked from the index.

    Args:
        index_url: URL of the index page listing the published code tables.

    Returns:
        The first link found inside the single
        ``<ul class="center_list_contlist">`` element, resolved against
        *index_url*; ``None`` when that list is missing, ambiguous or
        contains no links.
    """
    tree = _fetch_html_tree(index_url)
    containers = tree.xpath('//ul[@class="center_list_contlist"]')
    if len(containers) != 1:
        return None
    hrefs = containers[0].xpath('li/a/@href')
    if not hrefs:
        return None
    return urlparse.urljoin(index_url, hrefs[0])


def get_xingzhengquhua_text(latest_url, referer=None):
    """Download an article page and return its code table as CSV bytes.

    Args:
        latest_url: URL of the article page holding the code table.
        referer: optional value sent as the ``Referer`` request header.

    Returns:
        UTF-8 encoded bytes with one ``parent,code,name`` record per line,
        or ``None`` when the expected ``<div class="xilan_con">`` container
        (or the nested block inside it) cannot be located.
    """
    tree = _fetch_html_tree(latest_url, referer)
    containers = tree.xpath('//div[@class="xilan_con"]')
    print(containers)
    if len(containers) != 1:
        return None
    inner = containers[0].xpath('div/div')
    if not inner:
        return None

    rows = []
    for paragraph in inner[0].xpath('.//p'):
        codes = paragraph.xpath('span[1]/text()')
        names = paragraph.xpath('span[2]/text()')
        # Paragraphs without both a code span and a name span are not
        # table rows (headings, notes, blank lines) and are skipped.
        if not codes or not names:
            continue
        code = codes[0]
        # Names are padded with ideographic spaces (U+3000) on the page.
        name = names[0].strip(u'\u3000')
        rows.append((_parent_code(code), code, name))

    text = u'\n'.join(u','.join(row) for row in rows)
    encoded = text.encode('utf-8')
    # Echo the table to stdout. On Python 2 `str` is bytes, so the encoded
    # form is printed; on Python 3 the text itself is printed.
    print(encoded if str is bytes else text)
    return encoded


def main():
    """Fetch the newest code table and save it in the working directory."""
    index_url = 'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/'
    latest_url = get_latest_url(index_url)
    print(latest_url)
    if not latest_url:
        print('Failed get latest data url')
        return

    text = get_xingzhengquhua_text(latest_url)
    page_name = latest_url.strip().split('/')[-1]
    print(page_name)
    label = _snapshot_label(page_name)
    if not text:
        print('Failed get xingzhengquehua data!')
        return

    # The payload is already UTF-8 encoded, hence binary mode.
    with open('latest-xingzhengquhua-%s.txt' % label, 'wb') as out:
        out.write(text)


if __name__ == '__main__':
    main()