from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import random

# Base site; relative /item/... paths from `his` are appended to it.
base_url = "https://baike.baidu.com"

# Seed URL path (percent-encoded Chinese for "web crawler" / 网络爬虫).
his = ["/item/%e7%bd%91%e7%bb%9c%e7%88%ac%e8%99%ab/5162711"]

# Random-walk 20 Baidu Baike pages, following a random /item/ link each step.
for i in range(20):
    # Build the full URL from the most recently visited path.
    url = base_url + his[-1]

    # Fetch and decode the page content.
    html = urlopen(url).read().decode('utf-8')

    # Parse the page with BeautifulSoup using the lxml parser.
    soup = BeautifulSoup(html, features='lxml')

    # Print step number, page title (h1), and the page URL.
    print(i, soup.find('h1').get_text(), ' url: ', base_url + his[-1])

    # Collect candidate links: <a target="_blank"> whose href matches an
    # /item/ path made only of percent-encoded bytes (i.e. other entries).
    sub_urls = soup.find_all(
        "a",
        {"target": "_blank", "href": re.compile("/item/(%.{2})+$")},
    )

    if len(sub_urls) != 0:
        # Pick one candidate link at random and visit it next.
        his.append(random.sample(sub_urls, 1)[0]['href'])
    else:
        # Dead end: no valid sub-links on this page — back up one step
        # and re-draw from the previous page on the next iteration.
        his.pop()
The simplest Python crawler tutorial: crawling Baidu Encyclopedia (Baidu Baike) as a worked example.