標籤:python 網頁 findall key java html解析 函數 utf-8 api
* BeautifulSoup 的.find(), .findAll() 函數原型
findAll(tag, attributes, recursive, text, limit, keywords)
find(tag, attributes, recursive, text, keywords)
* 取得 span.green
# Select every <span> tag whose class attribute is "green".
bsObj.findAll("span", {"class":"green"})
#!/usr/local/bin/python
# -*- coding: UTF-8 -*-
"""Print the text of every <span class="green"> on a sample page."""
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

from bs4 import BeautifulSoup


def getBsObj(url):
    """Download *url* and parse it with BeautifulSoup.

    Returns the parsed document, or None when the page cannot be fetched
    (the error is printed) or when parsing raises AttributeError.
    """
    try:
        # 3-second timeout so an unresponsive server cannot hang the script;
        # the context manager closes the HTTP response once it has been read.
        with urlopen(url, None, 3) as response:
            markup = response.read()
    except (HTTPError, URLError) as e:
        print(e)
        return None
    try:
        return BeautifulSoup(markup, "html.parser")
    except AttributeError:
        return None


bsObj = getBsObj("http://www.pythonscraping.com/pages/warandpeace.html")
# getBsObj returns None on failure; fall back to an empty list so a fetch
# error does not turn into an AttributeError here.
nameList = bsObj.findAll("span", {"class": "green"}) if bsObj is not None else []
for name in nameList:
    print(name.get_text())
* 取得 h1,h2,h3,h4,h5,h6
# Select every heading tag, <h1> through <h6>, in a single call.
bsObj.findAll({"h1", "h2", "h3", "h4", "h5", "h6"})
// javascript 產生引號 包裹每個元素的字串
/**
 * Wrap every comma-separated item of `s` in double quotes.
 *
 * @param {string} s Comma-separated items, e.g. "h1,h2".
 * @returns {string} The same items, each quoted: "\"h1\",\"h2\"".
 */
function quote(s) {
    return "\"" + s.split(",").join("\",\"") + "\"";
}

var s = "h1,h2,h3,h4,h5,h6";
console.log(quote(s));
* 取得 span.green, span.red
# Select every <span> tag whose class attribute is "green" or "red".
bsObj.findAll("span", {"class":{"green", "red"}})
* 取得網頁中包含"the prince"內容的標籤數量
# Collect the text nodes matched by "the prince" and print how many there are.
# NOTE(review): a plain-string `text` filter appears to match whole strings
# only, not substrings — confirm against the BeautifulSoup documentation.
nameList = bsObj.findAll(text="the prince")
print(len(nameList))
* 找到#text id="text"
# Look up the element whose id attribute is "text" and print its text content.
allText = bsObj.find(id="text")
print(allText.get_text())
* 找到div#text
# Look up a <div> tag whose id attribute is "text".
allText = bsObj.find("div", {"id":"text"})
* 找到div#text > span.red:first-child
# Inside div#text, look only at direct children (recursive=False) for a
# <span class="red">, then print its text.
red = bsObj.find("div", {"id": "text"}).find(
    "span", {"class": "red"}, recursive=False
)
print(red.get_text())
python BeautifulSoup html解析