# -*- coding: utf-8 -*-
"""Crawl one static web page and print the first tag matching a filter.

This is a Python 2 script: it relies on ``urllib2`` and on BeautifulSoup 3
(``from BeautifulSoup import BeautifulSoup``).
"""
# --------------------------------- import -----------------------------------
import contextlib
import urllib2

from BeautifulSoup import BeautifulSoup
# -----------------------------------------------------------------------------

# Address of the page to crawl; change this to crawl a different page.
USER_MAIN_URL = "http://tieba.baidu.com/home/main?id=38b94c4ed8add8bcccabd7d31b22&fr=userbar"

# Charset used to decode the page; change this to match the target page.
PAGE_ENCODING = "GBK"

# Attribute filter selecting the tag to extract; change this to grab other
# content.
TAG_ATTRS = {"target": "_blank"}


def main():
    """Fetch ``USER_MAIN_URL`` and print the first tag matching ``TAG_ATTRS``.

    Prints, in order: the complete HTML that was downloaded, the first tag
    whose attributes match ``TAG_ATTRS`` (``None`` when nothing matches) and,
    only when a tag was found, that tag's ``.string`` value.

    Errors raised by ``urllib2`` while opening or reading the URL are not
    caught here.
    """
    # Fetch: download the raw HTML of the page.
    req = urllib2.Request(USER_MAIN_URL)
    # closing() releases the connection even if read() raises; the original
    # never closed the response.
    with contextlib.closing(urllib2.urlopen(req)) as resp:
        resp_html = resp.read()
    # Single-argument print(...) parses under both Python 2 and Python 3.
    print("resphtml= %s" % (resp_html,))  # dumps all of the fetched HTML

    # Extract: parse the HTML and look up the wanted tag.
    soup = BeautifulSoup(resp_html, fromEncoding=PAGE_ENCODING)
    found_tag = soup.find(attrs=TAG_ATTRS)
    # The original passed the tag as a second print item, so the "%s"
    # placeholder was printed literally; interpolate it instead.
    print("foundclassh1user=%s" % (found_tag,))
    if found_tag is not None:
        h1_user_str = found_tag.string
        print("h1userstr= %s" % (h1_user_str,))


###############################################################################
if __name__ == "__main__":
    main()
Grabbing tags, type 1 (all tags with a given name)
#eg: siteUrls = soup.findAll('a')
Grabbing tags, type 2 (the first tag with a given attribute value)
#eg: foundClassH1user = soup.find(attrs={"target": "_blank"});
Grabbing tags, type 2 (the first tag with a given class)
#eg: foundClassH1user = soup.find(attrs={"class": "h1user"});
Python web crawler: crawling a static web page (code)