To figure out who the biggest spam poster is, we first need to collect data about the threads and their posters.
As a test, here we crawl the first 100 pages of the Baidu Tieba "liyi" forum:
#coding: Utf-8 import urllib2 from BS4 import beautifulsoup import CSV import re import sys reload (SYS) sys.setdefaultencod ing (' utf-8 ') #wb写 A + append mode for K in range (0,100): req = urllib2. Request (' http://tieba.baidu.com/f?kw= liyi &ie=utf-8&pn= ' +str (k*50)) CSVFile = File (' tiezi.csv ', ' ab+ ') writer = Csv.writer (CSVFile) #writer. Writerow ([' 1 ', ' 2 ', ' 3 ', ' 4 ']) response = Urllib2.urlopen (req) the_page = Respon Se.read () soup = BeautifulSoup (the_page, "lxml") list1=[] list2=[] list3=[] list4=[] list5=[] F or tag in Soup.find_all (name= "A", attrs={"class": Re.compile ("J_th_tit")}): List1.append ("http://tieba.baidu.com" +t ag[' href '] list2.append (tag.string) for tag in Soup.find_all (name= "span", attrs={"class": Re.compile ("Threadl Ist_rep_num.* ")}): List3.append (tag.string) for tag in Soup.find_all (name=" span ", attrs={" class ": Re.compile (" tb_icon_author$ ")}): List4.append (tag[' title ')] for tag in Soup.find_all (name= "span", attrs={"class": Re.compile ("tb_icon_author_rely")}): List5.append (tag[' title ')] data=[] For I in range (0,len soup.find_all (name= "A", attrs={"class": Re.compile ("J_th_tit"))): Data.append ((list1[i],list
2[i],list3[i],list4[i]) writer.writerows (data) csvfile.close () print "subclause" +str (k) + "page Complete"
The crawl takes about two minutes and produces a roughly 5000-row CSV file, which you can open and inspect in Excel.
Notes on the code — setting the default encoding:
Import sys
Reload (SYS)
sys.setdefaultencoding (' Utf-8 ')
Constructing the data rows and writing them to the CSV:
Data=[]
for I in range (0,len soup.find_all (name= "A", attrs={"class": Re.compile ("J_th_tit"))):
Data.append ((list1[i],list2[i],list3[i],list4[i))
writer.writerows (data)
Using BeautifulSoup (bs4):
For tag in Soup.find_all (name= "span", attrs={"class": Re.compile ("tb_icon_author$")}):
#查找span标签 and class regular expression matching