#python
# -*- coding: utf-8 -*-
"""
__title__ = ' '
__author__ = 'WLC'
__mtime__ = '2017/10/15'
"""
import re
import time
import math
import csv
import requests
from bs4 import BeautifulSoup
from collections import deque

# Create a CSV file to save the collected information
path = 'DataCollection/userinfo.csv'
csvFile = open(path, 'a+', newline='', encoding='utf-8')
writer = csv.writer(csvFile)
writer.writerow(('id', 'name', 'following', 'follower', 'article', 'word', 'like'))

# Global set used to store the user id and its follow count once crawled
idContainer = set()
# Double-ended queue used to hold the user links still to be crawled
linkDeque = deque()


class JianShu(object):
    def __init__(self):
        # Custom URL template for a user's "following" list
        self.url = 'http://www.jianshu.com/users/{userId}/following?page={page}'
        # Matching rule for the user id and name
        self.idPattern = re.compile(r'<a class="name" href="/u/(.*?)">(.*?)</a>')
        # Matching rule for the following / follower / article counts (关注 / 粉丝 / 文章)
        self.metaPattern = re.compile(r'<span>关注 (\d+)</span><span>粉丝 (\d+)</span><span>文章 (\d+)</span>')
        # Matching rule for the word and like counts (写了 … 字，获得了 … 个喜欢)
        self.meta = re.compile(r'写了 (\d+) 字，获得了 (\d+) 个喜欢')
        # Disguise the request as a browser
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                                     "(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}

    def createRequest(self, userId, page):
        url = self.url.format(userId=userId, page=page)
        requ = requests.get(url, headers=self.header).text
        return requ

    def pageResponse(self, requ):
        bsObj = BeautifulSoup(requ, 'lxml')
        userContainer = bsObj.find_all('ul', {'class': 'user-list'})[0]
        userContent = userContainer.contents
        userContent = [str(user) for user in userContent if user != '\n']
        # List of followed users on this page
        return userContent

    def parserUserInfo(self, user):
        id, name = re.findall(self.idPattern, user)[0]
        followingNum, followerNum, articleNum = re.findall(self.metaPattern, user)[0]
        # If an IndexError ("index out of range") occurs here, a try/except can fall back to a null value
        wordNum, likeNum = re.findall(self.meta, user)[0]
        content = (id, name, followingNum, followerNum, articleNum, wordNum, likeNum)
        writer.writerow(content)
        return content

    def getUserList(self, userId, following):
        idContainer.add((userId, following))
        num = int(following) / 10
        page = math.ceil(num)
        for pg in range(1, page + 1, 1):
            requ = self.createRequest(userId, pg)
            userList = self.pageResponse(requ)
            for user in userList:
                content = self.parserUserInfo(user)
                linkDeque.append((content[0], content[2]))
            time.sleep(1)
        for deq in linkDeque:
            if deq not in idContainer:
                self.getUserList(deq[0], deq[1])
                print("what")


jianshu = JianShu().getUserList('652fbdd1e7b3', 162)
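The inline note in parserUserInfo points out that re.findall(...)[0] raises an IndexError whenever a pattern finds no match in a user block, and that a try/except can fall back to a null value in that case. A minimal sketch of that fallback, using a hypothetical helper named safe_first that is not part of the original script:

import re

def safe_first(pattern, text, default=('NULL', 'NULL')):
    # Return the first tuple of match groups, or a placeholder when the pattern does not match
    try:
        return re.findall(pattern, text)[0]
    except IndexError:
        return default

# Possible use inside parserUserInfo (an assumed rewrite of the fragile line):
# wordNum, likeNum = safe_first(self.meta, user)

With this guard, a CSV row is still written for users whose profile card lacks the word or like counts, instead of aborting the whole crawl.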
Python Jianshu (简书) user crawler