模擬登入豆瓣
第一次登入需要驗證碼,之後的登入可以省去 `login('username', 'password')` 這一步,因為使用 session 儲存了必要的登入資訊,代碼如下:
import requeststry: import cookielibexcept: import http.cookiejar as cookielibimport reimport timeimport os.pathimport jsonfrom bs4 import BeautifulSouptry: from PIL import Imageexcept: passfrom mywordCloud import save_jieba_resultfrom mywordCloud import draw_wordcloudimport threadingimport codecs# 構造 Request headersagent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'headers = { "Host": "www.douban.com", "Referer": "https://www.douban.com/", 'User-Agent': agent,}#使用cookie登入資訊session=requests.session()session.cookies=cookielib.LWPCookieJar(filename='cookies')try: session.cookies.load(ignore_discard=True) print('成功載入cookie')except: print("cookie 未能載入")# 擷取驗證碼def get_captcha(url): #擷取驗證碼 print('擷取驗證碼',url) captcha_url = url r = session.get(captcha_url, headers=headers) print('test') with open('captcha.jpg', 'wb') as f: f.write(r.content) f.close() # 用pillow 的 Image 顯示驗證碼 # 如果沒有安裝 pillow 到原始碼所在的目錄去找到驗證碼然後手動輸入 try: im = Image.open('captcha.jpg') im.show() im.close() except: print(u'請到 %s 目錄找到captcha.jpg 手動輸入' % os.path.abspath('captcha.jpg')) captcha = input("please input the captcha\n>") return captchadef isLogin(): #登入個人首頁,查看是否登入成功 url='https://www.douban.com/people/151607908/' login_code=session.get(url,headers=headers,allow_redirects=False).status_code if login_code==200: return True else: return Falsedef login(acount,secret): douban="https://www.douban.com/" htmlcha=session.get(douban,headers=headers).text patterncha=r'id="captcha_image" src="(.*?)" alt="captcha"' httpcha=re.findall(patterncha,htmlcha) pattern2=r'type="hidden" name="captcha-id" value="(.*?)"' hidden_value=re.findall(pattern2,htmlcha) print(hidden_value) post_data = { "source": "index_nav", 'form_email': acount, 'form_password': secret } if len(httpcha)>0: print('驗證碼串連',httpcha) capcha=get_captcha(httpcha[0]) post_data['captcha-solution']=capcha post_data['captcha-id']=hidden_value[0] print (post_data) 
post_url='https://www.douban.com/accounts/login' login_page=session.post(post_url,data=post_data,headers=headers) #儲存cookies session.cookies.save() if isLogin(): print('登入成功') else: print('登入失敗')def get_movie_sort(): time.sleep(1) movie_url='https://movie.douban.com/chart' html=session.get(movie_url,headers=headers) soup=BeautifulSoup(html.text,'html.parser') result=soup.find_all('a',{'class':'nbg'}) print(result)#爬取短評論def get_comment(filename): #filename為爬取得內容儲存的檔案 begin=1 comment_url = 'https://movie.douban.com/subject/11600078/comments' next_url='?start=20&limit=20&sort=new_score&status=P' headers2 = { "Host": "movie.douban.com", "Referer": "https://www.douban.com/", 'User-Agent': agent, 'Connection': 'keep-alive', } f=open(filename,'w+',encoding='utf-8') while(True): time.sleep(6) html=session.get(url=comment_url+next_url,headers=headers2) soup=BeautifulSoup(html.text,'html.parser') #爬取當前頁面的所有評論 result=soup.find_all('div',{'class':'comment'}) #爬取得所有的短評 pattern4 = r'<p class=""> (.*?)' \ r'</p>' for item in result: s=str(item) count2=s.find('<p class="">') count3=s.find('</p>') s2=s[count2+12:count3] #抽取字串中的評論 if 'class' not in s2: f.write(s2) #擷取下一頁的連結 next_url=soup.find_all('div',{'id':'paginator'}) pattern3=r'href="(.*?)">後頁' if(len(next_url)==0): break next_url=re.findall(pattern3,str(next_url[0])) #得到後頁的連結 if(len(next_url)==0): #如果沒有後頁的連結跳出迴圈 break next_url=next_url[0] print('%d爬取下一頁評論...'%begin) begin=begin+1 #如果爬取了5次則多休息2秒 if(begin%6==0): time.sleep(40) print('休息...') print(next_url) f.close()#多線程爬蟲,爬取豆瓣影評def thread_get_comment(filename): next_url = '?start=19&limit=20&sort=new_score&status=P' headers2 = { "Host": "movie.douban.com", "Referer": "https://www.douban.com/", 'User-Agent': agent, 'Connection': 'keep-alive', } f = open(filename, 'w+', encoding='utf-8') comment_url = 'https://movie.douban.com/subject/26363254/comments' crawl_queue=[comment_url+next_url] 
crawl_queue.append('https://movie.douban.com/subject/26363254/comments?start=144&limit=20&sort=new_score&status=P') seen=set(crawl_queue) def process_queue(): begin = 1 while True: try: url=crawl_queue.pop() except IndexError: break else: time.sleep(5) html = session.get(url=url,headers=headers2) soup = BeautifulSoup(html.text, 'html.parser') # 爬取當前頁面的所有評論 result = soup.find_all('div', {'class': 'comment'}) # 爬取得所有的短評 pattern4 = r'<p class=""> (.*?)' \ r'</p>' for item in result: s = str(item) count2 = s.find('<p class="">') count3 = s.find('</p>') s2 = s[count2 + 12:count3] # 抽取字串中的評論 f.write(s2) # 擷取下一頁的連結 next_url = soup.find_all('div', {'id': 'paginator'}) pattern3 = r'href="(.*?)">後頁' if (len(next_url) == 0): break next_url = re.findall(pattern3, str(next_url[0])) # 得到後頁的連結 if (len(next_url) == 0): # 如果沒有後頁的連結跳出迴圈 break next_url = next_url[0] print('%d爬取下一頁評論...' % begin) begin = begin + 1 # 如果爬取了6次則多休息2秒 if (begin % 6 == 0): print('休息...') time.sleep(30) print(next_url) if comment_url+next_url not in seen: seen.add(comment_url+next_url) crawl_queue.append(comment_url+next_url) threads=[] max_threads=5 while threads or crawl_queue: for thread in threads: if not thread.is_alive(): threads.remove(thread) while len(threads)< max_threads and crawl_queue: thread=threading.Thread(target=process_queue) print('--------下一個線程----------') thread.setDaemon(True) # set daemon so main thread can exit when receive ctrl + C thread.start() threads.append(thread) time.sleep(2) f.close()if __name__=='__main__': if isLogin(): print('您已經登入') else: print('xs') login('dsdz@qq.com','5sdfsd6') file_name='key3.txt' get_comment(file_name) #單線程爬蟲 #thread_get_comment(file_name) #多線程爬蟲 save_jieba_result(file_name) draw_wordcloud('pjl_jieba.txt') 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190