【乾貨】python爬取《戰狼2》電影短評論,產生詞雲

來源:互聯網
上載者:User
類比登陸豆瓣

第一次登陸需要驗證碼,之後的登陸可以隱去 login('username', 'password') 這一行,因為使用 session 儲存了必要的登陸資訊,代碼如下:

# -*- coding: utf-8 -*-
"""Crawl short reviews of a movie from Douban and build a word cloud.

Flow: log in to Douban (a captcha is required on the first login; later
runs reuse the cookie file ``cookies``), crawl the short-review pages,
write the review text to a file, then hand it to the jieba / word-cloud
helpers from ``mywordCloud``.
"""
import requests
try:
    import cookielib  # Python 2
except ImportError:
    import http.cookiejar as cookielib  # Python 3
import re
import time
import os.path
import json
from bs4 import BeautifulSoup
try:
    from PIL import Image
except ImportError:
    # Pillow is optional: without it the captcha image must be opened by hand.
    Image = None
from mywordCloud import save_jieba_result
from mywordCloud import draw_wordcloud
import threading
import codecs

# Request headers imitating a desktop Chrome browser.
agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
headers = {
    "Host": "www.douban.com",
    "Referer": "https://www.douban.com/",
    'User-Agent': agent,
}

# Session with persistent cookies so later runs can skip the login step.
session = requests.Session()
session.cookies = cookielib.LWPCookieJar(filename='cookies')
try:
    session.cookies.load(ignore_discard=True)
    print('成功載入cookie')
except (OSError, IOError, cookielib.LoadError):
    print("cookie 未能載入")


def get_captcha(url):
    """Download the captcha image at *url*, display it (when Pillow is
    available), and return the text the user types in.

    :param url: absolute URL of the captcha image on douban.com
    :return: the captcha solution entered on stdin
    """
    print('擷取驗證碼', url)
    r = session.get(url, headers=headers)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    # Show the image with Pillow; if that fails (Pillow missing or no
    # display), fall back to asking the user to open the file manually.
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except Exception:
        print(u'請到 %s 目錄找到captcha.jpg 手動輸入' % os.path.abspath('captcha.jpg'))
    captcha = input("please input the captcha\n>")
    return captcha


def isLogin():
    """Return True when the saved session reaches the personal home page
    directly (HTTP 200) instead of being redirected to the login page."""
    url = 'https://www.douban.com/people/151607908/'
    login_code = session.get(url, headers=headers, allow_redirects=False).status_code
    return login_code == 200


def login(acount, secret):
    """Log in to Douban with *acount*/*secret*, solving the captcha when
    the login form presents one, and persist the session cookies.

    :param acount: Douban account e-mail
    :param secret: account password
    """
    douban = "https://www.douban.com/"
    htmlcha = session.get(douban, headers=headers).text
    # Captcha image URL and its hidden form id, when the form shows one.
    httpcha = re.findall(r'id="captcha_image" src="(.*?)" alt="captcha"', htmlcha)
    hidden_value = re.findall(r'type="hidden" name="captcha-id" value="(.*?)"', htmlcha)
    print(hidden_value)
    post_data = {
        "source": "index_nav",
        'form_email': acount,
        'form_password': secret,
    }
    if len(httpcha) > 0:
        print('驗證碼串連', httpcha)
        post_data['captcha-solution'] = get_captcha(httpcha[0])
        post_data['captcha-id'] = hidden_value[0]
    print(post_data)
    post_url = 'https://www.douban.com/accounts/login'
    session.post(post_url, data=post_data, headers=headers)
    # Save cookies so the next run can skip the login entirely.
    session.cookies.save()
    if isLogin():
        print('登入成功')
    else:
        print('登入失敗')


def get_movie_sort():
    """Fetch the Douban movie chart page and print its movie links."""
    time.sleep(1)
    movie_url = 'https://movie.douban.com/chart'
    html = session.get(movie_url, headers=headers)
    soup = BeautifulSoup(html.text, 'html.parser')
    result = soup.find_all('a', {'class': 'nbg'})
    print(result)


def get_comment(filename):
    """Crawl the short reviews (single-threaded) and write the plain
    review text to *filename*.

    Follows the "後頁" (next page) paginator link until none remains,
    sleeping between requests to stay polite.

    :param filename: path of the output text file (overwritten)
    """
    begin = 1
    comment_url = 'https://movie.douban.com/subject/11600078/comments'
    next_url = '?start=20&limit=20&sort=new_score&status=P'
    headers2 = {
        "Host": "movie.douban.com",
        "Referer": "https://www.douban.com/",
        'User-Agent': agent,
        'Connection': 'keep-alive',
    }
    with open(filename, 'w+', encoding='utf-8') as f:
        while True:
            time.sleep(6)  # throttle: one request every 6 seconds
            html = session.get(url=comment_url + next_url, headers=headers2)
            soup = BeautifulSoup(html.text, 'html.parser')
            # Every short review sits in a <div class="comment">.
            result = soup.find_all('div', {'class': 'comment'})
            for item in result:
                s = str(item)
                count2 = s.find('<p class="">')
                count3 = s.find('</p>')
                s2 = s[count2 + 12:count3]  # review text between the <p> tags
                # Skip fragments that still contain markup.
                if 'class' not in s2:
                    f.write(s2)
            # Follow the next-page link from the paginator block.
            paginator = soup.find_all('div', {'id': 'paginator'})
            if len(paginator) == 0:
                break
            links = re.findall(r'href="(.*?)">後頁', str(paginator[0]))
            if len(links) == 0:  # no next-page link: finished
                break
            next_url = links[0]
            print('%d爬取下一頁評論...' % begin)
            begin = begin + 1
            # Take a longer break every sixth page to avoid being banned.
            if begin % 6 == 0:
                time.sleep(40)
                print('休息...')
            print(next_url)


def thread_get_comment(filename):
    """Crawl the short reviews with up to 5 worker threads sharing a URL
    queue, writing review text to *filename*.

    NOTE(review): the queue, seen-set and file handle are shared across
    threads without a lock — this mirrors the original best-effort
    design; confirm before relying on it for correctness.

    :param filename: path of the output text file (overwritten)
    """
    next_url = '?start=19&limit=20&sort=new_score&status=P'
    headers2 = {
        "Host": "movie.douban.com",
        "Referer": "https://www.douban.com/",
        'User-Agent': agent,
        'Connection': 'keep-alive',
    }
    comment_url = 'https://movie.douban.com/subject/26363254/comments'
    crawl_queue = [comment_url + next_url]
    crawl_queue.append('https://movie.douban.com/subject/26363254/comments?start=144&limit=20&sort=new_score&status=P')
    seen = set(crawl_queue)
    f = open(filename, 'w+', encoding='utf-8')

    def process_queue():
        # Worker: pop URLs until the queue is empty, scraping each page
        # and enqueueing the next-page link when it is new.
        begin = 1
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            time.sleep(5)  # throttle per worker
            html = session.get(url=url, headers=headers2)
            soup = BeautifulSoup(html.text, 'html.parser')
            result = soup.find_all('div', {'class': 'comment'})
            for item in result:
                s = str(item)
                count2 = s.find('<p class="">')
                count3 = s.find('</p>')
                f.write(s[count2 + 12:count3])  # review text between the <p> tags
            paginator = soup.find_all('div', {'id': 'paginator'})
            if len(paginator) == 0:
                break
            links = re.findall(r'href="(.*?)">後頁', str(paginator[0]))
            if len(links) == 0:
                break
            next_page = links[0]
            print('%d爬取下一頁評論...' % begin)
            begin = begin + 1
            # Extra rest every sixth page per worker.
            if begin % 6 == 0:
                print('休息...')
                time.sleep(30)
            print(next_page)
            if comment_url + next_page not in seen:
                seen.add(comment_url + next_page)
                crawl_queue.append(comment_url + next_page)

    threads = []
    max_threads = 5
    try:
        while threads or crawl_queue:
            # Reap finished workers (rebuild the list instead of removing
            # while iterating).
            threads = [t for t in threads if t.is_alive()]
            while len(threads) < max_threads and crawl_queue:
                thread = threading.Thread(target=process_queue)
                print('--------下一個線程----------')
                thread.daemon = True  # so Ctrl+C can exit the main thread
                thread.start()
                threads.append(thread)
            time.sleep(2)
    finally:
        f.close()


if __name__ == '__main__':
    if isLogin():
        print('您已經登入')
    else:
        print('xs')
        # SECURITY(review): credentials are hard-coded; move them to a
        # config file or environment variables before real use.
        login('dsdz@qq.com', '5sdfsd6')
    file_name = 'key3.txt'
    get_comment(file_name)  # single-threaded crawler
    # thread_get_comment(file_name)  # multi-threaded crawler
    save_jieba_result(file_name)
    draw_wordcloud('pjl_jieba.txt')
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.