Python crawler
Recently I have been seeing a lot of comments about movies, and I wanted to know what most people are saying about a given film. Since I am also learning Python, I decided to use its powerful web-crawling capabilities for this. Python 3.6.1 is used here.
The code is as follows:
# coding: UTF-8
"""Crawl Douban short comments for a now-playing movie and draw a word cloud.

Pipeline: fetch the "now playing" page, pick one movie, download several
pages of its short comments, keep only the Chinese characters, segment them
with jieba, drop stop words, count word frequencies and render the most
frequent words as a word cloud.
"""
__author__ = 'hangzhou'

import warnings
warnings.filterwarnings("ignore")
import jieba  # Chinese word segmentation package
import numpy  # numpy computing package
import codecs  # codecs.open can open a file with an explicit encoding
import re
import pandas as pd
import matplotlib.pyplot as plt
from urllib import request
from bs4 import BeautifulSoup as bs
# %matplotlib inline  (IPython magic; only meaningful inside IPython)
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud  # word cloud package

NOWPLAYING_URL = 'https://movie.douban.com/nowplaying/hangzhou/'
COMMENTS_PER_PAGE = 20
# Runs of CJK ideographs; everything else (punctuation, Latin, digits) is dropped.
CHINESE_RUN = re.compile(r'[\u4e00-\u9fa5]+')


class KetWord:
    """A word together with the number of times it occurs."""

    def __init__(self, name, count):
        self.name = name
        self.count = count

    def __cmp__(self, other):
        """Three-way comparison by ``count`` (-1, 0 or 1).

        Python 3 never calls ``__cmp__`` implicitly; it is kept so explicit
        callers keep working. Ordering for ``sorted()`` comes from ``__lt__``.
        """
        if not isinstance(other, KetWord):
            return NotImplemented
        if self.count > other.count:
            return 1
        if self.count < other.count:
            return -1
        return 0

    def __lt__(self, other):
        """Order keywords by ascending ``count``."""
        if not isinstance(other, KetWord):
            return NotImplemented
        return self.count < other.count

    def __str__(self):
        return '[name=' + self.name + ':count=' + str(self.count) + ']'


def _fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8."""
    # The context manager closes the HTTP response even if decoding fails.
    with request.urlopen(url) as resp:
        return resp.read().decode('utf-8')


# Parse the "now playing" page
def getNowPlayingMovie_list():
    """Return the movies currently playing as a list of dicts.

    Each entry looks like ``{'id': '26816086', 'name': '...'}``: ``id`` is
    the ``data-subject`` attribute of the list item and ``name`` is the
    ``alt`` text of its poster image ('' when there is no image).
    Returns an empty list when the page has no ``div#nowplaying``.
    """
    soup = bs(_fetch_html(NOWPLAYING_URL), 'html.parser')
    nowplaying_div = soup.find('div', id='nowplaying')
    if nowplaying_div is None:
        return []

    nowplaying_list = []
    for item in nowplaying_div.find_all('li', class_='list-item'):
        nowplaying_dict = {'id': item['data-subject'], 'name': ''}
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item.get('alt', '')
        # Append once per movie, not once per <img> tag.
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list


# Crawl one page of comments
def getCommentsById(movieId, pageNum):
    """Return the short comments on page *pageNum* (1-based) of a movie.

    Args:
        movieId: Douban subject id, as found in ``getNowPlayingMovie_list``.
        pageNum: 1-based page number; each page holds 20 comments.

    Returns:
        A list of comment strings; empty when *pageNum* is not positive.
    """
    if pageNum <= 0:
        return []
    start = (pageNum - 1) * COMMENTS_PER_PAGE
    requrl = ('https://movie.douban.com/subject/' + str(movieId) + '/comments'
              + '?' + 'start=' + str(start) + '&limit=20')
    print(requrl)

    soup = bs(_fetch_html(requrl), 'html.parser')
    eachCommentList = []
    for item in soup.find_all('div', class_='comment'):
        paragraph = item.find('p')
        # .string is None when the <p> holds nested tags instead of plain text.
        if paragraph is not None and paragraph.string is not None:
            eachCommentList.append(paragraph.string)
    return eachCommentList


def _word_frequencies(text, limit=1000):
    """Segment *text* with jieba and return ``{word: count}`` for the top words.

    Stop words listed one per line in ``stopwords.txt`` are excluded, and at
    most *limit* of the most frequent words are returned.
    """
    segment = jieba.lcut(text)
    words_df = pd.DataFrame({'segment': segment})

    # quoting=3 is csv.QUOTE_NONE: quote characters in the file are literal.
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # size() counts rows per word; the dict-renaming form of agg() is not
    # accepted on a SeriesGroupBy by current pandas.
    words_stat = (words_df.groupby('segment').size()
                  .reset_index(name='count')
                  .sort_values(by='count', ascending=False))
    top = words_stat.head(limit)
    return {str(word): int(freq)
            for word, freq in zip(top['segment'], top['count'])}


def main(movie_index=4, pages=10):
    """Build and display a word cloud of one movie's short comments.

    Args:
        movie_index: position of the movie in the now-playing list.
        pages: number of comment pages (20 comments each) to download.
    """
    NowPlayingMovie_list = getNowPlayingMovie_list()
    print('common=', NowPlayingMovie_list)
    try:
        movie_id = NowPlayingMovie_list[movie_index]['id']
    except IndexError:
        print('No now-playing movie at index', movie_index)
        return

    # Collect the comments of the first `pages` pages into one flat list.
    commentList = []
    for num in range(1, pages + 1):
        commentList.extend(getCommentsById(movie_id, num))

    # Remove punctuation and every other non-Chinese character.
    cleaned_comments = ''.join(CHINESE_RUN.findall(''.join(commentList)))

    word_frequencies = _word_frequencies(cleaned_comments)
    if not word_frequencies:
        # generate_from_frequencies() raises on an empty mapping.
        print('No words left after filtering; nothing to draw.')
        return

    # Word cloud display
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80)
    wordcloud = wordcloud.generate_from_frequencies(word_frequencies)
    print(word_frequencies)

    plt.imshow(wordcloud)
    # "%matplotlib inline" is an IPython magic and is not available in a plain
    # script (e.g. under PyCharm), so the figure is shown explicitly instead.
    plt.colorbar()
    plt.show()


# Script entry point
if __name__ == '__main__':
    main()