I recently wanted to see a film and find out what the reviews said about it. Since I am just learning to write Python crawlers, I put together this small example.
The code is adapted from a third-party source: http://python.jobbole.com/88325/#comment-94754
# -*- coding: utf-8 -*-
# Crawl the short comments of the first "now playing" film listed on Douban
# (Hangzhou page), segment the Chinese text with jieba, drop stop words, and
# display the most frequent words as a word cloud.

try:
    # Unused by this script (looks like an accidental IDE auto-import); kept
    # for fidelity but guarded, since lib2to3 is absent from newer Pythons.
    from lib2to3.pgen2.grammar import line  # noqa: F401
except ImportError:
    line = None

__author__ = 'Hang'

import warnings

# Silence library warnings before the heavy third-party imports below.
warnings.filterwarnings("ignore")

import contextlib
import re

import jieba  # Chinese word-segmentation package
import numpy  # NumPy calculation package
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3 equivalent of urllib2

from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud  # word-cloud package

matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

NOW_PLAYING_URL = 'https://movie.douban.com/nowplaying/hangzhou/'
COMMENTS_URL = 'https://movie.douban.com/subject/{0}/comments?start={1}&limit={2}'
COMMENTS_PER_PAGE = 20
PAGES_TO_FETCH = 10
MAX_WORDS = 1000

# Machine-specific input files; adjust to the local environment.
STOPWORDS_PATH = r"D:\pycode\stopwords.txt"
FONT_PATH = r"D:\pycode\simhei.ttf"

# Whitespace plus ASCII and full-width (CJK) punctuation to strip from the
# comments before segmentation. Written with escapes so the source stays
# ASCII-only and the pattern is a real Unicode character class.
_PUNCTUATION_RE = re.compile(
    u"["
    u"\\s+.!/_,$%^*()\"'?~@#&<>"
    u"\u2014\u2026"                      # em dash, ellipsis
    u"\u3001\u3002"                      # ideographic comma, full stop
    u"\u300a\u300b\u3010\u3011"          # CJK angle and lenticular brackets
    u"\u201c\u201d"                      # curly double quotes
    u"\uff01\uff08\uff09\uff0c\uff1f"    # full-width ! ( ) , ?
    u"\u00a5\uffe5"                      # yen sign, full-width yen sign
    u"]+",
    re.UNICODE,
)


def _read_url(url):
    """Return the raw response body of *url*, always closing the connection."""
    with contextlib.closing(urllib2.urlopen(url)) as resp:
        return resp.read()


# Analyze the "now playing" web page.
def getnowplayingmovie_list():
    """Return the films currently showing as a list of dicts.

    Each dict has the keys ``'id'`` (the Douban subject id taken from the
    ``data-subject`` attribute) and ``'name'`` (the poster image's ``alt``
    text, or an empty string when the entry has no image).
    """
    html_data = _read_url(NOW_PLAYING_URL).decode('utf-8')
    soup = bs(html_data, 'html.parser')
    containers = soup.find_all('div', id='nowplaying')
    if not containers:
        return []

    nowplaying_list = []
    for item in containers[0].find_all('li', class_='list-item'):
        img = item.find('img')
        nowplaying_list.append({
            'id': item['data-subject'],
            'name': img.get('alt', '') if img is not None else '',
        })
    return nowplaying_list


# Crawl one page of comments.
def getcommentsbyid(movie_id, page_num):
    """Return the concatenated comment text of one comments page.

    ``movie_id`` is the Douban subject id as a string and ``page_num`` is the
    1-based page number. Returns ``False`` when ``page_num`` is not positive,
    otherwise the stripped text of every comment paragraph on that page.
    """
    if page_num <= 0:
        return False

    start = (page_num - 1) * COMMENTS_PER_PAGE
    requrl = COMMENTS_URL.format(movie_id, start, COMMENTS_PER_PAGE)
    print(requrl)

    soup = bs(_read_url(requrl), 'html.parser')
    pieces = []
    for item in soup.find_all('div', class_='comment'):
        paragraph = item.find('p')
        # .string is None when the <p> has nested markup; skip those entries.
        if paragraph is not None and paragraph.string is not None:
            pieces.append(paragraph.string)
    return ''.join(pieces).strip()


def main():
    """Fetch the first film's comments and show their word cloud."""
    movies = getnowplayingmovie_list()
    if not movies:
        print("No now-playing movies found.")
        return

    # Collect the first PAGES_TO_FETCH pages of comments for the first film.
    movie_id = movies[0]['id']
    comment_str = ''.join(
        getcommentsbyid(movie_id, page)
        for page in range(1, PAGES_TO_FETCH + 1)
    )

    cleaned_comments = _PUNCTUATION_RE.sub('', comment_str)
    print(cleaned_comments)

    # Chinese word segmentation with jieba.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Remove the stop words. quoting=3 (csv.QUOTE_NONE): nothing is quoted.
    stopwords = pd.read_csv(
        STOPWORDS_PATH,
        index_col=False,
        quoting=3,
        sep="\t",
        names=['stopword'],
        encoding='utf-8',
    )
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    print(words_df)

    # Word frequencies, most frequent first.
    words_stat = (
        words_df.groupby('segment')
        .size()
        .reset_index(name='Count')
        .sort_values(by='Count', ascending=False)
    )
    top_words = words_stat.head(MAX_WORDS)
    word_frequence = dict(zip(top_words['segment'], top_words['Count']))

    # Display with a word cloud.
    wordcloud = WordCloud(
        font_path=FONT_PATH,
        background_color="white",
        max_font_size=80,
    )
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


# Main entry point.
if __name__ == "__main__":
    main()
Using a Python crawler to display a word cloud of the reviews for the film "Dunkirk"