# Imports (originally fused onto a single invalid line by the scrape).
# pyecharts 0.x API is used throughout: Bar(title, ...) / bar.add(...).
from pyecharts import Bar, Pie
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
# Load the scraped Meituan review data set.
df = pd.read_excel("all_data_meituan.xlsx")

# Peek at the first rows without the long free-text 'comment' column.
# NOTE: drop() returns a copy; df itself keeps the column (display only).
df.drop('comment', axis=1).head(2)

# Average price per order: a single shop has one fixed average price,
# so this column carries no analytical information.
df['avgPrice'].value_counts()
73 17400Name: avgPrice, dtype: int64
# Named vs. anonymous reviews — per the output (False 14402 / True 2998)
# roughly a 5:1 ratio in favour of named reviews.
df['anonymous'].value_counts()
False 14402True 2998Name: anonymous, dtype: int64
def convertTime(x):
    """Convert an epoch-millisecond timestamp to 'YYYY-MM-DD HH:MM:SS'.

    The raw commentTime values are milliseconds; time.localtime expects
    seconds, hence the division by 1000. Formats in the local timezone.
    """
    y = time.localtime(x / 1000)
    z = time.strftime("%Y-%m-%d %H:%M:%S", y)
    return z


# Replace the raw millisecond timestamps with readable datetime strings.
df["commentTime"] = df["commentTime"].apply(convertTime)
df["commentTime"].head()
0 2018-05-09 22:21:481 2018-06-01 19:41:312 2018-04-04 11:52:233 2018-05-01 17:12:224 2018-05-17 16:48:04Name: commentTime, dtype: object
# An Excel filter (or df.info() right after loading) shows that the
# 'dealEndtime' column contains missing values: 177 NaN, the rest complete.
df['dealEndtime'].isna().value_counts()
False 17223True 177Name: dealEndtime, dtype: int64
# Monthly comment volume, rendered as a pyecharts 0.x bar chart.
df['commentTime'] = pd.to_datetime(df['commentTime'])
df1 = df.set_index('commentTime')
# Exploratory peek: busiest individual days first.
df1.resample('D').size().sort_values(ascending=False).head(100)
# Monthly counts, converted to a PeriodIndex for clean axis labels.
df2 = df1.resample('M').size().to_period()
df2 = df2.reset_index()
bar = Bar("按月统计", width=1000, height=800)
# Column 0 holds the resampled counts after reset_index().
bar.add("月统计表", df2['commentTime'], df2[0],
        is_label_show=True, is_datazoom_show=True,
        is_toolbox_show=True, is_more_utils=True)
bar
# Comment volume by hour of day.
df['commentTime'] = pd.to_datetime(df['commentTime'])
df['hour'] = df['commentTime'].dt.hour
df2 = df.groupby(['hour']).size()
df2
from pyecharts import Bar
bar = Bar("分时统计", width=1000, height=600)
bar.add("分时计表", ['{} h'.format(i) for i in df2.index], df2.values,
        is_label_show=True, is_datazoom_show=True,
        is_toolbox_show=True, is_more_utils=True, is_random=True)
bar
# Comment volume by weekday; weekends are somewhat busier than weekdays.
df['commentTime'] = pd.to_datetime(df['commentTime'])
# dt.weekday: Monday=0 .. Sunday=6, matching the label list below.
df['weekday'] = df['commentTime'].dt.weekday
df2 = df.groupby(['weekday']).size()
from pyecharts import Bar
bar = Bar("周总计", width=750, height=400)
weekday = ["一", "二", "三", "四", "五", "六", "日"]
bar.add("周总计", ['周{}'.format(i) for i in weekday], df2.values,
        is_label_show=True, is_datazoom_show=False,
        is_toolbox_show=True, is_more_utils=True, is_random=True)
bar
# Missing values must be handled before converting this column.
# Clean the deal end time: forward-fill the 177 NaNs, then format the
# epoch-second timestamps as readable local-time strings.
df['dealEndtime'].fillna(method='ffill').apply(
    lambda x: time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(x))).head()
0 2018-06-30 14:00:001 2018-06-30 14:00:002 2018-06-30 14:00:003 2018-06-30 14:00:004 2018-06-30 14:00:00Name: dealEndtime, dtype: object
# Distribution of the purchased set-menu types (NaNs excluded).
df['menu'].dropna().astype('category').value_counts()
2人午晚餐 7640单人午晚餐 3920学生专享午晚自助 26384人午/晚自助 1581单人下午自助烤肉 6396人午/晚自助 507周一至周五自助烤肉/周六日及节假日自助烤肉2选1 209单人午/晚自助 67周一至周五自助烤肉,免费WiFi 22Name: menu, dtype: int64
# Correlation between a review's read count and its star rating
# (~0.06 per the output below — essentially uncorrelated).
df['readCnt'].corr(df['star'])
0.05909293203205019
# Star distribution for the best-selling menu ("2人午晚餐"),
# reindexed onto the full 10/20/30/40/50 rating scale.
df_most = df[(df["menu"] == "2人午晚餐")]['star'].value_counts().reindex(range(10, 60, 10))
10 32920 53330 200240 270450 2072Name: star, dtype: int64
# Star distribution for the "单人午晚餐" (single lunch/dinner) menu.
df[(df["menu"] == "单人午晚餐")]['star'].value_counts()
30 121540 120850 109320 29810 106Name: star, dtype: int64
# Star distribution for the student-only lunch/dinner buffet menu.
df[(df["menu"] == "学生专享午晚自助")]['star'].value_counts()
40 95450 86330 52920 19110 101Name: star, dtype: int64
# Star distribution for the 4-person lunch/dinner buffet menu.
df[(df["menu"] == "4人午/晚自助")]['star'].value_counts()
50 53630 43240 41410 13120 68Name: star, dtype: int64
# Star distribution for the single-person afternoon BBQ buffet menu.
df[(df["menu"] == "单人下午自助烤肉")]['star'].value_counts()
30 20850 16940 14410 9820 20Name: star, dtype: int64
# Star distribution for the 6-person lunch/dinner buffet menu.
df[(df["menu"] == "6人午/晚自助")]['star'].value_counts()
50 24540 14230 11210 8Name: star, dtype: int64
# Star distribution for the weekday-BBQ / weekend-BBQ pick-one menu.
df[(df["menu"] == "周一至周五自助烤肉/周六日及节假日自助烤肉2选1")]['star'].value_counts()
50 8740 6630 4620 10Name: star, dtype: int64
# Star distribution for the single-person lunch/dinner buffet menu.
df[(df["menu"] == "单人午/晚自助")]['star'].value_counts()
50 3040 2730 10Name: star, dtype: int64
# Star distribution for the weekday BBQ + free WiFi menu; reindex onto
# the full 10..50 scale and fill missing buckets with 0 (only 50 occurs).
df[(df["menu"] == "周一至周五自助烤肉,免费WiFi")]['star'].value_counts().reindex(range(10, 51, 10)).fillna(0)
10 0.020 0.030 0.040 0.050 22.0Name: star, dtype: float64
# Cross-tabulate menu x star. The commented line is how the counts were
# exported to Excel for the accompanying write-up.
# df.groupby(['menu', 'star']).size().to_excel("all_menu_star.xls")
df.groupby(['menu', 'star']).size()
Menu Star2 Lunch and dinner 10 329 20 533 30 2002 40 2704 50 20724 lunch/night Buffet 10 131 20 68 30 432 40 414 50 5366 Lunch/night Buffet 10 8 30 112 40 142 50 245 single afternoon Buffet BBQ 10 98 20 20 30 208 40 144 50 169 Single lunch/night Buffet 30 10 40 27 50 30 single lunch and dinner 10 106 20 298 30 1215 40 1208 50 1093 Monday to Friday BBQ Buffet/week 6th and holidays Buffet BBQ 2 Select 1 20 10 30 46 40 66 50 87 weeks to Friday Buffet Yakiniku/BBQ, free WiFi 50 22 Student Exclusive Lunch Buffet 10 101 20 191 30 529 954 863dtype:int64
# Same cross-tab with the grouping order reversed: star first, then menu
# (reconstructed — the scrape mangled this cell; see the indexed form below).
df.groupby(['star', 'menu']).size()
Star Menu 10 2 person lunch and Dinner 329 4 lunch/night Buffet 131 6 person lunch/night Buffet 8 Single Afternoon buffet Yakiniku/BBQ 98 single lunch and dinner 106 student exclusive lunch and evening buffet 10120 2 person Lunch dinner 533 4 lunch/night Buffet 68 single Afternoon Buffet BBQ 20 single lunch and dinner 298 weeks to Friday BBQ Buffet/week 6th and Holidays Buffet BBQ 2 1 10 exclusive lunch buffet 19130 Lunch/Dinner for 2 persons 2002 4 person lunch/night Buffet 432 6 person lunch/night Buffet 112 single afternoon BBQ Buffet 208 Single lunch/night Buffet 10 single lunch dinner 1215 weeks to Friday Buffet BBQ/Saturday Day and Holidays Buffet BBQ 2 Choice 1 46 students enjoy lunch buffet 52940 2 Lunch dinner 2704 4 person lunch/night Buffet 414 6 person lunch/night Buffet 142 single afternoon buffet BBQ 144 single lunch/night Buffet 27 Single Lunch and dinner 1208 weeks to Friday BBQ Buffet/week 6th and holidays Buffet BBQ 2 pick 1 66 students enjoy lunch buffet 95450 2 people lunch Dinner 2072 4 person lunch/night Buffet 536 6 person lunch/night Buffet 245 single afternoon Buffet BBQ 1 69 single lunch/night Buffet 30 single lunch dinner 1093 weeks to Friday Buffet BBQ/week 6th and holidays Buffet BBQ 2 Select 1 87 Monday to Friday buffet Yakiniku/BBQ, free WiFi 22 exclusive lunch and dinner buffet 863dtype:int64
# Menu breakdown of the 5-star (50) reviews only.
df.groupby(['star', 'menu']).size()[50]
menu2人午晚餐 20724人午/晚自助 5366人午/晚自助 245单人下午自助烤肉 169单人午/晚自助 30单人午晚餐 1093周一至周五自助烤肉/周六日及节假日自助烤肉2选1 87周一至周五自助烤肉,免费WiFi 22学生专享午晚自助 863dtype: int64
# userId: the shop has a surprising number of repeat customers.
# userId 0 looks like a sentinel for "no id" — exclude it, then count
# reviews per user.
df[df['userId'] != 0]['userId'].value_counts().head(40)
266045270 64152775497 6080372612 60129840082 60336387962 6034216474 60617772217 6082682689 54287219504 49884729389 45868838851 40409054441 4086939815 40776086712 4048597225 40111808598 40240199490 4083068123 40298504911 401042639014 40912472277 4098198819 401494880345 40152930400 40139581136 40404183587 40714781743 40292809386 4018111538 401097689674 40300905323 40232697160 40141718492 40879430090 40696143486 4013257519 40983797146 40911947863 40993057629 40494215297 40Name: userId, dtype: int64
# Most frequent non-anonymous reviewer names ("匿名用户" = anonymous user).
df[df['userName'] != "匿名用户"]['userName'].value_counts().head(40)
xuruiss1026 64 black hair Float 60 I'm your uncle 60jix325233926 60siisgood 60v TF610712604 60 starts at the first sight of you 60yumengkou 54DAAAAV 49 Liangzi 7543 45oev575457132 40oui806055883 40jof498901567 40lie32679330 40 Zhang Qi qi 123 40vpa342570392 40kin gd123 40nqr695642404 40mvo148723747 40ree177064067 40 Grand Tour 40_qq3sh136988722 0 40bql271583480 40 Katy Bao 40 Enron ~ Xuan 40fqe845913598 40 Morning cxh98 40cbj31 240225 40 days Jiao Wing 40omz861346972 40 tropical Fish 7697 40mqg827794346 40nxu534267448 40ayh197128794 40 Durian filling mooncake 40leeman666888 40 Xun World 40 Marina Love 40pho4377 42850 40szx539077433 40name:username, Dtype:int64
# Cross-tabulate star rating x user level.
df.groupby(['star', 'userLevel']).size()
star userLevel10 0 187 1 139 2 164 3 193 4 80 5 1020 0 223 1 88 2 304 3 294 4 207 5 2130 0 1147 1 405 2 1057 3 1230 4 570 5 165 6 2040 0 870 1 432 2 1360 3 1751 4 1026 5 261 6 2550 0 698 1 386 2 1167 3 1670 4 802 5 318 6 130dtype: int64
# Grouped bar chart: star distribution per user level.
# (Reconstructed — the scrape capitalized identifiers and broke brackets.)
df_level_star = df.groupby(['userLevel', 'star']).size()
attr = np.arange(10, 60, 10)
from pyecharts import Bar
bar = Bar("User ratings and ratings", title_pos="center")
df_0 = df_level_star[0].values
df_1 = df_level_star[1].values
df_2 = df_level_star[2].values
df_3 = df_level_star[3].values
df_4 = df_level_star[4].values
df_5 = df_level_star[5].values
# Level 6 is missing some star buckets (see the groupby output above);
# reindex onto attr and zero-fill so the series matches the axis length.
df_6 = df_level_star[6].reindex(attr).fillna(0).values
bar.add("Level 0", attr, df_0, is_label_show=True,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar.add("Level 1", attr, df_1, is_label_show=True,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar.add("Level 2", attr, df_2, is_label_show=True,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar.add("Level 3", attr, df_3, mark_line=["average"],
        mark_point=['max', 'min'], is_label_show=True,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar.add("Level 4", attr, df_4, is_label_show=True,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar.add("Level 5", attr, df_5, is_label_show=True,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar.add("Level 6", attr, df_6, is_label_show=True,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar
<div id= "5fcf9a4be1814dae9a66e63db26848a9" style= "width:800px;height:400px;" ></div>
# Second view of the same data: levels 3-6 stacked, levels 0-2 grouped.
# (Reconstructed from a translation-garbled cell.)
bar = Bar("User level and rating", title_pos="center", title_color="red")
attr = np.arange(10, 60, 10)
df_0 = df_level_star[0].values
df_1 = df_level_star[1].values
df_2 = df_level_star[2].values
df_3 = df_level_star[3].values
df_4 = df_level_star[4].values
df_5 = df_level_star[5].values
# Zero-fill level 6's missing star buckets, as in the previous chart.
df_6 = df_level_star[6].reindex(attr).fillna(0).values
bar.add("Level 0", attr, df_0,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar.add("Level 1", attr, df_1,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar.add("Level 2", attr, df_2,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar.add("Level 3", attr, df_3, is_stack=True,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar.add("Level 4", attr, df_4, is_stack=True,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar.add("Level 5", attr, df_5, is_stack=True,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar.add("Level 6", attr, df_6, is_stack=True,
        legend_pos='right', legend_orient='vertical', label_text_size=12)
bar
# Correlation between star rating and user level
# (~0.14 per the output below — weak positive).
df['star'].corr(df['userLevel'])
0.14389808871897794
# Distribution of per-review "like" (zan) counts.
df_zan = df['zanCnt'].value_counts()
from pyecharts import Bar
bar = Bar("点赞统计")
# [1:] skips the first (most frequent) bucket — presumably the dominant
# zero-like count, which would dwarf the rest of the chart.
bar.add("点赞分布", df_zan.index[1:], df_zan.values[1:], is_label_show=True)
bar
<div id="3123fe244a684d7e97c8c3d9f47aa715" style="width:800px;height:400px;"></div>
# Summary statistics (count/mean/std/min/quartiles/max) for every numeric column.
df.describe()
|
Avgprice |
Dealendtime |
| did
readcnt |
replycnt |
Reviewid |
Star |
userId |
UserLevel |
zancnt |
Hour |
Weekday |
Count |
17400.0 |
1.722300e+04 |
1.740000e+04 |
17400.000000 |
17400.000000 |
1.740000e+04 |
17400.000000 |
1.740000e+04 |
17400.000000 |
17400.000000 |
17400.000000 |
17400.000000 |
Mean |
73.0 |
1.529633e+09 |
4.376319e+07 |
1622.936149 |
0.032759 |
1.443980e+09 |
37.691954 |
3.224900e+08 |
2.335230 |
0.096264 |
14.955460 |
3.152356 |
Std |
0.0 |
5.730086e+06 |
5.749815e+06 |
4981.816447 |
0.260349 |
2.208396e+08 |
10.813002 |
3.914649e+08 |
1.470979 |
0.511591 |
5.046872 |
2.044944 |
Min |
73.0 |
1.483078e+09 |
1.330754e+06 |
20.000000 |
0.000000 |
1.093178e+09 |
10.000000 |
0.000000e+00 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
25% |
73.0 |
1.530338e+09 |
4.432824e+07 |
162.000000 |
0.000000 |
1.197515e+09 |
30.000000 |
4.527015e+07 |
1.000000 |
0.000000 |
11.000000 |
1.000000 |
50% |
73.0 |
1.530338e+09 |
4.432824e+07 |
304.000000 |
0.000000 |
1.606347e+09 |
40.000000 |
1.527755e+08 |
3.000000 |
0.000000 |
15.000000 |
3.000000 |
75% |
73.0 |
1.530338e+09 |
4.432853e+07 |
751.000000 |
0.000000 |
1.646467e+09 |
50.000000 |
4.859086e+08 |
3.000000 |
0.000000 |
19.000000 |
5.000000 |
Max |
73.0 |
1.530338e+09 |
4.597465e+07 |
77837.000000 |
4.000000 |
1.698204e+09 |
50.000000 |
1.771740e+09 |
6.000000 |
8.000000 |
23.000000 |
6.000000 |
# User-level distribution, reindexed onto the full 0..6 level range.
df['userLevel'].value_counts().reindex(range(7))
0 31251 14502 40523 51384 26855 7756 175Name: userLevel, dtype: int64
# Rose-type pie chart of the user-level distribution.
df_level = df['userLevel'].value_counts().reindex(range(7))
from pyecharts import Pie
pie = Pie("用户等级分布", title_pos="center", width=900)
# BUG FIX: the original passed 'radidus'; the pyecharts 0.x keyword is
# 'radius', so the misspelled argument was silently ignored (or raised,
# depending on version) and the inner/outer radii never applied.
pie.add("levels distribution", ["level " + str(i) for i in range(7)],
        df_level.values, is_random=True, radius=[30, 45],
        legend_pos='left', rosetype='area', legend_orient='vertical',
        is_label_show=True, label_text_size=20)
pie
Analysis of restaurant-review sales data with Python pandas (visualization, continued)