標籤:amp groupby vertica orm ram return 結束 localtime 意義
from pyecharts import Bar,Pieimport pandas as pdimport numpy as npimport matplotlib.pyplot as pltimport time
df=pd.read_excel("all_data_meituan.xlsx")
df.drop(‘comment‘,axis=1).head(2)
df[‘avgPrice‘].value_counts()# 同一家店的均價應該為同一個數值,所以這列資料沒多大的意義
73 17400Name: avgPrice, dtype: int64
df[‘anonymous‘].value_counts()# 匿名評價與實名評價的比例大致在5:1左右
False 14402True 2998Name: anonymous, dtype: int64
def convertTime(x): y=time.localtime(x/1000) z=time.strftime("%Y-%m-%d %H:%M:%S",y) return zdf["commentTime"]=df["commentTime"].apply(convertTime)df["commentTime"].head()
0 2018-05-09 22:21:481 2018-06-01 19:41:312 2018-04-04 11:52:233 2018-05-01 17:12:224 2018-05-17 16:48:04Name: commentTime, dtype: object
# 在excel可以用篩選器直接看到這列中的資料含有缺失值,或者在拿到資料的時候,使用df.info() 查看每列的資料資訊情況df[‘dealEndtime‘].isna().value_counts()# 這列資料中含有177個缺失值,其餘完整
False 17223True 177Name: dealEndtime, dtype: int64
df[‘commentTime‘]=pd.to_datetime(df[‘commentTime‘])df1 = df.set_index(‘commentTime‘)df1.resample(‘D‘).size().sort_values(ascending=False).head(100)df2=df1.resample(‘M‘).size().to_period()df2=df2.reset_index()# df2.columns# from pyecharts import Barbar =Bar("按月統計",width=1000,height=800)bar.add("月統計表",df2[‘commentTime‘],df2[0],is_label_show=True, is_datazoom_show=True,is_toolbox_show=True,is_more_utils=True)bar
df[‘commentTime‘]=pd.to_datetime(df[‘commentTime‘])df[‘hour‘] = df[‘commentTime‘].dt.hourdf2= df.groupby([‘hour‘]).size()df2from pyecharts import Barbar =Bar("分時統計",width=1000,height=600)bar.add("分時計表",[‘{} h‘.format(i) for i in df2.index],df2.values,is_label_show=True, is_datazoom_show=True,is_toolbox_show=True,is_more_utils=True,is_random=True)bar
df[‘commentTime‘]=pd.to_datetime(df[‘commentTime‘])df[‘weekday‘] = df[‘commentTime‘].dt.weekdaydf2= df.groupby([‘weekday‘]).size()# 周末吃外賣的還是教平時多了一些from pyecharts import Barbar =Bar("周總計",width=750,height=400)weekday=["一","二","三","四","五","六","日"]bar.add("周總計",[‘周{}‘.format(i) for i in weekday],df2.values,is_label_show=True, is_datazoom_show=False,is_toolbox_show=True,is_more_utils=True,is_random=True)bar
# 處理資料前需要先處理缺失值# 訂單結束時間清洗df[‘dealEndtime‘].fillna(method=‘ffill‘).apply(lambda x:time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(x))).head()
0 2018-06-30 14:00:001 2018-06-30 14:00:002 2018-06-30 14:00:003 2018-06-30 14:00:004 2018-06-30 14:00:00Name: dealEndtime, dtype: object
df[‘menu‘].dropna().astype(‘category‘).value_counts()
2人午晚餐 7640單人午晚餐 3920學生專享午晚自助 26384人午/晚自助 1581單人下午自助烤肉 6396人午/晚自助 507周一至周五自助烤肉/周六日及節假日自助烤肉2選1 209單人午/晚自助 67周一至周五自助烤肉,免費WiFi 22Name: menu, dtype: int64
df[‘readCnt‘].corr(df[‘star‘])# 評論閱讀書與客戶評價分數高低的相關性
0.05909293203205019
df_most=df[(df["menu"]=="2人午晚餐")][‘star‘].value_counts().reindex(range(10,60,10))
10 32920 53330 200240 270450 2072Name: star, dtype: int64
df[(df["menu"]=="單人午晚餐")][‘star‘].value_counts()
30 121540 120850 109320 29810 106Name: star, dtype: int64
# 學生專享午晚自助 df[(df["menu"]=="學生專享午晚自助")][‘star‘].value_counts()
40 95450 86330 52920 19110 101Name: star, dtype: int64
df[(df["menu"]=="4人午/晚自助")][‘star‘].value_counts()
50 53630 43240 41410 13120 68Name: star, dtype: int64
df[(df["menu"]=="單人下午自助烤肉")][‘star‘].value_counts()
30 20850 16940 14410 9820 20Name: star, dtype: int64
df[(df["menu"]=="6人午/晚自助")][‘star‘].value_counts()
50 24540 14230 11210 8Name: star, dtype: int64
#周一至周五自助烤肉/周六日及節假日自助烤肉2選1df[(df["menu"]=="周一至周五自助烤肉/周六日及節假日自助烤肉2選1")][‘star‘].value_counts()
50 8740 6630 4620 10Name: star, dtype: int64
df[(df["menu"]=="單人午/晚自助")][‘star‘].value_counts()
50 3040 2730 10Name: star, dtype: int64
df[(df["menu"]=="周一至周五自助烤肉,免費WiFi")][‘star‘].value_counts().reindex(range(10,51,10)).fillna(0)
10 0.020 0.030 0.040 0.050 22.0Name: star, dtype: float64
# df.groupby([‘menu‘,‘star‘]).size().to_excel("all_menu_star.xls")df.groupby([‘menu‘,‘star‘]).size()
menu star2人午晚餐 10 329 20 533 30 2002 40 2704 50 20724人午/晚自助 10 131 20 68 30 432 40 414 50 5366人午/晚自助 10 8 30 112 40 142 50 245單人下午自助烤肉 10 98 20 20 30 208 40 144 50 169單人午/晚自助 30 10 40 27 50 30單人午晚餐 10 106 20 298 30 1215 40 1208 50 1093周一至周五自助烤肉/周六日及節假日自助烤肉2選1 20 10 30 46 40 66 50 87周一至周五自助烤肉,免費WiFi 50 22學生專享午晚自助 10 101 20 191 30 529 40 954 50 863dtype: int64
df.groupby([‘star‘,‘menu‘,]).size()
star menu 10 2人午晚餐 329 4人午/晚自助 131 6人午/晚自助 8 單人下午自助烤肉 98 單人午晚餐 106 學生專享午晚自助 10120 2人午晚餐 533 4人午/晚自助 68 單人下午自助烤肉 20 單人午晚餐 298 周一至周五自助烤肉/周六日及節假日自助烤肉2選1 10 學生專享午晚自助 19130 2人午晚餐 2002 4人午/晚自助 432 6人午/晚自助 112 單人下午自助烤肉 208 單人午/晚自助 10 單人午晚餐 1215 周一至周五自助烤肉/周六日及節假日自助烤肉2選1 46 學生專享午晚自助 52940 2人午晚餐 2704 4人午/晚自助 414 6人午/晚自助 142 單人下午自助烤肉 144 單人午/晚自助 27 單人午晚餐 1208 周一至周五自助烤肉/周六日及節假日自助烤肉2選1 66 學生專享午晚自助 95450 2人午晚餐 2072 4人午/晚自助 536 6人午/晚自助 245 單人下午自助烤肉 169 單人午/晚自助 30 單人午晚餐 1093 周一至周五自助烤肉/周六日及節假日自助烤肉2選1 87 周一至周五自助烤肉,免費WiFi 22 學生專享午晚自助 863dtype: int64
df.groupby([‘star‘,‘menu‘,]).size()[50]
menu2人午晚餐 20724人午/晚自助 5366人午/晚自助 245單人下午自助烤肉 169單人午/晚自助 30單人午晚餐 1093周一至周五自助烤肉/周六日及節假日自助烤肉2選1 87周一至周五自助烤肉,免費WiFi 22學生專享午晚自助 863dtype: int64
# userId# 這家店鋪有好多回頭客,萬萬沒想到df[df[‘userId‘]!=0][‘userId‘].value_counts().head(40)
266045270 64152775497 6080372612 60129840082 60336387962 6034216474 60617772217 6082682689 54287219504 49884729389 45868838851 40409054441 4086939815 40776086712 4048597225 40111808598 40240199490 4083068123 40298504911 401042639014 40912472277 4098198819 401494880345 40152930400 40139581136 40404183587 40714781743 40292809386 4018111538 401097689674 40300905323 40232697160 40141718492 40879430090 40696143486 4013257519 40983797146 40911947863 40993057629 40494215297 40Name: userId, dtype: int64
df[df[‘userName‘]!="匿名使用者"][‘userName‘].value_counts().head(40)
xuruiss1026 64黑髮飄呀飄 60麼麼噠我是你聰叔 60jIx325233926 60siisgood 60vTF610712604 60始於初見的你 60yumengkou 54Daaaav 49梁子7543 45oev575457132 40oUI806055883 40joF498901567 40liE32679330 40張齊齊123 40VPA342570392 40kingd123 40Nqr695642404 40Mvo148723747 40ree177064067 40大遊 40_qq3sh1369887220 40bQl271583480 40凱蒂寶 40安然~軒 40FQe845913598 40清晨cxh98 40cBj31240225 40天蛟Wing 40oMz861346972 40熱帶魚7697 40Mqg827794346 40nXu534267448 40aYH197128794 40榴蓮餡月餅 40leeman666888 40迅行天下 40濱海之戀33 40pHO437742850 40SzX539077433 40Name: userName, dtype: int64
df.groupby([‘star‘,‘userLevel‘,]).size()
star userLevel10 0 187 1 139 2 164 3 193 4 80 5 1020 0 223 1 88 2 304 3 294 4 207 5 2130 0 1147 1 405 2 1057 3 1230 4 570 5 165 6 2040 0 870 1 432 2 1360 3 1751 4 1026 5 261 6 2550 0 698 1 386 2 1167 3 1670 4 802 5 318 6 130dtype: int64
df_level_star = df.groupby([‘userLevel‘,‘star‘]).size()attr = np.arange(10,60,10)from pyecharts import Barbar = Bar("使用者等級與評分",title_pos="center")df_0 = df_level_star[0].valuesdf_1 = df_level_star[1].valuesdf_2 = df_level_star[2].valuesdf_3 = df_level_star[3].valuesdf_4 = df_level_star[4].valuesdf_5 = df_level_star[5].values# df_6 = df_level_star[6].valuesdf_6 = df_level_star[6].reindex(attr).fillna(0).valuesbar.add("level 0",attr,df_0,is_label_show=True,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar.add("level 1",attr,df_1,is_label_show=True,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar.add("level 2",attr,df_2,is_label_show=True,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar.add("level 3",attr,df_3,mark_line=["average"],mark_point=[‘max‘,‘min‘],is_label_show=True,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar.add("level 4",attr,df_4,is_label_show=True,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar.add("level 5",attr,df_5,is_label_show=True,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar.add("level 6",attr,df_6,is_label_show=True,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar
<div id="5fcf9a4be1814dae9a66e63db26848a9" style="width:800px;height:400px;"></div>
bar = Bar("使用者等級與評分",title_pos="center",title_color="red")attr = np.arange(10,60,10)df_0 = df_level_star[0].valuesdf_1 = df_level_star[1].valuesdf_2 = df_level_star[2].valuesdf_3 = df_level_star[3].valuesdf_4 = df_level_star[4].valuesdf_5 = df_level_star[5].values# df_6 = df_level_star[6].valuesdf_6 = df_level_star[6].reindex(attr).fillna(0).valuesbar.add("level 0",attr,df_0,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar.add("level 1",attr,df_1,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar.add("level 2",attr,df_2,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar.add("level 3",attr,df_3,is_stack=True,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar.add("level 4",attr,df_4,is_stack=True,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar.add("level 5",attr,df_5,is_stack=True,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar.add("level 6",attr,df_6,is_stack=True,legend_pos=‘right‘,legend_orient=‘vertical‘,label_text_size=12)bar
df[‘star‘].corr(df[‘userLevel‘])
0.14389808871897794
df_zan=df[‘zanCnt‘].value_counts()from pyecharts import Barbar=Bar("點贊統計")bar.add("點贊分布",df_zan.index[1:],df_zan.values[1:],is_label_show=True)bar
<div id="3123fe244a684d7e97c8c3d9f47aa715" style="width:800px;height:400px;"></div>
df.describe()
|
avgPrice |
dealEndtime |
did |
readCnt |
replyCnt |
reviewId |
star |
userId |
userLevel |
zanCnt |
hour |
weekday |
count |
17400.0 |
1.722300e+04 |
1.740000e+04 |
17400.000000 |
17400.000000 |
1.740000e+04 |
17400.000000 |
1.740000e+04 |
17400.000000 |
17400.000000 |
17400.000000 |
17400.000000 |
mean |
73.0 |
1.529633e+09 |
4.376319e+07 |
1622.936149 |
0.032759 |
1.443980e+09 |
37.691954 |
3.224900e+08 |
2.335230 |
0.096264 |
14.955460 |
3.152356 |
std |
0.0 |
5.730086e+06 |
5.749815e+06 |
4981.816447 |
0.260349 |
2.208396e+08 |
10.813002 |
3.914649e+08 |
1.470979 |
0.511591 |
5.046872 |
2.044944 |
min |
73.0 |
1.483078e+09 |
1.330754e+06 |
20.000000 |
0.000000 |
1.093178e+09 |
10.000000 |
0.000000e+00 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
25% |
73.0 |
1.530338e+09 |
4.432824e+07 |
162.000000 |
0.000000 |
1.197515e+09 |
30.000000 |
4.527015e+07 |
1.000000 |
0.000000 |
11.000000 |
1.000000 |
50% |
73.0 |
1.530338e+09 |
4.432824e+07 |
304.000000 |
0.000000 |
1.606347e+09 |
40.000000 |
1.527755e+08 |
3.000000 |
0.000000 |
15.000000 |
3.000000 |
75% |
73.0 |
1.530338e+09 |
4.432853e+07 |
751.000000 |
0.000000 |
1.646467e+09 |
50.000000 |
4.859086e+08 |
3.000000 |
0.000000 |
19.000000 |
5.000000 |
max |
73.0 |
1.530338e+09 |
4.597465e+07 |
77837.000000 |
4.000000 |
1.698204e+09 |
50.000000 |
1.771740e+09 |
6.000000 |
8.000000 |
23.000000 |
6.000000 |
df[‘userLevel‘].value_counts().reindex(range(7))
0 31251 14502 40523 51384 26855 7756 175Name: userLevel, dtype: int64
df_level=df[‘userLevel‘].value_counts().reindex(range(7))from pyecharts import Piepie=Pie("使用者等級分布",title_pos="center",width=900)pie.add("levels distribution",["level "+str(i) for i in range(7)],df_level.values,is_random=True,radidus=[30,45],legend_pos=‘left‘,rosetype=‘area‘,legend_orient=‘vertical‘,is_label_show=True,label_text_size=20)pie
基於pandas python的美團某商家的評論銷售資料分析(可視化續)