A note before we begin:
All the data used in these examples can be downloaded from GitHub; just grab the whole archive.
The address is: http://github.com/pydata/pydata-book
One more thing I must point out: I am using Python 2.7. Some of the code in the book contains errors, and I debugged everything against my own 2.7 setup.
# coding: utf-8
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# GroupBy basics: group a column by one or more keys, then aggregate.
df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                'key2': ['one', 'two', 'one', 'two', 'one'],
                'data1': np.random.randn(5),
                'data2': np.random.randn(5)})
df
grouped = df['data1'].groupby(df['key1'])
grouped
grouped.mean()
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means
means.unstack()

# The grouping keys can be any arrays of the right length, not just columns.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()
df.groupby('key1').mean()
df.groupby(['key1', 'key2']).mean()
df.groupby(['key1', 'key2']).size()

# Iterating over a GroupBy yields (key, sub-DataFrame) pairs.
for name, group in df.groupby('key1'):
    print name
    print group
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print k1, k2
    print group
pieces = dict(list(df.groupby('key1')))
pieces['b']

# Grouping the columns (axis=1) by dtype.
df.dtypes
grouped = df.groupby(df.dtypes, axis=1)
dict(list(grouped))

# Selecting a column or subset of columns from a GroupBy.
df.groupby('key1')['data1']
df.groupby('key1')[['data1']]
df.groupby(['key1', 'key2'])[['data2']].mean()
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped
s_grouped.mean()

# Grouping with dicts and Series.
people = DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.ix[2:3, ['b', 'c']] = np.nan
people
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}
by_column = people.groupby(mapping, axis=1)
by_column.sum()
map_series = Series(mapping)
map_series
people.groupby(map_series, axis=1).count()

# Grouping with functions: here, the length of each index label.
people.groupby(len).sum()
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

# Grouping by index levels of a hierarchically indexed axis.
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]],
                                    names=['cty', 'tenor'])
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
hier_df.groupby(level='cty', axis=1).count()
hier_df.groupby(level='tenor', axis=1).count()
hier_df.groupby(level=['cty', 'tenor'], axis=1).count()

# Data aggregation: built-in reducers and your own functions via agg.
df
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)
grouped.describe()

# The tips dataset (a raw string keeps '\t' from being read as a tab).
tips = pd.read_csv(r'D:\Source Code\pydata-book-master\ch08\tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head()

# Column-wise and multiple-function aggregation.
grouped = tips.groupby(['sex', 'smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
grouped_pct.agg(['mean', 'std', peak_to_peak])
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])  # (name, function) tuples
functions = ['count', 'mean', 'max']
result = grouped['tip_pct', 'total_bill'].agg(functions)
result
result['tip_pct']
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
grouped['tip_pct', 'total_bill'].agg(ftuples)
grouped.agg({'tip': np.max, 'size': sum})
grouped.agg({'tip': ['min', 'max', 'mean', 'std'], 'size': sum})
tips.groupby(['sex', 'smoker'], as_index=False).mean()

# Merging group means back onto the original rows.
df
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means
pd.merge(df, k1_means, left_on='key1', right_index=True)

# transform broadcasts a group-level result back to the original shape.
people = DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()
people.groupby(key).transform(np.mean)
def demean(arr):
    return arr - arr.mean()
demeaned = people.groupby(key).transform(demean)
demeaned
demeaned.groupby(key).mean()  # group means are now (essentially) zero

# apply: the most general split-apply-combine tool.
def top(df, n=5, column='tip_pct'):
    return df.sort_index(by=column)[-n:]
top(tips, n=6)
tips.groupby('smoker').apply(top)
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')
result = tips.groupby('smoker')['tip_pct'].describe()
result
result.unstack('smoker')
f = lambda x: x.describe()
tips.groupby('smoker')['tip_pct'].apply(f)
tips.groupby('smoker').apply(f)
tips.groupby('smoker', group_keys=False).apply(top)  # suppress group keys in the index
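To make the split-apply-combine idea above concrete, here is a minimal hand-rolled sketch of what df.groupby('key1')['data1'].mean() does conceptually. The buckets dict and the variable names are purely illustrative; pandas's real implementation is far more optimized, and a GroupBy is lazy, so nothing is actually computed until an aggregation like mean() is invoked.

from collections import defaultdict
import numpy as np
from pandas import DataFrame, Series

df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                'data1': np.random.randn(5)})

# Split: collect each row's value under its key.
buckets = defaultdict(list)
for key, value in zip(df['key1'], df['data1']):
    buckets[key].append(value)

# Apply + combine: reduce each bucket, then reassemble into a Series.
manual = Series(dict((k, np.mean(v)) for k, v in buckets.items()))
pandas_result = df.groupby('key1')['data1'].mean()
print(np.allclose(manual.sort_index(), pandas_result.sort_index()))  # True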
# Quantile and bucket analysis: cut/qcut produce factors usable as group keys.
frame = DataFrame({'data1': np.random.randn(1000),
                   'data2': np.random.randn(1000)})
frame.head()
factor = pd.cut(frame.data1, 4)  # four equal-width buckets
factor[:10]
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats)
grouped.apply(get_stats).unstack()
grouping = pd.qcut(frame.data1, 10)                # ten equal-size buckets
grouping = pd.qcut(frame.data1, 10, labels=False)  # bucket numbers instead of labels
grouping
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

# Group weighted average.
df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                'data': np.random.randn(8),
                'weights': np.random.randn(8)})
df
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'], weights=g['weights'])
grouped.apply(get_wavg)

# Group-wise correlation: yearly correlation of daily returns with SPX.
close_px = pd.read_csv(r'D:\Source Code\pydata-book-master\ch09\stock_px.csv',
                       parse_dates=True, index_col=0)
close_px
close_px[-4:]
rets = close_px.pct_change().dropna()
spx_corr = lambda x: x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x: x.year)
by_year.apply(spx_corr)
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

# Group-wise linear regression with statsmodels OLS.
import statsmodels.api as sm
def regress(data, yvar, xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1
    result = sm.OLS(Y, X).fit()
    return result.params
by_year.apply(regress, 'AAPL', ['SPX'])

# Example: 2012 Federal Election Commission contribution data.
fec = pd.read_csv(r'D:\Source Code\pydata-book-master\ch09\P00000001-ALL.csv')
fec
fec.ix[123456]
unique_cands = fec.cand_nm.unique()
unique_cands
unique_cands[2]
parties = {'Bachmann, Michelle': 'Republican',
           'Cain, Herman': 'Republican',
           'Gingrich, Newt': 'Republican',
           'Huntsman, Jon': 'Republican',
           'Johnson, Gary Earl': 'Republican',
           'McCotter, Thaddeus G': 'Republican',
           'Obama, Barack': 'Democrat',
           'Paul, Ron': 'Republican',
           'Pawlenty, Timothy': 'Republican',
           'Perry, Rick': 'Republican',
           "Roemer, Charles E. 'Buddy' III": 'Republican',
           'Romney, Mitt': 'Republican',
           'Santorum, Rick': 'Republican'}
fec.cand_nm[123456:123461]
fec.cand_nm[123456:123461].map(parties)
fec['party'] = fec.cand_nm.map(parties)
fec['party'].value_counts()

# Keep only positive contributions, and the two main candidates.
(fec.contb_receipt_amt > 0).value_counts()
fec = fec[fec.contb_receipt_amt > 0]
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]
fec_mrbo

# Normalize a few occupation and employer spellings.
fec.contbr_occupation.value_counts()[:10]
occ_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'INFORMATION REQUESTED (BEST EFFORTS)': 'NOT PROVIDED',
    'C.E.O': 'CEO'}
f = lambda x: occ_mapping.get(x, x)  # fall back to the original value
fec.contbr_occupation = fec.contbr_occupation.map(f)
emp_mapping = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
    'INFORMATION REQUESTED': 'NOT PROVIDED',
    'SELF': 'SELF-EMPLOYED',
    'SELF EMPLOYED': 'SELF-EMPLOYED'}
f = lambda x: emp_mapping.get(x, x)
fec.contbr_employer = fec.contbr_employer.map(f)

# Total contributions by occupation and party (rows=/cols= is the old
# pivot_table API of this pandas era; newer pandas uses index=/columns=).
by_occupation = fec.pivot_table('contb_receipt_amt',
                                rows='contbr_occupation',
                                cols='party', aggfunc=sum)
by_occupation.head()
over_2mm = by_occupation[by_occupation.sum(1) > 2000000]
over_2mm
over_2mm.plot(kind='barh')

# Top donor occupations and employers per candidate.
def get_top_amounts(group, key, n=5):
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    return totals.order(ascending=False)[:n]
grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts, 'contbr_occupation', n=7)
fec_mrbo.groupby(['cand_nm', 'contbr_occupation'])['contb_receipt_amt'].sum()
grouped.apply(get_top_amounts, 'contbr_employer', n=10)
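As a sanity check on get_wavg above, the same group weighted average can be computed by hand on a tiny fixed frame. The demo numbers and the wavg_by_hand helper are made up for illustration; only the formula sum(w_i * x_i) / sum(w_i) is the point.

import numpy as np
from pandas import DataFrame

demo = DataFrame({'category': ['a', 'a', 'b', 'b'],
                  'data': [1.0, 3.0, 2.0, 4.0],
                  'weights': [0.25, 0.75, 0.5, 0.5]})

def wavg_by_hand(frame):
    # Weighted average per group: sum(w_i * x_i) / sum(w_i).
    out = {}
    for cat in frame['category'].unique():
        piece = frame[frame['category'] == cat]
        out[cat] = (piece['data'] * piece['weights']).sum() / piece['weights'].sum()
    return out

print(wavg_by_hand(demo))  # {'a': 2.5, 'b': 3.0}
check = demo.groupby('category').apply(
    lambda g: np.average(g['data'], weights=g['weights']))
print(check)  # a 2.5, b 3.0 -- same numbers as the hand-rolled version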