A few words up front:
All of the data used in these examples is downloaded from GitHub and packaged for download.
The address is: http://github.com/pydata/pydata-book. A few things need to be explained:
I'm using Python 2.7; the code in the book has some bugs, which I have adjusted to run under my 2.7 setup.
# -*- coding: utf-8 -*-
"""GroupBy examples from "Python for Data Analysis" (pydata-book, ch. 08/09).

This is a cleaned-up transcript of an interactive session: the bare
expressions were how results were displayed in the REPL and are kept as-is
(they are harmless no-ops when run as a script).  The pandas API used here is
the book-era API (`.ix`, `sort_index(by=...)`, `Series.order`,
`pivot_table(rows=..., cols=...)`).
NOTE(review): those calls are deprecated/removed in modern pandas — confirm
the target pandas version before running.
"""
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# --- Basic GroupBy mechanics ------------------------------------------------
df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                'key2': ['one', 'two', 'one', 'two', 'one'],
                'data1': np.random.randn(5),
                'data2': np.random.randn(5)})
df
grouped = df['data1'].groupby(df['key1'])
grouped
grouped.mean()
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()
df.groupby('key1').mean()
df.groupby(['key1', 'key2']).mean()
df.groupby(['key1', 'key2']).size()

# Iterating over groups.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

# The original transcript was missing the list() parentheses here.
pieces = dict(list(df.groupby('key1')))
pieces['b']
df.dtypes
grouped = df.groupby(df.dtypes, axis=1)   # group *columns* by dtype
dict(list(grouped))

# Selecting a column or a subset of columns.
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
df.groupby(['key1', 'key2'])[['data2']].mean()
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped
s_grouped.mean()

# --- Grouping with dicts, Series and functions ------------------------------
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.ix[2:3, ['b', 'c']] = np.nan   # NOTE(review): .ix removed in modern pandas
people
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}   # unused keys ('f') are fine
by_column = people.groupby(mapping, axis=1)
by_column.sum()
map_series = Series(mapping)
map_series
people.groupby(map_series, axis=1).count()
people.groupby(len).sum()   # group rows by the length of the index label
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

# Grouping by MultiIndex level.
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]],
                                    names=['cty', 'tenor'])
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
hier_df.groupby(level='cty', axis=1).count()
hier_df.groupby(level='tenor', axis=1).count()
hier_df.groupby(level=['cty', 'tenor'], axis=1).count()

# --- Data aggregation -------------------------------------------------------
df
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)


def peak_to_peak(arr):
    """Spread of a group: max minus min."""
    return arr.max() - arr.min()


grouped.agg(peak_to_peak)
grouped.describe()

# Raw strings so the backslashes in the Windows paths are not escapes (the
# original '...\tips.csv' silently contained a TAB character).
tips = pd.read_csv(r'D:\Source Code\pydata-book-master\ch08\tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips.head()
grouped = tips.groupby(['sex', 'smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
grouped_pct.agg(['mean', 'std', peak_to_peak])
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])   # (output-name, func) pairs
functions = ['count', 'mean', 'max']
result = grouped['tip_pct', 'total_bill'].agg(functions)
result
result['tip_pct']
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
grouped['tip_pct', 'total_bill'].agg(ftuples)
grouped.agg({'tip': np.max, 'size': 'sum'})
grouped.agg({'tip_pct': ['min', 'max', 'mean', 'std'], 'size': 'sum'})
tips.groupby(['sex', 'smoker'], as_index=False).mean()

# --- Group-wise operations and transformations ------------------------------
df
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means
pd.merge(df, k1_means, left_on='key1', right_index=True)
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()
people.groupby(key).transform(np.mean)


def demean(arr):
    """Subtract the group mean so each group averages to zero."""
    return arr - arr.mean()


demeaned = people.groupby(key).transform(demean)
demeaned
demeaned.groupby(key).mean()


def top(df, n=5, column='tip_pct'):
    """Return the n rows with the largest values in `column`.

    NOTE(review): sort_index(by=...) is the book-era spelling of sort_values.
    """
    return df.sort_index(by=column)[-n:]


top(tips, n=6)
tips.groupby('smoker').apply(top)
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')
result = tips.groupby('smoker')['tip_pct'].describe()
result
result.unstack('smoker')
f = lambda x: x.describe()
tips.groupby('smoker')['tip_pct'].apply(f)
tips.groupby('smoker').apply(f)
tips.groupby('smoker', group_keys=False).apply(top)

# --- Quantile and bucket analysis -------------------------------------------
frame = DataFrame({'data1': np.random.randn(1000),
                   'data2': np.random.randn(1000)})
frame.head()
factor = pd.cut(frame.data1, 4)
factor[:10]


def get_stats(group):
    """Summary statistics for one bucket."""
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}


grouped = frame.data2.groupby(factor)
grouped.apply(get_stats)
grouped.apply(get_stats).unstack()
grouping = pd.qcut(frame.data1, 10)
grouping = pd.qcut(frame.data1, 10, labels=False)   # bucket numbers, not intervals
grouping
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

# Group weighted average.
df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                'data': np.random.randn(8),
                'weights': np.random.randn(8)})
df
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'], weights=g['weights'])
grouped.apply(get_wavg)

# Yearly correlation of daily returns with the S&P 500.
close_px = pd.read_csv(r'D:\Source code\pydata-book-master\ch09\stock_px.csv',
                       parse_dates=True, index_col=0)
close_px
close_px[-4:]
rets = close_px.pct_change().dropna()
spx_corr = lambda x: x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x: x.year)
by_year.apply(spx_corr)
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

import statsmodels.api as sm


def regress(data, yvar, xvars):
    """OLS regression of data[yvar] on data[xvars] plus an intercept column."""
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1.0
    result = sm.OLS(Y, X).fit()
    return result.params


by_year.apply(regress, 'AAPL', ['SPX'])

# --- Example: 2012 Federal Election Commission database ----------------------
fec = pd.read_csv(r'D:\Source code\pydata-book-master\ch09\P00000001-ALL.csv')
fec
fec.ix[123456]
unique_cands = fec.cand_nm.unique()
unique_cands
unique_cands[2]
parties = {'Bachmann, Michelle': 'Republican',
           'Cain, Herman': 'Republican',
           'Gingrich, Newt': 'Republican',
           'Huntsman, Jon': 'Republican',
           'Johnson, Gary Earl': 'Republican',
           'McCotter, Thaddeus G': 'Republican',
           'Obama, Barack': 'Democrat',
           'Paul, Ron': 'Republican',
           'Pawlenty, Timothy': 'Republican',
           'Perry, Rick': 'Republican',
           "Roemer, Charles E. 'Buddy' III": 'Republican',
           'Romney, Mitt': 'Republican',
           'Santorum, Rick': 'Republican'}
fec.cand_nm[123456:123461]
fec.cand_nm[123456:123461].map(parties)
fec['party'] = fec.cand_nm.map(parties)
fec['party'].value_counts()
(fec.contb_receipt_amt > 0).value_counts()
fec = fec[fec.contb_receipt_amt > 0]   # keep positive contributions only
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]
fec_mrbo
fec.contbr_occupation.value_counts()[:10]

# Normalize free-text occupation/employer fields.
occ_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
               'INFORMATION REQUESTED': 'NOT PROVIDED',
               'INFORMATION REQUESTED (BEST EFFORTS)': 'NOT PROVIDED',
               'C.E.O.': 'CEO'}
f = lambda x: occ_mapping.get(x, x)   # fall back to the value itself if unmapped
fec.contbr_occupation = fec.contbr_occupation.map(f)
emp_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS': 'NOT PROVIDED',
               'INFORMATION REQUESTED': 'NOT PROVIDED',
               'SELF': 'SELF-EMPLOYED',
               'SELF EMPLOYED': 'SELF-EMPLOYED'}
f = lambda x: emp_mapping.get(x, x)
fec.contbr_employer = fec.contbr_employer.map(f)
# NOTE(review): rows=/cols= is the book-era pivot_table API (index=/columns= today).
by_occupation = fec.pivot_table('contb_receipt_amt',
                                rows='contbr_occupation',
                                cols='party', aggfunc='sum')
by_occupation.head()
over_2mm = by_occupation[by_occupation.sum(1) > 2000000]
over_2mm
over_2mm.plot(kind='barh')


def get_top_amounts(group, key, n=5):
    """Top-n total contribution amounts within `group`, keyed by `key`.

    NOTE(review): Series.order is the book-era spelling of sort_values.
    """
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    return totals.order(ascending=False)[:n]


grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts, 'contbr_occupation', n=7)
fec_mrbo.groupby(['cand_nm', 'contbr_occupation'])['contb_receipt_amt'].sum()
grouped.apply(get_top_amounts, 'contbr_employer', n=10)