# -*- coding: utf-8 -*-
# The ninth chapter of Python for Data Analysis
# Data aggregation and group operations
import pandas as pd
import numpy as np
import time

# Group operation process: split-apply-combine
start = time.time()
np.random.seed(10)
# 1. GroupBy mechanics
# 1.1 Introduction
df = pd.DataFrame({
    'key1': ['a', 'b', 'a', 'b', 'a'],
    'key2': ['one', 'one', 'two', 'two', 'one'],
    'data1': np.random.randint(1, 10, 5),
    'data2': np.random.randn(5)
})
print(df)
print('\n')
grouped = df['data1'].groupby(df['key1'])  # split: group the data1 column by the key1 column
res = grouped.mean()  # apply and combine: average each group, then assemble the results into a Series
print(res)
print('\n')
# Above, df['key1'] (the object passed to groupby) is called the grouping key; there can be multiple grouping keys
print(df['data1'].groupby([df['key1'], df['key2']]).mean())  # the result is a Series with a hierarchical index
print('\n')
# print(df.groupby(['key1', 'key2'])['data1'].mean())  # equivalent to the previous line: column names can be used as grouping keys instead of Series
print(df.groupby('key1').mean())  # the aggregation (split + apply + combine) operates on the numeric columns (data1, data2) only; the non-numeric column (key2) is filtered out
print('\n')
print(df.groupby('key1').size())  # number of elements in each group
print('\n')
# 1.2 Iterating over groups
# GroupBy objects support iteration, like lists and dictionaries
for name, group in df.groupby('key1'):
    print(name)
    print(group)  # the non-numeric columns are still retained after the split
# With multiple keys, the group name is a tuple
print('\n')
for (key1, key2), group in df.groupby(['key1', 'key2']):
    print(key1)
    print(key2)
    print(group)
# groupby works along axis=0 by default; axis=1 can be specified explicitly
# This can be understood as grouping the samples (rows) versus grouping the features (columns)
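# Because a GroupBy object is iterable, a handy recipe is to materialize the
# groups as a dict of sub-DataFrames (a minimal sketch; 'pieces' is just an illustrative name):
pieces = dict(list(df.groupby('key1')))
print(pieces['b'])  # the rows whose key1 == 'b'
# Grouping on axis=1, e.g. by dtype, splits the columns instead of the rows:
for dtype, group in df.groupby(df.dtypes, axis=1):
    print(dtype)
    print(group)
print('\n')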
# 1.3 Selecting a column or subset of columns
# Essence: indexing into the GroupBy object
# The partial-column aggregation of 1.1, written as df['data1'].groupby(df['key1']), can also be obtained by indexing a GroupBy object built from the whole DataFrame
print('\n')
print(df.groupby('key1')['data1'].mean())
print('\n')
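# Indexing with a list of column names keeps the DataFrame shape instead of
# returning a Series (a small sketch of the difference):
print(df.groupby('key1')[['data1']].mean())  # one-column DataFrame rather than a Series
print('\n')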
# 1.4 Grouping with dicts and Series
df = pd.DataFrame(np.random.randn(5, 5), index=['a', 'b', 'c', 'd', 'e'], columns=['zoo', 'zip', 'zero', 'zz', 'zzz'])
mapping = {'a': 'red', 'b': 'green', 'c': 'red', 'd': 'green', 'e': 'pink'}
print(df.groupby(mapping).mean())
print('\n')
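# A Series works the same way as a dict: pandas treats it as a fixed-size mapping
# from index labels to group names (sketch):
map_series = pd.Series(mapping)
print(df.groupby(map_series).count())
print('\n')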
# 1.5 Grouping with functions
print(df.groupby(len, axis=1).sum())  # len is applied to each column name along axis=1, and the resulting length is used as the grouping key
print('\n')
# Note: the grouping keys passed to groupby can mix several types
key = ['one', 'one', 'one', 'two', 'two']
print(df.groupby([len, key], axis=1).sum())
print('\n')
# 1.6 Grouping by index levels
# When the index or columns is hierarchical (a MultiIndex), groupby can be told which level to group on
columns = pd.MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b'], [1, 2, 3, 4, 5]], names=['level1', 'level2'])
df = pd.DataFrame(np.random.randn(5, 5), columns=columns)
print(df.groupby(level='level1', axis=1).sum())  # the level keyword selects the index level to group on
print('-----------------------------------↑section1')
# 2. Data aggregation
# 2.1 Introduction
# The groupby method implements the split (grouping); methods such as .sum() and .mean() implement the apply and combine, i.e. the aggregation
# Concretely, the aggregation method can be any of a Series' reduction methods; the process is:
# step1 -- slice (group) the Series with groupby -- split
# step2 -- apply an aggregation function (one of the Series methods above) to each slice -- apply
# step3 -- assemble the per-slice results -- combine
# Besides the built-in Series methods, you can define your own aggregation functions and apply them to a GroupBy object via the agg or aggregate method
def pk2pk(gb):
    return gb.max() - gb.min()

pk2pk_lambda = lambda gb: gb.max() - gb.min()
df = pd.DataFrame({
    'key1': ['a', 'b', 'a', 'b', 'a'],
    'key2': ['one', 'one', 'two', 'two', 'one'],
    'data1': np.random.randint(1, 10, 5),
    'data2': np.random.randn(5)
})
print(df)
print(df.groupby('key1').agg(pk2pk))  # ordinary function
print(df.groupby('key1').aggregate(pk2pk_lambda))  # lambda function; agg and aggregate are equivalent
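# Non-reducing methods such as describe() also work on a GroupBy object, even
# though strictly speaking they are not aggregations (a quick sketch):
print(df.groupby('key1')['data1'].describe())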
print('\n')
# 2.2 Column-wise and multiple-function application
# Sections 2.2 and 2.3 describe advanced aggregation features, illustrated with a tipping data set
data = pd.read_csv('./data_set/tips.csv')
# Add a "tip as a percentage of the total bill" column
data['tip_p'] = data['tip'] / data['total_bill']
print(data.head())
print('\n')
# Advanced aggregation: use different aggregation functions for different columns, or several aggregation functions at once
grouped = data.groupby(['smoker', 'day'])  # GroupBy object
grouped_p = grouped['tip_p']  # a slice of the GroupBy object
print(grouped_p.agg('mean'))  # apply the mean method to the slice
print('\n')
# Note: built-in functions are passed to agg as strings, custom functions as the function object itself
# Passing a list of functions produces columns named after the functions
print(grouped_p.agg(['mean', 'std', pk2pk, pk2pk_lambda]))
print('\n')
# A lambda's default name is '<lambda>', which identifies nothing, so another way to tell the columns apart is needed: custom column names
# Custom column names are passed as (name, function) tuples, where name is the custom name and function is the function (or its string name)
print(grouped_p.agg([('mean', 'mean'), ('std', 'std'), ('peak2peak', pk2pk), ('peak2peak_2', pk2pk_lambda)]))
print('\n')
# More generally, multiple aggregation functions can be applied to multiple columns of a DataFrame, in which case the result is a DataFrame with hierarchically indexed columns
# This is equivalent to aggregating the columns one at a time and then concat(axis=1)-ing the results together
print(grouped[['tip', 'tip_p']].agg(['mean', 'std', pk2pk, pk2pk_lambda]))
print('\n')
# To apply different aggregation functions to different columns, pass agg a dict mapping column names to functions
print(grouped[['tip', 'tip_p']].agg({'tip': 'mean', 'tip_p': 'std'}))
print('\n')
print(grouped[['tip', 'tip_p']].agg({'tip': ['mean', 'std', pk2pk], 'tip_p': ['sum', pk2pk_lambda]}))
print('\n')
# 2.3 Returning aggregated data without row indexes
# By default, the grouping keys become the index of the result; the as_index keyword of groupby returns the result without that index, and the grouping keys become ordinary columns (Series) of the aggregated result
print(data.groupby(['smoker'], as_index=False).mean())
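# The same frame can be produced by resetting the index after an ordinary
# aggregation (sketch); as_index=False simply avoids the extra step:
print(data.groupby(['smoker']).mean().reset_index())
print('\n')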
print('---------------------------------------↑section2')
# 3. Group-wise operations and transformations
# Besides the aggregations above (the various aggregation functions), group operations can also be implemented with transform and apply
# transform does not change the index of the original DataFrame (or Series): it broadcasts the result of the group operation back to every element of each group, forming one or more new columns
print(data)
print('\n')
print(data.groupby('day').transform(np.mean))  # this can be stitched onto the original DataFrame with pd.concat(..., axis=1)
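# A common use of transform is de-meaning within groups: each group's mean is
# broadcast back to its rows and subtracted (a minimal sketch):
demean = lambda s: s - s.mean()
print(data.groupby('day')['tip'].transform(demean).head())
print('\n')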
# 3.1 apply: general split-apply-combine
# apply accepts any function that processes a group; the returned result is entirely determined by the function passed in
sort = lambda df, column, n: df.sort_values(by=column)[-n:]
print(data.groupby('smoker').apply(sort, column='tip', n=10))  # the 10 rows with the largest tips within each group
print('\n')
# In the example above, the group keys and the original index form a hierarchical index; but the group-key information is already contained in the original DataFrame, so the group_keys keyword can be set to False to disable the group keys
print(data.groupby('smoker', group_keys=False).apply(sort, column='tip', n=10))
print('\n')
# 3.2 Quantile and bucket analysis
# This section feeds the interval results of cut and qcut into groupby to implement bucket analysis
df = pd.DataFrame({
    'data1': np.random.randn(100),
    'data2': np.random.randn(100)})
cut_data1 = pd.cut(df['data1'], 5)  # cut into 5 intervals of equal length
# Group data2 by the data1 buckets and compute each group's count, mean, standard deviation, max and min
get_stats = lambda gp: {'count': gp.count(), 'max': gp.max(), 'min': gp.min(), 'std': gp.std(), 'mean': gp.mean()}
print(df['data2'].groupby(cut_data1).apply(get_stats).unstack())
print('\n')
# qcut works the same way, except that the buckets change from equal length to equal count
print(df['data2'].groupby(pd.qcut(df['data1'], 5)).apply(get_stats).unstack())
print('\n')
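# Passing labels=False to qcut returns bucket numbers instead of intervals;
# these work as grouping keys just as well (sketch, reusing get_stats):
bucket_ids = pd.qcut(df['data1'], 10, labels=False)  # decile bucket numbers 0-9
print(df['data2'].groupby(bucket_ids).apply(get_stats).unstack())
print('\n')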
# 3.3 Example: filling missing values with group-specific values
# In essence: group first, then apply the missing-value filler fillna to each group
df = pd.DataFrame({
    'key1': ['a', 'b', 'a', 'b', 'a'],
    'key2': ['one', 'one', 'two', 'two', 'one'],
    'data1': np.random.randint(1, 10, 5),
    'data2': np.random.randn(5)
})
df.loc[2:3, 'data2'] = np.nan
print(df)
print('\n')
df = df.groupby('key1', group_keys=False).apply(lambda gp: gp.fillna(gp.mean()))
print(df)
print('-------------------------------------↑section3')
# The remaining examples in the book are further applications of apply with different functions, including random sampling, group correlation coefficients, and group-wise linear regression
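# For instance, random sampling within each group can be written with apply
# (a minimal sketch; the sample size of 2 is illustrative):
sample_two = lambda gp: gp.sample(2)
print(data.groupby('smoker', group_keys=False).apply(sample_two))
print('\n')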
# 4. Pivot tables and cross-tabulation
# Pivot tables and crosstabs can both be implemented with groupby and can be regarded as shortcuts for it
# 4.1 Pivot tables (pivot_table)
# Group by day and time along axis=0 and by smoker along axis=1; the pivot_table method's default aggregation is the mean of each group
print(data.pivot_table(['tip', 'size'], index=['day', 'time'], columns='smoker'))
# The same result can be produced with groupby
print(data.groupby(['day', 'time', 'smoker'])[['size', 'tip']].mean().unstack())
# Setting the margins keyword of pivot_table() to True adds partial totals, for both rows and columns
print('\n')
print(data.pivot_table(['tip', 'size'], index=['day', 'time'], columns='smoker', margins=True))
# pivot_table aggregates with the mean by default; a different function can be specified explicitly with the aggfunc keyword
print('\n')
print(data.pivot_table(['tip', 'size'], index=['day', 'time'], columns='smoker', margins=True, aggfunc='sum'))
# 4.2 Cross-tabulation (crosstab)
# A crosstab is a special pivot table that computes group frequencies
print('\n')
print(pd.crosstab([data['time'], data['day']], data['smoker'], margins=True))
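# Equivalently, a crosstab is a pivot table whose aggfunc counts rows (sketch:
# counting the non-null 'tip' values per cell yields the same frequencies here):
print(data.pivot_table('tip', index=['time', 'day'], columns='smoker', aggfunc='count', margins=True))
print('\n')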
print('--------------total time is %.5f s' % (time.time() - start))
# That's all
# Python for Data Analysis, Chapter 9: data aggregation and group operations