import pandas as pd, numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
1 mutate + ifelse
df['E'] = np.where(df['D'] >= 0, '>=0', '<0')
df['F'] = np.random.randint(0, 2, 6)
df['F'] = df['F'].apply(str)  # for a single column
df.applymap(str)  # this is the equivalent of mutate_each
2 Table
pd.value_counts(df["E"])
pd.pivot_table(df, index=['E', 'F'])
3 index: the DataFrame counterpart of R's rownames; unlike in R, a DataFrame may have multi-level row names
df.index
df.set_index(['A'], drop=0, append=1)  # set an existing column as the index; the previous index can be retained (append), and the column can optionally be removed from the original data (drop)
df['dates'] = df.index  # create a new column of dates from the index
df.reset_index(level=0, inplace=True)  # same as above
df.reset_index(level=['index'])  # same as above
4 Deleting columns and rows
df = df.drop('index', axis=1)  # multiple columns can be deleted at once
df.drop(df.index[[1, 3]])
df.rename(columns={'A': 'AA', 'B': 'BB', 'C': 'cc', 'D': 'DD', 'E': 'ee', 'F': 'FF'}, inplace=True)
df.rename(columns=lambda x: x[1:].upper(), inplace=True)  # an anonymous function can also be used
5 column names
df.columns
df.columns = ['a', 'b', 'C', 'e', 'D', 'F']  # renaming
df.rename(columns={'A': 'AA', 'B': 'BB', 'C': 'cc', 'D': 'DD', 'E': 'ee', 'F': 'FF'}, inplace=True)
df.rename(columns=lambda x: x[1:].upper(), inplace=True)  # a function can also be used; the inplace parameter replaces the original variable instead of returning a (deep) copy
6 Dummy variables
pd.Series(['a|b', 'a|c']).str.get_dummies()
7 The bare DataFrame matrix, i.e. without the column names and the index
df.values
df.get_values()
8 Summary
df.describe()  # calculations are only made for numeric variables
9 Rbind
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
df.append(df2, ignore_index=True)
10 group by: grouped summary calculations, similar to pivot_table
df.groupby(['E', 'F']).mean()
df.groupby(['E', 'F']).agg(['sum', 'mean'])
pd.pivot_table(df, index=['E', 'F'], aggfunc=[np.sum, np.mean])
df.pivot_table(index=['E', 'F'], aggfunc=[np.sum, np.mean])  # same as above
df.groupby(['E', 'F']).agg({'A': ['mean', 'sum'], 'B': 'min'})  # groupby can also be written this way
11 sort
df.sort(['A', 'B'])  # sort by columns; the na_position parameter controls where NaN values are placed
df.sort_index()  # sort by index
12 filtering
# value filtering
df[df.E.str.contains(">")]  # contains a character; the contains filter is actually a regular expression
df[df.F.isin(['1'])]  # value is inside the list
13 Variable Selection
df['A']  # a single column
df[0:3]  # rows
df['20130102':'20130104']  # filter by index
df.loc[:,]  # similar to the way rows and columns of a data.frame are selected in R
df.iloc[:,]  # iloc can only use integer positions
From R to Python (Part 1)