This article shares the basic operations of pandas, a Python data analysis library. It should be a useful reference, and I hope it helps you. Let's take a look together.
What is Pandas?
Is it this?
... Apparently pandas is not as cute as this guy ...
Let's take a look at how the official pandas website defines it:
pandas is an open source library providing easy-to-use data structures and data analysis tools for the Python programming language.
Obviously, pandas is a very powerful data analysis library for Python!
Let's learn it!
1. pandas Series
import numpy as np
import pandas as pd

# Series is the pandas function for creating a sequence, similar to what we usually call an array
s_data = pd.Series([1, 3, 5, 7, np.nan, 9, 11])
print s_data
2. The pandas data structure DataFrame
import numpy as np
import pandas as pd

# Generate time points going forward from 20170220
dates = pd.date_range('20170220', periods=6)
# DataFrame constructor: the row index is the time points and the column index is ABCD
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print data
print
print data.shape
print
print data.values
3. Some DataFrame operations (1)
import numpy as np
import pandas as pd

# Design a dictionary
d_data = {'A': 1, 'B': pd.Timestamp('20170220'), 'C': range(4), 'D': np.arange(4)}
print d_data
# Use the dictionary to generate a DataFrame
df_data = pd.DataFrame(d_data)
print df_data
# The type of each column in the DataFrame
print df_data.dtypes
# Print column A
print df_data.A
# Print column B
print df_data.B
# The type of column B
print type(df_data.B)
4. Some DataFrame operations (2)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print data
print
# Output the head of the DataFrame (the first 5 rows by default)
print data.head()
# Output the first row of the DataFrame
print data.head(1)
# Output the tail of the DataFrame (the last 5 rows by default)
print data.tail()
# Output the last row of the DataFrame
print data.tail(1)
# Output the row index
print data.index
# Output the column index
print data.columns
# Output the DataFrame's values
print data.values
# Output descriptive statistics of the DataFrame
print data.describe()
5. Some DataFrame operations (3)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print data
print
# Transpose
print data.T
# Output the dimensions
print data.shape
# Dimensions after transposing
print data.T.shape
# Sort by the column index
print data.sort_index(axis=1)
# Sort by the column index, in descending order
print data.sort_index(axis=1, ascending=False)
# Sort by the row index, in descending order
print data.sort_index(axis=0, ascending=False)
# Sort in ascending order by the values in column A
print data.sort_values(by='A')
6. Some DataFrame operations (4)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print data
# Output column A
print data.A
# Output column A
print data['A']
# Output rows 3 and 4 (positional slice)
print data[2:4]
# Output rows 3 and 4 (slice by date label)
print data['20170222':'20170223']
# Output rows 3 and 4 (label-based, with loc)
print data.loc['20170222':'20170223']
# Output rows 3 and 4 (position-based, with iloc)
print data.iloc[2:4]
# Output columns B and C
print data.loc[:, ['B', 'C']]
7. Some DataFrame operations (5)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print data
# Output the rows where column A is greater than 0
print data[data.A > 0]
# Output the values greater than 0; values less than or equal to 0 are replaced with NaN
print data[data > 0]
# Copy data
data2 = data.copy()
print data2
tag = ['a'] * 2 + ['b'] * 2 + ['c'] * 2
# Add a TAG column to data2, assigned from tag
data2['TAG'] = tag
print data2
# Print the rows whose TAG column is a or c
print data2[data2.TAG.isin(['a', 'c'])]
8. Some DataFrame operations (6)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
data = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print data
# Assign 100 to the element in the first row, first column
data.iat[0, 0] = 100
print data
# Assign range(6) to the elements of column A
data.A = range(6)
print data
# Assign 200 to the elements of column B
data.B = 200
print data
# Assign a constant to the elements of columns 3 and 4
# (the value is missing from the source text; 1000 is used here as an example)
data.iloc[:, 2:5] = 1000
print data
9. Some DataFrame operations (7)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print df
# Redefine the index and add column E
dfl = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
print dfl
# Assign 2 to rows 2 and 3 of column E
dfl.loc[dates[1:3], 'E'] = 2
print dfl
# Drop the rows that contain NaN elements
print dfl.dropna()
# Replace NaN elements with 5
print dfl.fillna(5)
# Check whether each element is NaN
print pd.isnull(dfl)
# Compute the mean of each column
print dfl.mean()
# Compute the cumulative sum of each column
print dfl.cumsum()
10. Some DataFrame operations (8)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print df
dfl = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
print dfl
# Compute the mean of each row
print dfl.mean(axis=1)
# Create a Series and shift it by two positions
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
print s
# Subtract s from df
print df.sub(s, axis='index')
# Compute the cumulative sum of each column
print df.apply(np.cumsum)
# Maximum minus minimum of each column
print df.apply(lambda x: x.max() - x.min())
11. Some DataFrame operations (9)
import numpy as np
import pandas as pd

dates = pd.date_range('20170220', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print df

# Define a function
def _sum(x):
    print(type(x))
    return x.sum()

# apply can take a function as its argument
print df.apply(_sum)
# (the size is missing from the source text; 15 is used here as an example)
s = pd.Series(np.random.randint(10, 20, size=15))
print s
# Count how many times each element occurs in the Series
print s.value_counts()
# Return the most frequently occurring element(s)
print s.mode()
12. Some DataFrame operations (10)
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
print df
# Concatenation function
dfl = pd.concat([df.iloc[:3], df.iloc[3:7], df.iloc[7:]])
print dfl
# Check element-wise whether the two DataFrames are equal
print df == dfl
13. Some DataFrame operations (11)
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
print df
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print left
print right
# Merge the data on key
print pd.merge(left, right, on='key')
s = pd.Series(np.random.randint(1, 5, size=4), index=list('ABCD'))
print s
# Append the Series as a new row
print df.append(s, ignore_index=True)
14. Some DataFrame operations (12)
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'bar'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
print df
print
# Group by column A and sum
print df.groupby('A').sum()
print
# Group first by column A, then by column B, and sum
print df.groupby(['A', 'B']).sum()
print
# Group first by column B, then by column A, and sum
print df.groupby(['B', 'A']).sum()
15. Some DataFrame operations (13)
import pandas as pd
import numpy as np

# zip packs the items into tuples
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
print tuples
# Build a multi-level index
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
print index
print
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print df
print
# Turn the column index into a row index
print df.stack()
16. Some DataFrame operations (14)
import pandas as pd
import numpy as np

tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print df
print
stacked = df.stack()
print stacked
# Convert the row index to a column index
print stacked.unstack()
# Convert twice
print stacked.unstack().unstack()
17. Some DataFrame operations (15)
import pandas as pd
import numpy as np

df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['A', 'B', 'C'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})
print df
# Use A and B as the row index and C as the column index, aggregating the values of D
print pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
# Take the rows where column A equals 'one', group by column C, and compute the mean
print df[df.A == 'one'].groupby('C').mean()
18. Time Series (1)
import pandas as pd
import numpy as np

# Create a time series of 600 points at one-second intervals, starting from 20170220
rng = pd.date_range('20170220', periods=600, freq='s')
print rng
# A Series indexed by the time series
print pd.Series(np.random.randint(0, len(rng)), index=rng)
19. Time Series (2)
import pandas as pd
import numpy as np

rng = pd.date_range('20170220', periods=600, freq='s')
ts = pd.Series(np.random.randint(0, len(rng)), index=rng)
# Resample: sum the samples in 2-minute bins
print ts.resample('2Min', how='sum')
# List the quarters from 2011 Q1 to 2017 Q1
rng1 = pd.period_range('2011Q1', '2017Q1', freq='Q')
print rng1
# Convert to timestamps
print rng1.to_timestamp()
# Time addition and subtraction
print pd.Timestamp('20170220') - pd.Timestamp('20170112')
print pd.Timestamp('20170220') + pd.Timedelta(days=12)
20. Categorical data
import pandas as pd
import numpy as np

df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
print df
# Add categorical data, using the values of raw_grade as the basis for the categories
df["grade"] = df["raw_grade"].astype("category")
print df
# Print the categories
print df["grade"].cat.categories
# Change the categories
df["grade"].cat.categories = ["very good", "good", "very bad"]
print df
# Sort by the value of grade
print df.sort_values(by='grade', ascending=True)
# Show the number of rows for each grade
print df.groupby("grade").size()
21. Data visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

ts = pd.Series(np.random.randn(1000), index=pd.date_range('20170220', periods=1000))
ts = ts.cumsum()
print ts
ts.plot()
plt.show()
22. Reading and writing data
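import pandas as pd
import numpy as np

# (the row count is missing from the source text; 10 rows are used here as an example)
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
# Save the data (relative path)
df.to_csv('data.csv')
# Read the data back
print pd.read_csv('data.csv', index_col=0)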
The data is saved in this file:
Open it to take a look: