Summary: 1. Create objects; 2. View data; 3. Select and set; 4. Missing value processing; 5. Related operations; 6. Aggregation; 7. Reshaping
8. Time series
9. Categorical type; 10. Plotting
11. Import and save data
# Coding=utf-8
import numpy as np
import pandas as pd
# ### 1. Create objects
# 1. Passing a list creates a Series; pandas builds a default integer index.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
# print(s)

# 2. Create a DataFrame from a NumPy array, a datetime index, and column labels.
dates = pd.date_range("20130101", periods=6)
# print(dates)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
# print(df)

# 3. Create a DataFrame from a dict whose values can each be converted to a
#    Series-like column; scalars ("A", "B", "F") are broadcast to the length
#    of the array-like entries (4 rows here).
df2 = pd.DataFrame(
    {
        "A": 1,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["Test", "Train", "Test", "Train"]),
        "F": "foo",
    }
)
# print(df2)
# ### 2. View data
# 1. View the first and last rows of the frame (5 rows by default)
# print(df.head())
# print(df.tail(3))
# 2. Display the index, the columns, and the underlying NumPy data
# print(df.index)
# print(df.columns)
# print(df.values)
# 3. describe() gives a quick statistical summary of the data; it is a method, so the parentheses cannot be omitted
# print(df.describe())
# 4. Transpose the data
# print(df.T)
# 5. Sort by an axis (here: the column labels, in descending order)
# print(df.sort_index(axis=1, ascending=False))
# 6. Sort by value; sort_values(by=...) is the recommended spelling and replaces the older df.sort(columns=...)
# print(df.sort(columns="B"))
# print(df.sort_values(by="B"))
# ### 3. Select and set
# Getting 1. Select a single column, which returns a Series; equivalent to df.A
# print(df["A"])
# Getting 2. Select with [], which slices the rows
# print(df[0:3][1:2])
# print(df[0:3])
# The methods above access data through the [] subscript; below, .loc[] selects by label
# Select by label: 1. Use a label to get a cross-section
# print(df.loc[dates[0]])
# Select by label: 2. Select on multiple axes by label
# print(df.loc[:, ["A", "B"]])
# Select by label: 3. Label slicing
# print(df.loc["20130102":"20130104", ["A", "B"]])
# Select by label: 4. Reduce the dimensions of the returned object
# print(df.loc["20130102", ["A", "B"]])
# Select by label: 5. Get a scalar value
# print(df.loc[dates[0], "A"])
# Select by label: 6. Fast access to a scalar (the .at accessor)
# print(df.at[dates[0], "A"])
# Select by position: 1. Pass an integer position (selects a row)
# print(df.iloc[3])
# Select by position: 2. Slice by integer positions
# print(df.iloc[3:5, 0:2])
# Select by position: 3. Pass lists of integer positions
# print(df.iloc[[1, 2, 3], [0, 2]])
# Select by position: 4. Slice rows
# print(df.iloc[1:3, :])
# Select by position: 5. Slice columns
# print(df.iloc[:, 1:3])
# Select by position: 6. Get a specific value
# print(df.iloc[1, 1])
# print(df.iat[1, 1])
# Logical expressions can be used to select a subset of the DataFrame
# Boolean indexing: 1. Use the values of a single column to select rows
# print(df[df.A > 0])
# Boolean indexing: 2. Use a "where"-style condition on the whole frame (non-matching cells become NaN)
# print(df[df > 0])
# Boolean indexing: 3. Use the isin() method to filter
# print(df2[df2["E"].isin(["Test"])])
# Setting: assign a new column of values from a NumPy array
# df.loc[:, "E"] = np.array([5] * len(df))
# print(df)

# reindex changes/adds/deletes the labels on the given axes and returns a new
# object; cells that had no data before (the new column "E" here) hold np.nan.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
# print(df1)
# ### 4. Missing value processing (pandas uses np.nan to represent missing data; by default it is excluded from computations)
# 1. Drop rows that contain missing values
# print(df1.dropna(how="any"))
# 2. Fill in missing values
# print(df1.fillna(value=5))
# 3. Detect missing values (returns a boolean mask)
# print(pd.isnull(df1))
# ### 5. Related operations
# apply (apply a function to the data)
# print(df.apply(np.cumsum))  # cumulative sum
# print(df.apply(lambda x: x.max() - x.min()))  # x is the current column (a Series)
# Value counts
# print(s.value_counts())
# ### 6. Aggregation
# 1. concat (concatenation; the default join is outer)
# piece = [df[:2], df[2:4], df[4:]]
# print(pd.concat(piece))  # default axis=0 stacks the pieces vertically
# piece = [df.loc[:, ["A", "B"]], df.loc[:, ["C", "D"]]]
# print(pd.concat(piece, axis=1))  # axis=1 joins the pieces side by side
# 2. Table joins (join, merge)
# left = pd.DataFrame({
#     "key": ["foo", "foo1"], "lval": [1, 2]
# })
# right = pd.DataFrame({
#     "key": ["foo", "foo2"], "rval": [4, 5]
# })
# print(pd.merge(left, right, how="inner", left_on="key", right_on="key"))  # inner join
# print(pd.merge(left, right, how="left", left_on="key", right_on="key"))  # left join
# print(pd.merge(left, right, how="right", left_on="key", right_on="key"))  # right join
# print(pd.merge(left, right, how="outer", left_on="key", right_on="key"))  # full outer join
# print(left.set_index("key").join([right.set_index("key")], how="outer"))  # join connects on the index
# 3. append
# print(df.append(other=[df, df]))  # can only concatenate vertically; on recent pandas use pd.concat([df, df, df]) instead
# 4. Grouping
# print(df.groupby("A").sum())
# print(df.groupby(["A", "B"]).sum())  # produces a hierarchical index
# print(df.groupby(["A", "B"])["C"].mean())
# print(df.groupby(df["A"]))
# ### 7. Reshaping
# 1. Stack
# tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
#                      'foo', 'foo', 'qux', 'qux'],
#                     ['one', 'two', 'one', 'two',
#                      'one', 'two', 'one', 'two']]))
# index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
# df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
# df2 = df[:4]
# print(df2)
# The stack function "compresses" a level in the DataFrame's columns to produce either:
#   a Series, in the case of a simple column Index
#   a DataFrame, in the case of a MultiIndex in the columns
# stacked = df2.stack()
# print(stacked)
# print(stacked.unstack())
# print(stacked.unstack(1))
# print(stacked.unstack(0))
# 2. Pivot table
# print(pd.pivot_table(df, values="D", index=["A", "B"], columns="C"))
# ### 8. Time series
# rng = pd.date_range("1/1/2012", periods=100, freq="S")
# ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
# print(ts.resample("5Min").sum())  # older pandas spelled this ts.resample("5Min", how="sum")
# ### 9. Categorical type
See the pandas documentation on categorical data.
# ### 10. Plotting
See the pandas documentation on plotting.
# ### 11. Import and save data
# Write the frame to a CSV file (the index is written as the first column).
df.to_csv("Data.csv")
# read_csv is a module-level pandas function, not a DataFrame method.
csv = pd.read_csv("Data.csv")
Official documents here
Python Pandas use