import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

--------------- numpy ---------------
arr = np.array([...], dtype=np.float64)          # create an array
np.zeros((3, 6))   np.empty((2, 3, 2))   np.arange(...)   # array-creation helpers
arr.dtype   arr.ndim   arr.shape                 # element type, number of dimensions, shape
arr.astype(np.int32)                             # cast dtype (e.g. np.float64, np.string_, np.unicode_)
arr * arr   arr - arr   1 / arr                  # element-wise arithmetic
arr = np.arange(32).reshape((8, 4))              # reshape to 8 rows x 4 columns
arr[1:3, :]                                      # normal slicing
arr[[...]]                                       # fancy indexing
arr.T   arr.transpose((...))   arr.swapaxes(...) # transpose / axis reordering
arr.dot(...)                                     # matrix inner product
np.sqrt(arr)   np.exp(arr)                       # element-wise ufuncs
np.random.randn(8)                               # normally distributed random values
np.maximum(x, y)
np.where(cond, xarr, yarr)                       # take xarr where cond is true, otherwise yarr
arr.mean()   arr.mean(axis=1)                    # arithmetic mean (optionally along an axis)
arr.sum()   arr.std()   arr.var()                # sum, standard deviation, variance
arr.min()   arr.max()                            # minimum, maximum
arr.argmin()   arr.argmax()                      # index of the minimum / maximum
arr.cumsum()   arr.cumprod()                     # cumulative sum / cumulative product of all elements
arr.all()   arr.any()                            # whether all / any elements are true
arr.sort()   arr.sort(1)                         # sort in place, optionally along axis 1
np.unique(arr)                                   # distinct values
np.in1d(arr1, arr2)                              # whether each value of arr1 is contained in arr2
np.load()   np.loadtxt()   np.save()   np.savez()  # read / save array files
np.concatenate([arr, arr], axis=1)               # join two arrays along the column axis

--------------- pandas ---------------
ser = Series()
ser = Series([...], index=[...])                 # one-dimensional array; a dict converts directly to a Series
ser.values   ser.index                           # the values and the index of the Series
ser.reindex([...], fill_value=0)                 # conform to a new index, filling missing entries with 0
ser.isnull()   pd.isnull(ser)   pd.notnull(ser)  # detect missing data
ser.name =   ser.index.name =                    # name of the Series itself / name of its index
ser.drop('x')                                    # drop the value whose index is 'x'
ser + ser                                        # arithmetic between Series
ser.sort_index()   ser.order()                   # sort by index / sort by value (ser.order() is sort_values() in modern pandas)
df = DataFrame(data, columns=[...])              # tabular data structure with both row and column indexes
df.ix['x']                                       # row whose index is 'x' (for a Series just use ser['x']; .ix is .loc in modern pandas)
del df['ly']                                     # delete column 'ly' with del
df.T                                             # transpose
df.index.name   df.columns.name   df.values
df.drop([...])
df + df   df1.add(df2, fill_value=0)             # arithmetic between DataFrames
df - ser                                         # arithmetic between a DataFrame and a Series
f = lambda x: x.max() - x.min()
df.apply(f)                                      # apply a function along an axis
df.sort_index(axis=1, ascending=False)           # sort by index, here along the columns, descending
df.sort_index(by=['a', 'b'])                     # sort by the values of columns a and b (sort_values in modern pandas)
ser.rank()   df.rank(axis=1)                     # ranking: assign a rank value to each entry
df.sum()   df.sum(axis=1)                        # sum by column / by row
df.mean(axis=1, skipna=False)                    # mean of each row, taking NA values into account
df.idxmax()                                      # index of the maximum value
df.cumsum()                                      # cumulative sum
df.describe()   ser.describe()                   # summary statistics: count, mean, std, min, max, etc.
ser.unique()                                     # distinct values
ser.value_counts()   df.value_counts()           # Series whose index is the unique values and whose values are their frequencies
ser.isin(['x', 'y'])                             # boolean mask: is each value of ser equal to 'x' or 'y'
ser.dropna()   ser.isnull()   ser.notnull()   ser.fillna(0)   # handle missing data; the same methods exist on DataFrame
df.unstack()                                     # pivot row index into the columns (and values accordingly)
df.unstack().stack()
df.swaplevel('key1', 'key2')                     # takes two level numbers or names and interchanges them
df.sortlevel(1)                                  # sort by index level 1; DataFrame row and column indexes can each have two levels
df.set_index(['c', 'd'], drop=False)             # move columns c and d into the row index; drop=False keeps c and d as columns too
read_csv   read_table   read_fwf                 # read files delimited by comma, by tab ('\t'), or with no delimiter (fixed column widths)
pd.read_csv('...', nrows=5)                      # read only the first 5 rows of a file
pd.read_csv('...', chunksize=1000)               # read in chunks, to avoid a large file filling memory
pd.load()                                        # pandas also has a load method, for reading binary files
pd.ExcelFile('....xls').parse('Sheet1')          # read Sheet1 of an Excel file
df.to_csv('....csv', sep='|', index=False, header=False)   # write to CSV with '|' as delimiter (comma by default); suppress row and column labels
pd.merge(df1, df2, on='key', suffixes=('_left', '_right')) # merge two data sets, like a database inner join, on the shared column 'key'; suffixes names the two result columns key_left and key_right
pd.merge(df1, df2, left_on='lkey', right_on='rkey')        # same inner-join-style merge, but when the join columns have different names in each frame
pd.merge(df1, df2, how='outer')                            # merge as an outer join; how='left' / how='inner' select other join types; multiple keys can also be combined
df1.join(df2, on='key', how='outer')                       # another way to merge
pd.concat([ser1, ser2, ser3], axis=1)                      # concatenate three Series along the columns
ser1.combine_first(ser2)   df1.combine_first(df2)          # patch object 1 with values from object 2, aligning on the index
df.stack()   df.unstack()                                  # rotate columns into rows / rows into columns
df.pivot()
df.duplicated()   df.drop_duplicates()                     # flag duplicate rows / remove duplicate rows
df['...'].map(lambda x: abs(x))                            # map a function over a chosen column of the DataFrame
ser.replace(-999, np.nan)                                  # replace every -999 with NaN
df.rename(index={}, columns={}, inplace=True)              # rename index entries; inplace=True modifies the data set in place
pd.cut(ser, bins)                                          # bin ser's values by the bin edges; the result has labels and levels attributes
df[(np.abs(df) > 3).any(1)]                                # output the rows that contain a value with absolute value greater than 3
permutation   take                                         # used for random reordering of rows
pd.get_dummies(df['key'], prefix='key')                    # dummy/indicator columns, each column index prefixed with 'key'
df[...].str.contains()
df[...].str.findall(pattern, flags=re.IGNORECASE)
df[...].str.match(pattern, flags=...)
df[...].str.get()                                          # vectorized string functions

---- plotting ----
ser.plot()   df.plot()                                     # pandas plotting tools; parameters include label, ax, style, alpha, kind, logy, use_index, rot, xticks, xlim, grid, etc. (see page 257)
kind='kde'                                                 # density plot
kind='bar'   kind='barh'                                   # vertical / horizontal bar chart; stacked=True gives a stacked plot
ser.hist(bins=50)                                          # histogram
plt.scatter(x, y)                                          # scatter plot of the points (x, y)
pd.scatter_matrix(df, diagonal='kde', color='k', alpha=0.3)  # scatter plots of every pair of df's columns, with density plots on the diagonal

---- aggregation and grouping ----
groupby() groups along axis=0 by default, or can group along axis=1; you can iterate over the groups with a for loop.
df.groupby(df['key1'])                                     # group the whole DataFrame by key1
df['key2'].groupby(df['key1'])                             # group the key2 column by key1
df['key3'].groupby(df['key1'], df['key2'])                 # group the key3 column first by key1, then by key2
df['key2'].groupby(df['key1']).size()                      # size() returns a Series containing the size of each group
df.groupby(df['key1'])['data1']    is equivalent to   df['data1'].groupby(df['key1'])
df.groupby(df['key1'])[['data1']]  is equivalent to   df[['data1']].groupby(df['key1'])
df.groupby(mapping, axis=1)   ser.groupby(mapping)         # define a mapping dict and group according to it
df.groupby(len)                                            # group via a function, e.g. the len function
df.groupby(level='...', axis=1)                            # group by an index level
df.groupby([...], as_index=False)                          # disable the group keys as index; returns the data without that index
df.groupby(...).agg(['mean', 'std'])                       # use the agg method to apply several aggregation functions at once
df.groupby(...).transform(np.mean)                         # transform() applies the function to each group and broadcasts the result back
df.groupby(...).apply(...)                                 # apply splits the object into pieces, calls the passed function on each piece, then tries to glue the pieces back together

---- pivoting and cross-tabulation ----
df.pivot_table(['...', '...'], rows=[...], cols='...', margins=True)  # margins=True adds an "All" column (rows/cols are index/columns in modern pandas)
pd.crosstab(df.col1, df.col2, margins=True)                # margins works as above

--------------- matplotlib ---------------
fig = plt.figure()                                         # the base figure object
ax = fig.add_subplot(2, 2, 1)                              # a 2x2 grid of subplots, with the 1st currently selected
fig, axes = plt.subplots(nrows, ncols, sharex, sharey)     # create a figure, specifying rows, columns, shared x-axis ticks, shared y-axis ticks
plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=None)
                                                           # adjust the spacing between subplots; wspace and hspace control the width and height padding as percentages
ax.plot(x, y, linestyle='--', color='g')                   # plot using the x, y coordinates, setting line style and color
ax.set_xticks([...])   ax.set_xticklabels([...])           # set the x-axis tick positions and labels
ax.set_xlabel('...')                                       # set the x-axis name
ax.set_title('...')                                        # set the chart title
ax.legend(loc='best')                                      # add a legend; loc tells matplotlib where to place it
ax.text(x, y, 'hello', family='monospace', fontsize=10)    # put the annotation 'hello' at point (x, y), font size 10
ax.add_patch()                                             # add a shape (patch) to the plot
plt.savefig('...png', dpi=400, bbox_inches='tight')        # save the figure; dpi is the resolution; bbox_inches='tight' trims surrounding whitespace

from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt                            # Basemap can be used to draw maps

--------------- time series ---------------
pd.to_datetime(datestrs)                                   # parse string dates into datetime format
pd.date_range('1/1/2000', periods=1000)                    # generate a sequence of timestamps
ts.resample('D', how='mean')                               # resample: convert the series to a fixed daily frequency and compute the mean; how='ohlc' gives the four stock price indicators
                                                           # downsampling aggregates: a short frequency (day) becomes a long one (month), with the values combined
                                                           # upsampling interpolates: a long frequency becomes a short one, producing new in-between values
ts.shift(2, freq='D')   ts.shift(-2, freq='D')             # shift back / forward by 2 days
now + Day()   now + MonthEnd()
import pytz
pytz.timezone('US/Eastern')                                # time-zone operations; requires installing pytz
pd.Period('2010', freq='A-DEC')                            # a Period represents a span of time, called a period (the original's first argument is garbled; a year string such as '2010' is typical)
pd.PeriodIndex                                             # index made of periods
ts.to_period('M')                                          # convert timestamps to periods
pd.rolling_mean(...)   pd.rolling_std(...)                 # moving-window functions: rolling mean, rolling standard deviation (ser.rolling(...).mean() / .std() in modern pandas)
Python pandas NumPy matplotlib common methods and functions