Importmatplotlib fromPandasImportDataFrameImportNumPy as NPImportPandas as PDImportMySQLdbImportMatplotlib.pyplot as Plt#DF =padaas Dataframe Object (two-dimensional tag array)#S =pandas Series object (one-dimensional tag array)db = MySQLdb.connect (host="localhost", port=3306, user="Root", passwd="1234", db='SPJ', charset="UTF8")#connecting to a databasefilename ='Count_day.csv'#File path namequery ='select * FROM J'#SQL query Statements #导入数据pd. Read_excel (filename) #从excel文件导入数据pd. read_table (filename) #和csv一样, you must delimit the delimiter Pd.read_json (json-string) # Import data from JSON-formatted string pd.read_html (URL) #解析url, HTML file, import the Tables table Pd.read_clipboard () #从粘贴板获取内容, pass it to Read_tables () # Import data from a SQL database Df=pd.read_sql (query,db) #从csv导入数据 #index_col The row index of the specified data as the dataframe of the rows is divided defaults think, separating df = Pd.read_csv ( Filename,sep= ', ', header=none,index_col=0) #导出数据df. To_csv (' ans.csv ', index=false,sep= ', ') #导出数据到csvdf. To_excel ( FileName) #导出数据到exceldf. To_sql (table_name,db) #导出数据到sqldf. To_json (filename) #以json格式导出数据到文本文件 # build Dataframe data yourself df = Pd. DataFrame ([[1,2,3],[4,5,6]],columns=[' F1 ', ' F2 ', ' F3 ']) #columns是列索引df = PD. DataFrame ({' User-id ': [' Line 1 ', ' Line 2 ', ' Line 3 '], ' item_id ': [' goods 1 ', ' Goods 2 ', ' Goods 3 ']}) #按列构造df = PD. DataFrame ([{' User-id ': ' User 1 ', ' item-id ': ' Product 1 '},{' user-id ': ' User 2 '}]) #按行构造 # view data show = Df.head (1) #head (n), view top n Rows show = Df.tail (1) #查看最后n列show = df.shape# View the number of rows and columns show = Df.info () #查看索引, data type and memory information show = Df.describe () #查看数值型列的汇总信息show = df[' Item-id '].value_counts (dropna=false) #查看Series对象的唯一值和计数show = df[' Item-id '].unique () #View series distinct values # data pick s = df[' User-id ']# (name) gets a column of data, the return value is SERIESDF = df[[' User-id ', ' Item-id ']]# (list) Select a few columns, return as Dataframeshow = S.iloc[0] #按位置选取数据 (Serieas can omit df[' user_id '][0]) show = df.iloc[0,0:2]# (Row,col) return the first row, from 0 to 2 columns show = s.loc[' Item-id ' #按索引选取数据show = Df.sample (frac=0.5) #按比例随机采样show = Df.sample (N=len (DF)) #指定采样个数" "#construct dataframe Data yourselfDF = PD. DataFrame ([[1,2,3],[4,5,6]],columns=['F1','F2','f3'])#columns is a column indexDF = PD. DataFrame ([{'User-id':'User 1','Item-id': 1},{'User-id':'User 2'}])#construct by RowDF = PD. DataFrame ({'user_id':['Line 1','Line 1','Line 3'],'item_id':['Item 1','Item 2','Item 3'],'test_id': [12,11,4]})#construct by Column" "#数据整理show = Df.isnull () #查看对象中的空值, returns a bool type show = Df.notnull () #查看对象中的非空值, returns a bool type show = Df.dropna (axis=0) # Delete all lines that contain spaces show = Df.dropna (axis=1) #删除所有包含空格的列show = Df.dropna (axis=0,thresh=2) #删除所有小于 (Thresh) a non-null value line show = Df.fillna ( ' Product ID ') #对所有空值填充, you can specify which columns (Df[].fillna ()) show = Df.fillna (Df.mode (). iloc[0]) #众值填充show = Df.fillna (Df.median ()) # The median fills show = df["Item-id"].fillna (' Fill ') #对某一列填充show = df["Item-id"].astype (float) #更改某一列的数据类型show = df["Item-id"]. Replace (1, ' one ') #用 ' one ' instead of all values equal to 1 show = Df.rename (Columns=lambda x:x+ "1") #批量更改列名show = Df.rename (Index=lambda x:x+ "1" ) #批量更改索引show = Df.rename (columns={' item-id ': ' Commodity ID '}) #选择性更改列名show = Df.set_index (' Item-id ', append=false) #将某一列变成索引, Append=true Add new Index DF1 = df.sample (n=1) DF2 = df.sample (frac=0.5) product Category DF3 = Pd.concat ([df1,df2]) #合并两个数据集show = df3.reset_ Index (Drop=false) #重置索引, Drop=true discard previous index # data processing show = df[df[' test_id ']>4] #选择 ' test_id ' column >4 line show = Df.sort_values (by= ' test_id ', ascending=true) #按照列 ' test_id ' sort, default ascending show = Df.sort_values (by=[' test_id ', ' item_id '],asCending=[true,false]) #先按列 ' test_id ' Row ascending, followed by ' item_id ' row descending show = Df.groupby (' user_id ') [' test_id '].apply (np.mean) #按 ' user_id ' test_id ' after the ' packet ', show = Df.pivot_table (index= ' user_id ', values=[' item_id ', ' test_id '],aggfunc=min) #按索引分组, values, the minimum value of each group show = Df.groupby (' user_id ', as_index=false) [' test_id '].agg ({' Product type ': ' Count ', ' Total merchandise ': ' Sum '}) #先分组, according to ' Test_ Id ' field to do statistics, as_index default will be based on the Grouping field as index show = df[' test_id '].apply (Np.mean) #对df中的每一列应用函数np. Meanshow = df.apply (Np.max,axis=1 ) #对df中的每一列应用函数np. Mean (mandatory type unification) for Index,row in Df.iterrows (): #index索引, row each row, output, select one or several of the fields output print (index,row[' use r_id '],row[' test_id ')) for KEY,DF in Df.groupby (' user_id '): #key = ' user_id ', DF is the data after grouping print (Key,len (DF), DF) #构造分组特征, to ' USER_ID ' group, on an S object, calculated rank (rank) #rank: Http://pandas.pydata.org/pandas-docs/stable/generated/pandas. Series.rank.htmlshow = Df.groupby (' user_id ', as_index=false) [' test_id '].rank (ascending=false,method= ' average ') # Data Merge df = Df1.append (DF2) #将df2中的行添加到df1的尾部df = Pd.concat ([Df1,df2],axis=1) #按列合并 (Axis=1), merge by Row (Axis=0) (both to ensure that the correspondingThe rows and columns are the same) df = Pd.merge (df1,df2,on= ' user_id ', how= ' inner ') #对df1, DF2 columns for SQL-like join Operations # Collection Difference Set calculation DF1 = PD. DataFrame ({' User ': [1,2,3,4], ' id ': [101,102,103,104]}) DF2 = PD. DataFrame ({' User ': [up]}) df2[' flag ']=1# set a flag column DF = pd.merge (df1,df2,on= ' user ', how= ' left ') #左连接df = df[ Df.flag.isnull ()].drop (' flag ', Axis=1) #对df1中出现的df2删除行, and delete ' flag ' column # statistic function show = Df.mean () #返回所有列的均值show = Df.corr () # Return column-listed relationship function show = Df.user_id.corr (df.test_id) #show = Df.count () #返回每一列非空值的个数show = Df.max () #每一列最大值show = Df.min () # The minimum value of each column is show = Df.median () #返回每一列的中位数show = DF.STD () #返回每一列的标准差show = Df.dtypes () #查看数据类型show = Df.isnull.sum () #查看每一列的空值个数" "#Paint FunctionZhfont = Matplotlib.font_manager. Fontproperties (fname=r'C:/WINDOWS/FONTS/FZYTK. TTF')#Chinese#Histogramhist = df['user_id'].hist ()#View variable DistributionsHist.plot ()#Line Charts = df['test_id']s.plot () df.plot ( )#Column ChartFig, axes = plt.subplots (2, 1) s= df['test_id']s.plot (Kind='Bar', Ax=axes[0], color='k', alpha=0.7) s.plot (Kind='Barh', Ax=axes[1], color='k', alpha=0.7) Plt.legend (prop=Zhfont) plt.show ()
Resources:
Http://pandas.pydata.org/pandas-docs/stable/api.html#binary-operator-functions
54410670
Python Padas Learning