Python Pandas Learning

Source: Internet
Author: User

# Python pandas learning notes.
#
# df = pandas DataFrame object (two-dimensional labelled array)
# s  = pandas Series object (one-dimensional labelled array)

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import MySQLdb

# Connect to a MySQL database
db = MySQLdb.connect(host="localhost", port=3306, user="root",
                     passwd="1234", db="SPJ", charset="utf8")

filename = 'count_day.csv'       # file path name
query = 'select * FROM J'        # SQL query statement

# --- Importing data ---
pd.read_excel(filename)          # import data from an Excel file
pd.read_table(filename)          # like read_csv, but you must specify the delimiter
pd.read_json(json_string)        # import data from a JSON-formatted string
pd.read_html(url)                # parse a URL/HTML file and import its tables
pd.read_clipboard()              # read clipboard content and pass it to read_table()
df = pd.read_sql(query, db)      # import data from a SQL database
# index_col specifies which column becomes the row index; sep is the delimiter
df = pd.read_csv(filename, sep=',', header=None, index_col=0)

# --- Exporting data ---
df.to_csv('ans.csv', index=False, sep=',')   # export data to CSV
df.to_excel(filename)                        # export data to Excel
df.to_sql(table_name, db)                    # export data to SQL
df.to_json(filename)                         # export data to a JSON text file

# --- Building DataFrame data yourself ---
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['f1', 'f2', 'f3'])  # columns is the column index
df = pd.DataFrame({'user_id': ['line 1', 'line 2', 'line 3'],
                   'item_id': ['goods 1', 'goods 2', 'goods 3']})      # construct by column
df = pd.DataFrame([{'user_id': 'user 1', 'item_id': 'product 1'},
                   {'user_id': 'user 2'}])                             # construct by row
df = pd.DataFrame({'user_id': ['line 1', 'line 1', 'line 3'],
                   'item_id': ['item 1', 'item 2', 'item 3'],
                   'test_id': [12, 11, 4]})                            # construct by column

# --- Viewing data ---
show = df.head(1)                # head(n): view the first n rows
show = df.tail(1)                # view the last n rows
show = df.shape                  # number of rows and columns
show = df.info()                 # index, data types and memory information
show = df.describe()             # summary statistics of the numeric columns
show = df['item_id'].value_counts(dropna=False)  # unique values and counts of a Series
show = df['item_id'].unique()    # distinct values of a Series

# --- Selecting data ---
s = df['user_id']                # select one column by name; returns a Series
df = df[['user_id', 'item_id']]  # select several columns (list); returns a DataFrame
show = s.iloc[0]                 # select by position (for a Series, df['user_id'][0] also works)
show = df.iloc[0, 0:2]           # (row, col): first row, columns 0 to 2
show = s.loc['item_id']          # select by index label
show = df.sample(frac=0.5)       # random sample by fraction
show = df.sample(n=len(df))      # random sample by count

# --- Data cleaning ---
show = df.isnull()               # null values in the object; returns booleans
show = df.notnull()              # non-null values in the object; returns booleans
show = df.dropna(axis=0)         # drop all rows that contain null values
show = df.dropna(axis=1)         # drop all columns that contain null values
show = df.dropna(axis=0, thresh=2)   # drop rows with fewer than thresh non-null values
show = df.fillna('product ID')   # fill all null values; target specific columns with df[col].fillna()
show = df.fillna(df.mode().iloc[0])  # fill with the mode
show = df.fillna(df.median())    # fill with the median
show = df['item_id'].fillna('fill')     # fill a single column
show = df['item_id'].astype(float)      # change a column's data type
show = df['item_id'].replace(1, 'one')  # replace all values equal to 1 with 'one'
show = df.rename(columns=lambda x: x + '1')  # rename all columns in bulk
show = df.rename(index=lambda x: x + '1')    # rename all index labels in bulk
show = df.rename(columns={'item_id': 'commodity ID'})  # rename selected columns
show = df.set_index('item_id', append=False)  # turn a column into the index; append=True keeps the old index

df1 = df.sample(n=1)
df2 = df.sample(frac=0.5)
df3 = pd.concat([df1, df2])          # concatenate two data sets
show = df3.reset_index(drop=False)   # reset the index; drop=True discards the previous index

# --- Data processing ---
show = df[df['test_id'] > 4]         # select rows where column 'test_id' > 4
show = df.sort_values(by='test_id', ascending=True)  # sort by 'test_id', ascending by default
show = df.sort_values(by=['test_id', 'item_id'],
                      ascending=[True, False])       # 'test_id' ascending, then 'item_id' descending
show = df.groupby('user_id')['test_id'].apply(np.mean)  # group by 'user_id', then average 'test_id'
show = df.pivot_table(index='user_id', values=['item_id', 'test_id'],
                      aggfunc=min)   # group by index and take the minimum of each group
# Group first, then aggregate the 'test_id' field; as_index defaults to using the grouping field as index
show = df.groupby('user_id', as_index=False)['test_id'].agg(
    {'product type': 'count', 'total merchandise': 'sum'})
show = df['test_id'].apply(np.mean)  # apply the function np.mean to a column
show = df.apply(np.max, axis=1)      # apply np.max across each row (requires compatible types)

for index, row in df.iterrows():     # index is the label, row is each row; print selected fields
    print(index, row['user_id'], row['test_id'])

for key, group in df.groupby('user_id'):  # key is the 'user_id' value, group is the grouped data
    print(key, len(group), group)

# Build a grouping feature: group by 'user_id' and compute rank on the Series
# rank: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.rank.html
show = df.groupby('user_id', as_index=False)['test_id'].rank(ascending=False, method='average')

# --- Merging data ---
df = df1.append(df2)                 # append the rows of df2 to the end of df1
df = pd.concat([df1, df2], axis=1)   # merge by column (axis=1) or by row (axis=0);
                                     # the corresponding rows/columns must match
df = pd.merge(df1, df2, on='user_id', how='inner')  # SQL-style join of df1 and df2 on a column

# --- Set difference ---
df1 = pd.DataFrame({'user': [1, 2, 3, 4], 'id': [101, 102, 103, 104]})
df2 = pd.DataFrame({'user': [1, 2]})
df2['flag'] = 1                      # add a flag column
df = pd.merge(df1, df2, on='user', how='left')   # left join
df = df[df.flag.isnull()].drop('flag', axis=1)   # keep rows of df1 absent from df2; drop 'flag'

# --- Statistics ---
show = df.mean()                 # mean of every column
show = df.corr()                 # column-to-column correlation matrix
show = df.user_id.corr(df.test_id)
show = df.count()                # number of non-null values in each column
show = df.max()                  # maximum of every column
show = df.min()                  # minimum of every column
show = df.median()               # median of every column
show = df.std()                  # standard deviation of every column
show = df.dtypes                 # data types of every column
show = df.isnull().sum()         # number of null values in each column

# --- Plotting ---
zhfont = matplotlib.font_manager.FontProperties(
    fname=r'C:/WINDOWS/FONTS/FZYTK.TTF')  # Chinese font

# Histogram: view a variable's distribution
hist = df['user_id'].hist()
hist.plot()

# Line chart
s = df['test_id']
s.plot()
df.plot()

# Bar chart
fig, axes = plt.subplots(2, 1)
s = df['test_id']
s.plot(kind='bar', ax=axes[0], color='k', alpha=0.7)
s.plot(kind='barh', ax=axes[1], color='k', alpha=0.7)
plt.legend(prop=zhfont)
plt.show()

Resources:

http://pandas.pydata.org/pandas-docs/stable/api.html#binary-operator-functions
54410670

Python Pandas Learning

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.