This script reads SQL Server, just given the table name or view name, and if there is data, it will output each data distribution map that meets the requirements for each field.
#-*-coding:utf-8-*-#python 3.5.0#Exploratory Analytics (exploratory data Analysis,eda)__author__='HZC'ImportMathImportSQLAlchemyImportNumPy as NPImportPandas as PDImportMatplotlib.pyplot as PltclassEDA:def __init__(self,d): Self.engine= Sqlalchemy.create_engine ("Mssql+pymssql://%s:%[email protected]%s/%s"% (d['User'],d['pwd'],d['ins'],d['DB'])) defget_df_from_table (self,table_name): DF=pd.read_sql_table (table_name, self.engine)returnDFdefget_df_from_query (self,sql): DF=pd.read_sql_query (SQL, Self.engine)returnDF#Read the table field data types defGet_table_type (self,table_name): SQL="""Select C.name as colname,t.name as TypeName from Sys.sysobjects o inner joins syscolumns C on O.id=c.id and o.na Me<> ' dtproperties ' inner join sys.systypes t on C.xusertype=t.xusertype where o.name= '%s '"""%table_name DF=self.get_df_from_query (SQL)returnDF#Drawing defEda_plot (self,table_name): List_char= ['Char','nchar','varchar','nvarchar','text','ntext','sysname'] List_num= ['tinyint','smallint','int','Real',' Money','float','decimal','Numeric','smallmoney','bigint'] Df_type=Self.get_table_type (table_name) df_date=self.get_df_from_table (table_name) Date_count=Df_date.shape[0] k=0 forRowinchDf_type.itertuples (): K= k + 1#character types, drawing histogram ifRow.typenameinchList_char:col= Df_date.groupby ([Row.colname]). Agg ({row.colname:['Count']}) Row_count=Col.shape[0]#col_count = col.shape[1]Col =Col.sort_index () Val=col.values.tolist ()#only the number of repetitions is less than 5% ifMath.floor (Row_count*100/date_count) <5: Df_= PD. DataFrame (Col.index.values.tolist (), columns=[Row.colname]) df_['Count'] = List (i[0] forIinchval) x_axle=Range (len (df_[row.colname)) Y_axle= df_['Count'].tolist () X_label=df_[row.colname].tolist () FIG, ax=plt.subplots () Ax.bar (x_axle,y_axle) ax.set_xticks (x_axle) A X.set_xticklabels (X_label) ax.set_title ('table [%s]%s distribution'%(table_name,row.colname))#numeric type, other distribution map elifRow.typenameinchlist_num:df__=PD. DataFrame (Df_date[row.colname]) df__= df__[(Df__[row.colname].notnull ())].sort_values (Row.colname, ascending=true). Reset_index (drop=True) K= k + 1Plt.figure (k) Plt.subplot (1,3,1) plt.hist (Df__[row.colname]) Plt.subplot (1,3,2) Plt.boxplot (Df__[row.colname]) PLT.GCA (). Set_title ('table [%s]%s distribution'%(Table_name,row.colname)) Plt.subplot (1,3,3) Plt.violinplot (Df__[row.colname]) plt.tight_layout ()Else: Passplt.show ()if __name__=="__main__": Conn= {'User':'KK','pwd':'KK','ins':'HZC','DB':'Demo'} EDA=EDA (conn) Eda.eda_plot ("V_clientinfoall")
The display graph is divided into character (discrete) and numeric (continuous), the example results are as follows:
Python Exploratory Analytics (exploratory data Analysis,eda)