Taking the German credit data as an example, this article uses the logistic regression algorithm to illustrate the principle behind a credit scorecard. Because the focus is on the principle, feature selection is not considered.
First step: Import the necessary libraries
import numpy as np
import pandas as pd

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the old module
    from sklearn.cross_validation import train_test_split
Step Two: Import data
# Load the space-separated German credit data; the file has no header row,
# so the column names are assigned explicitly below.
german = pd.read_csv('d:/creditdatasets/german.data', sep=' ', header=None)
german.columns = [
    'status_of_existing_checking_account',
    'duration_in_month',
    'credit_history',
    'purpose',
    'credit_amount',
    'savings_account',
    'present_employment_since',
    'installment_rate',
    'personal_status_and_sex',
    'other_debtors',
    'present_residence_since',
    'property',
    'age',
    'other_installment_plans',
    'housing',
    'number_of_existing_credits',
    'job',
    'number_of_people',
    'telephone',
    'foreign_worker',
    'default',
]

# Overall class sizes, used as denominators in every WOE computation.
# In the 'default' column the label 1 is counted as "good" and 2 as "bad".
grp = german.groupby('default')
class_sizes = grp.size()
total_good = class_sizes[1]
total_bad = class_sizes[2]
The third step: calculate the WOE (weight of evidence) values of the nominal variables and the numerical variables separately. Numerical variables that take only a few distinct values are handled with the same WOE calculation as the nominal variables; the remaining numerical variables are first split into 5 equal-width bins.
def _woe_iv(subset):
    """Return ``(woe, iv)`` for one group of rows taken from ``german``.

    ``subset`` is a slice of the ``german`` DataFrame.  The module-level
    ``total_good`` / ``total_bad`` counts are used as denominators.
    """
    counts = subset.groupby('default').size()
    # Label 1 is "good", label 2 is "bad".  A group may contain only one of
    # the two labels, so look the counts up with a default instead of indexing.
    good = counts.get(1, 0)
    bad = counts.get(2, 0)
    if good == 0 or bad == 0:
        # log(0) and division by zero are undefined; apply a 0.5 continuity
        # correction so the group still gets a finite WOE.
        good += 0.5
        bad += 0.5
    good_ratio = float(good) / total_good
    bad_ratio = float(bad) / total_bad
    woe = np.log(bad_ratio / good_ratio)
    iv = (bad_ratio - good_ratio) * woe
    return woe, iv


def calcwoe(varname):
    """Compute WOE and IV for every distinct value of column ``varname``.

    Returns a DataFrame with one row per distinct value (in sorted order) and
    the columns ``variable``, ``class``, ``woe`` and ``iv``.
    """
    rows = []
    for value in np.unique(german[varname]):
        woe, iv = _woe_iv(german[german[varname] == value])
        rows.append([varname, value, woe, iv])
    # Build the frame once; DataFrame.append is not available in pandas 2.
    return pd.DataFrame(rows, columns=['variable', 'class', 'woe', 'iv'])


def calcwoe_bin(varname, n):
    """Compute WOE and IV for column ``varname`` split into ``n`` equal-width bins.

    Returns a DataFrame with one row per bin and the columns ``variable``,
    ``class+woe`` (a list ``[lower, upper, woe]``), ``woe`` and ``iv``.
    The first bin is closed on both sides so that the minimum value is
    counted; every other bin is ``(lower, upper]``.
    """
    values = german[varname]
    min_value = values.min()
    max_value = values.max()
    width = float(max_value - min_value) / n
    # Use the exact maximum as the last edge so floating-point rounding cannot
    # leave the largest observation outside every bin.
    edges = [min_value + k * width for k in range(n)] + [max_value]

    rows = []
    for i in range(n):
        lower, upper = edges[i], edges[i + 1]
        if i == 0:
            in_bin = (values >= lower) & (values <= upper)
        else:
            in_bin = (values > lower) & (values <= upper)
        woe, iv = _woe_iv(german[in_bin])
        rows.append([varname, [lower, upper, woe], woe, iv])
    return pd.DataFrame(rows, columns=['variable', 'class+woe', 'woe', 'iv'])


# WOE of the nominal variables
status_checking_account_woe = calcwoe('status_of_existing_checking_account')
credit_history_woe = calcwoe('credit_history')
purpose_woe = calcwoe('purpose')
savings_account_woe = calcwoe('savings_account')
present_employment_since_woe = calcwoe('present_employment_since')
personal_status_and_sex_woe = calcwoe('personal_status_and_sex')
other_debtors_woe = calcwoe('other_debtors')
property_woe = calcwoe('property')
other_installment_plans_woe = calcwoe('other_installment_plans')
housing_woe = calcwoe('housing')
job_woe = calcwoe('job')
telephone_woe = calcwoe('telephone')
foreign_worker_woe = calcwoe('foreign_worker')

# WOE of the numeric variables with few distinct values, no binning
installment_rate_woe = calcwoe('installment_rate')
present_residence_since_woe = calcwoe('present_residence_since')
number_of_existing_credits_woe = calcwoe('number_of_existing_credits')
number_of_people_woe = calcwoe('number_of_people')

# WOE of the remaining numeric variables, 5 equal-width bins each
duration_in_month_woe = calcwoe_bin('duration_in_month', 5)
credit_amount_woe = calcwoe_bin('credit_amount', 5)
age_woe = calcwoe_bin('age', 5)
Fourth step: Replace the original value with the woe value
def replacewoe(varname, sourcedf, varwoe):
    """Replace each value of ``sourcedf[varname]`` with its WOE.

    ``varwoe`` is a WOE table with ``class`` and ``woe`` columns.  The column
    is overwritten in place and ``sourcedf`` is returned.
    """
    # Pair each class with the WOE on the same row instead of relying on
    # dict iteration order matching the row order.
    woe_by_class = dict(zip(varwoe['class'], varwoe['woe']))
    sourcedf[varname] = sourcedf[varname].map(woe_by_class)
    return sourcedf


def replacewoe_bin(varname, sourcedf, varwoe):
    """Replace each value of ``sourcedf[varname]`` with the WOE of its bin.

    ``varwoe`` is a WOE table whose ``class+woe`` column holds
    ``[lower, upper, woe]`` lists.  The column is overwritten in place and
    ``sourcedf`` is returned.
    """
    bins = list(varwoe['class+woe'])
    smallest = sourcedf[varname].min()
    woe_by_value = {}
    for value in np.unique(sourcedf[varname]):
        if value == smallest:
            # The minimum sits on the lower edge of the first bin, which the
            # half-open test below would miss.
            woe_by_value[value] = bins[0][2]
            continue
        for lower, upper, woe in bins:
            if lower < value <= upper:
                woe_by_value[value] = woe
                break
        else:
            # Rounding of the bin edges can leave the largest values just
            # above the last upper edge; they belong to the last bin.
            if value > bins[-1][1]:
                woe_by_value[value] = bins[-1][2]
    sourcedf[varname] = sourcedf[varname].map(woe_by_value)
    return sourcedf


# Work on a copy so the raw data in ``german`` is not overwritten.
german_woe = german.copy()

value_woe_tables = [
    ('status_of_existing_checking_account', status_checking_account_woe),
    ('credit_history', credit_history_woe),
    ('purpose', purpose_woe),
    ('savings_account', savings_account_woe),
    ('present_employment_since', present_employment_since_woe),
    ('personal_status_and_sex', personal_status_and_sex_woe),
    ('other_debtors', other_debtors_woe),
    ('property', property_woe),
    ('other_installment_plans', other_installment_plans_woe),
    ('housing', housing_woe),
    ('job', job_woe),
    ('telephone', telephone_woe),
    ('foreign_worker', foreign_worker_woe),
    ('installment_rate', installment_rate_woe),
    ('present_residence_since', present_residence_since_woe),
    ('number_of_existing_credits', number_of_existing_credits_woe),
    ('number_of_people', number_of_people_woe),
]
for column, woe_table in value_woe_tables:
    german_woe = replacewoe(column, german_woe, woe_table)

binned_woe_tables = [
    ('duration_in_month', duration_in_month_woe),
    ('credit_amount', credit_amount_woe),
    ('age', age_woe),
]
for column, woe_table in binned_woe_tables:
    german_woe = replacewoe_bin(column, german_woe, woe_table)

# Later steps read the WOE-encoded data through these names.
temp = temp1 = german_woe
Fifth step: Split the dataset into training and test sets
# Every column except the last one ('default') is a feature.
x = temp1[list(temp1.columns)[:-1]]
# Shift the 1/2 labels of 'default' down to 0/1 for the classifier.
y = temp1['default'] - 1
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=0)
Sixth step: Applying the logistic regression algorithm to the training set
# Import from the public package; the private ``sklearn.linear_model.logistic``
# module is not available in current scikit-learn releases.
from sklearn.linear_model import LogisticRegression

# Fit on the training set, then predict the labels of the held-out test set.
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)
Seventh Step: Evaluate model classification accuracy
from sklearn.metrics import accuracy_score

try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # older scikit-learn releases only ship the old module
    from sklearn.cross_validation import cross_val_score

# Accuracy on the held-out test set.
accuracy = accuracy_score(y_test, predictions)
# print('accuracy:', accuracy)

# 5-fold cross-validation scores on the training set.
scores = cross_val_score(classifier, x_train, y_train, cv=5)
# print(np.mean(scores), scores)
Eighth step: Create a scorecard
# score = a - b*log(theta), where log(theta) is the linear predictor of the
# fitted logistic model (intercept + sum of coef * woe).
# p0 = a - b*log(theta0), p0 - pdo = a - b*log(2*theta0)
# i.e. the score is p0 at theta0 and drops by pdo each time theta doubles.
p0 = 600
pdo = 20
theta0 = 1.0 / 60
b = pdo / np.log(2)
a = p0 + b * np.log(theta0)

coef = classifier.coef_
# intercept_ is a 1-element array; take the scalar so the arithmetic below
# does not depend on array/Series broadcasting.
beta0 = classifier.intercept_[0]

# WOE tables in the same order as the feature columns of the training data,
# so that position j lines up with coef[0][j].
woe_tables = [
    status_checking_account_woe,
    duration_in_month_woe,
    credit_history_woe,
    purpose_woe,
    credit_amount_woe,
    savings_account_woe,
    present_employment_since_woe,
    installment_rate_woe,
    personal_status_and_sex_woe,
    other_debtors_woe,
    present_residence_since_woe,
    property_woe,
    age_woe,
    other_installment_plans_woe,
    housing_woe,
    number_of_existing_credits_woe,
    job_woe,
    number_of_people_woe,
    telephone_woe,
    foreign_worker_woe,
]

# The constant part of the score is shared equally among all variables, so
# the per-variable scores of one applicant add up to the total score.
base_points = (a - b * beta0) / len(woe_tables)
for j, woe_table in enumerate(woe_tables):
    woe_table['score'] = base_points - b * coef[0][j] * woe_table['woe']
This is my first time using Python, so please don't hesitate to point out any mistakes!
This article is from the "Senior Learning Corner" blog; please be sure to keep this source link: http://4292565.blog.51cto.com/4282565/1861560
Practice of the logistic regression algorithm for a scorecard based on German credit data