Kaggle

Code
import pandas as pdimport numpy as npimport matplotlib.pyplot as pltimport joblibimport shapimport timeimport shapfrom datetime import datefrom dateutil.relativedelta import relativedeltafrom sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCVfrom sklearn.compose import ColumnTransformerfrom sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, LabelBinarizerfrom sklearn.pipeline import Pipelinefrom sklearn.tree import DecisionTreeClassifierfrom xgboost import XGBClassifierfrom xgboost import DMatrixfrom sklearn.ensemble import RandomForestClassifierfrom sklearn.metrics import classification_report
Code
class Config:
    """Central notebook configuration: file locations, random seed and feature groups."""

    def __init__(self):
        """Populate the paths, the random seed and the feature column lists."""
        # Output directory and input CSV location.
        self.path = "/kaggle/working/"
        self.file = "/kaggle/input/credit-scoring/credit_scoring.csv"
        # Model artefacts are written to the same directory as `path`.
        self.model_result_path = self.path
        self.random_state = 42

        # Column groups; `features` concatenates them in binary, ordinal,
        # numeric order.
        self.col_binary = ['code_gender', 'flag_own_car', 'flag_own_realty']
        self.col_ordinal = [
            "name_income_type",
            "name_education_type",
            "name_family_status",
            "name_housing_type",
            "occupation_type",
        ]
        self.col_numeric = ['cnt_children', 'amt_income_total', 'cnt_fam_members']
        self.features = self.col_binary + self.col_ordinal + self.col_numeric


config = Config()

Import

Code
# Load the raw CSV, lower-case every column name and store `id` as an integer.
df = pd.read_csv(config.file).rename(columns=str.lower)
df["id"] = df["id"].astype(int)
Code
# Drop the record with id 6392180 and every row that has no occupation type.
keep_rows = (df["id"] != 6392180) & df["occupation_type"].notna()
df = df[keep_rows]

Split

Code
# Separate features from the target, then hold out 20% of the rows for testing.
# `stratify=y` keeps the target class proportions the same in both parts.
x, y = df[config.features], df.target
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=config.random_state
)

Decision Tree Classifier

Code
# Per-group transformers: one-hot for the binary columns, ordinal codes for the
# categorical columns, standardisation for the numeric columns.
ohe = OneHotEncoder(drop='if_binary', sparse_output=False)
oe = OrdinalEncoder().set_output(transform='pandas')
ss = StandardScaler()
dtc = DecisionTreeClassifier(class_weight="balanced", random_state=config.random_state)

preprocessor = ColumnTransformer(
    transformers=[
        ("binary_encoder", ohe, config.col_binary),
        ("ordinal_encoder", oe, config.col_ordinal),
        ("standard_scaler", ss, config.col_numeric),
    ],
    remainder='passthrough',
)

pipe_dtc = Pipeline(steps=[("preprocessor", preprocessor), ("dtc", dtc)])

# Hyper-parameter grid for the decision tree step of the pipeline.
param_grid_dtc = {
    "dtc__criterion": ["gini", "entropy", "log_loss"],
    "dtc__splitter": ["best", "random"],
    "dtc__max_depth": [1, 2, 5, 10, 15],
}

# Every candidate is scored on all three metrics.
scoring = {metric: metric for metric in ("accuracy", "f1", "roc_auc")}

grid_search_dtc = GridSearchCV(
    estimator=pipe_dtc,
    param_grid=param_grid_dtc,
    cv=7,
    scoring=scoring,
    refit="f1",  # the best candidate is chosen and refitted by its f1 score
    n_jobs=-1,
)

Fit

Code
# Run the decision-tree grid search on the training split.
gs_dtc = grid_search_dtc.fit(X=xtrain, y=ytrain)

Pickle

Code
# Persist the fitted decision-tree grid search next to the other model artefacts.
dtc_pickle_path = config.model_result_path + 'credit_score_grid_search_dtc_f1.pkl'
joblib.dump(gs_dtc, dtc_pickle_path)
['/kaggle/working/credit_score_grid_search_dtc_f1.pkl']
Code
# Reload the saved grid search and keep its best (refitted) pipeline.
dtc_pickle_path = config.model_result_path + 'credit_score_grid_search_dtc_f1.pkl'
gs_dtc = joblib.load(dtc_pickle_path)
dtc_model = gs_dtc.best_estimator_

Predict & Classification Report

Code
# Predict on the held-out split and tabulate the per-class metrics.
ypred_dtc = dtc_model.predict(xtest)
cr = classification_report(
    y_true=ytest,
    y_pred=ypred_dtc,
    target_names=['0', '1'],
    digits=4,
    output_dict=True,
)
df_cr = pd.DataFrame.from_dict(cr).reset_index()
df_cr
index 0 1 accuracy macro avg weighted avg
0 precision 0.989836 0.303270 0.763001 0.646553 0.916371
1 recall 0.742224 0.936394 0.763001 0.839309 0.763001
2 f1-score 0.848331 0.458157 0.763001 0.653244 0.806581
3 support 41986.000000 5031.000000 0.763001 47017.000000 47017.000000

Random Forest Classifier

Code
# Random forest with balanced class weights, placed behind the shared preprocessor.
rfc = RandomForestClassifier(class_weight="balanced", random_state=config.random_state)
pipe_rfc = Pipeline(steps=[("preprocessor", preprocessor), ("rfc", rfc)])

# Hyper-parameter grid for the random forest step.
param_dist_rfc = {
    'rfc__n_estimators': [50, 75, 100],
    'rfc__max_depth': [5, 10, 15],
    'rfc__min_samples_split': [5, 10],
}

# Exhaustive 7-fold search; the best candidate by f1 is refitted.
grid_search_rfc = GridSearchCV(
    estimator=pipe_rfc,
    param_grid=param_dist_rfc,
    cv=7,
    scoring=scoring,
    refit="f1",
    n_jobs=-1,
)

Fit

Code
# Fit the random-forest grid search and report how long it took.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
rs_rfc = grid_search_rfc.fit(xtrain, ytrain)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print(f"Elapsed time: {elapsed_time / 60} minutes")
Elapsed time: 336.4672429561615 seconds
Elapsed time: 5.607787382602692 minutes

Pickle

Code
import pickle

# Persist the fitted random-forest grid search.
rfc_pickle_path = config.model_result_path + "credit_score_grid_search_rfc_f1.pkl"
with open(rfc_pickle_path, "wb") as out_file:
    pickle.dump(rs_rfc, out_file)
Code
# Reload the saved random-forest grid search and keep its best (refitted) pipeline.
rfc_pickle_path = config.model_result_path + "credit_score_grid_search_rfc_f1.pkl"
with open(rfc_pickle_path, "rb") as in_file:
    gs_rfc = pickle.load(in_file)
rfc_model = gs_rfc.best_estimator_

Predict

Code
# Predict on the held-out split and tabulate the per-class metrics.
ypred_rfc = rfc_model.predict(xtest)
cr_rfc = classification_report(
    y_true=ytest,
    y_pred=ypred_rfc,
    digits=4,
    output_dict=True,
)
df_cr_rfc = pd.DataFrame.from_dict(cr_rfc).reset_index()
df_cr_rfc
index 0 1 accuracy macro avg weighted avg
0 precision 0.992212 0.326011 0.784503 0.659112 0.920926
1 recall 0.764683 0.949911 0.784503 0.857297 0.784503
2 f1-score 0.863715 0.485424 0.784503 0.674569 0.823236
3 support 41986.000000 5031.000000 0.784503 47017.000000 47017.000000

XGBoost Classifier

Pipeline

Code
# classifier
# Weight the positive class by the negative/positive ratio of the TRAINING
# labels only, so the held-out labels play no part in configuring the model.
scale_pos_weight = len(ytrain[ytrain == 0]) / len(ytrain[ytrain == 1])
xgb = XGBClassifier(
    n_jobs=-1,
    enable_categorical=True,
    scale_pos_weight=scale_pos_weight,
    random_state=config.random_state,
)

# pipeline
pipe_xgb = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("xgb", xgb),
    ]
)

# parameters
param_dist_xgb = {
    "xgb__n_estimators": [100, 150, 200, 300],
    "xgb__max_depth": [3, 5, 7, 10],
    "xgb__learning_rate": [0.1, 0.01, 0.001, 0.0001],
    "xgb__subsample": [0.7, 0.8, 0.9],
    "xgb__colsample_bytree": [0.7, 0.8, 0.9],
    "xgb__gamma": [0, 0.1],
    "xgb__alpha": [0, 0.1],  # Adding slight L1 regularization for simplicity
    "xgb__lambda": [1, 2],  # Adding slight L2 regularization for stability
}

# scoring
scoring = {
    "accuracy": "accuracy",
    "f1": "f1",
    "roc_auc": "roc_auc",
}

# random search cv
# random_state fixes which 30 parameter combinations are sampled, so the
# search (and therefore the selected model) is reproducible between runs.
random_search_xgb = RandomizedSearchCV(
    estimator=pipe_xgb,
    param_distributions=param_dist_xgb,
    n_iter=30,  # Set the number of parameter combinations to try
    cv=7,
    scoring=scoring,
    refit="f1",
    n_jobs=-1,
    random_state=config.random_state,
)

Fit

Code
# Fit the XGBoost randomized search and report how long it took.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
rs_xgb = random_search_xgb.fit(xtrain, ytrain)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print(f"Elapsed time: {elapsed_time / 60} minutes")
Elapsed time: 255.91241335868835 seconds
Elapsed time: 4.2652068893114725 minutes

Pickle

Code
# Persist the fitted XGBoost randomized search.
xgb_pickle_path = config.model_result_path + "credit_score_random_search_xgb_f1.pkl"
with open(xgb_pickle_path, "wb") as out_file:
    pickle.dump(rs_xgb, out_file)
Code
# Reload the saved randomized search; keep its best pipeline and the winning parameters.
xgb_pickle_path = config.model_result_path + "credit_score_random_search_xgb_f1.pkl"
with open(xgb_pickle_path, "rb") as in_file:
    rs_xgb = pickle.load(in_file)
xgb_model = rs_xgb.best_estimator_
xgb_params = rs_xgb.best_params_

Predict

Code
# Predict on the held-out split and tabulate the per-class metrics.
ypred_xgb = xgb_model.predict(xtest)
cr_xgb = classification_report(
    y_true=ytest,
    y_pred=ypred_xgb,
    digits=4,
    output_dict=True,
)
df_cr_xgb = pd.DataFrame.from_dict(cr_xgb).reset_index()
df_cr_xgb
index 0 1 accuracy macro avg weighted avg
0 precision 0.991184 0.318273 0.777676 0.654728 0.919180
1 recall 0.757776 0.943749 0.777676 0.850763 0.777676
2 f1-score 0.858905 0.476014 0.777676 0.667460 0.817934
3 support 41986.000000 5031.000000 0.777676 47017.000000 47017.000000

Plot

Code
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_scorefrom sklearn.model_selection import cross_val_scorefrom scipy.stats import ttest_rel, wilcoxonimport matplotlib.pyplot as plt
Code
# Overlay the test-set ROC curve of each tuned model on one figure.
plt.figure(figsize=(20, 15))
model_names = ["dtc", "rfc", "xgb"]
models = [dtc_model, rfc_model, xgb_model]

for model_name, model in zip(model_names, models):
    # Score the test split once and reuse the positive-class probabilities
    # for both the curve and its AUC.
    proba_positive = model.predict_proba(xtest)[:, 1]
    fpr, tpr, _ = roc_curve(ytest, proba_positive)
    auc = roc_auc_score(ytest, proba_positive)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC: {auc:.2f})')

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves Comparison")
plt.legend()
plt.show()
plt.close()

Explainability

Code
# Pull the two steps out of the best XGBoost pipeline; note that this rebinds
# `preprocessor` to the pipeline's own preprocessing step.
preprocessor, xgb_clf = (
    xgb_model.named_steps[step_name] for step_name in ("preprocessor", "xgb")
)
# Row positions of the samples inspected below.
idx = [15454, 1284, 30305]
Code
# Show the raw (unprocessed) feature values of the first selected test row.
first_position = idx[0]
xtest.iloc[first_position]
code_gender                                        M
flag_own_car                                       Y
flag_own_realty                                    Y
name_income_type                             Working
name_education_type    Secondary / secondary special
name_family_status                           Married
name_housing_type                    Co-op apartment
occupation_type                             Laborers
cnt_children                                       1
amt_income_total                            216000.0
cnt_fam_members                                  3.0
Name: 226874, dtype: object
Code
# Transform both splits and wrap the results in DataFrames labelled with the
# original feature names.
xtrain_processed = pd.DataFrame(preprocessor.transform(xtrain), columns=config.features)
xtest_processed = pd.DataFrame(preprocessor.transform(xtest), columns=config.features)

# Give the booster the same feature names.
xgb_clf.get_booster().feature_names = config.features

# Convert to DMatrix.
dtrain = DMatrix(xtrain_processed, label=ytrain)
dtest = DMatrix(xtest_processed, label=ytest)

# Useful row positions, plus the transformed full feature matrix.
idx = [15454, 1284, 30305]
x_processed = preprocessor.transform(x)

## Feature importance

Code
from xgboost import plot_importance

# Plot the booster's gain-based feature importance, save it, then display it.
booster = xgb_clf.get_booster()
plot_importance(
    booster=booster,
    importance_type="gain",
    title="Feature Importance by Gain",
    values_format="{v:.2f}",
    grid=False,
)
plt.savefig("plot_importance.jpeg", dpi=200)
plt.show()
plt.close()

Explain predictions

# Compute SHAP values for the held-out rows. The positions in `idx` are looked
# up with `xtest.iloc`, so the explained matrix must be the processed test
# split; explaining the full `x` would make `shap_values[idx[0]]` describe a
# different row than the one displayed from `xtest`.
explainer = shap.TreeExplainer(xgb_clf)
shap_values = explainer.shap_values(xtest_processed)

# Persist the explainer and the SHAP values so they can be reloaded without
# recomputing. Plain ASCII quotes are required: typographic quotes are a
# syntax error in Python.
with open(config.model_result_path + "explainer.pkl", "wb") as f:
    pickle.dump(explainer, f)
with open(config.model_result_path + "shap_values.pkl", "wb") as f:
    pickle.dump(shap_values, f)

Code
# Reload the previously saved explainer and SHAP values.
explainer_path = config.model_result_path + "explainer.pkl"
shap_values_path = config.model_result_path + "shap_values.pkl"

with open(explainer_path, "rb") as in_file:
    explainer = pickle.load(in_file)
with open(shap_values_path, "rb") as in_file:
    shap_values = pickle.load(in_file)

Force Plot

Code
# Force plot for the first selected sample.
# The matplotlib force plot draws on its own figure, so a separate
# plt.figure() call only produces an empty figure. The size is therefore
# passed through `figsize`, and show=False keeps the figure alive so that it
# can be written to disk before it is displayed.
plot_force = shap.plots.force(
    base_value=explainer.expected_value,
    shap_values=shap_values[idx[0], :],
    feature_names=config.features,
    plot_cmap='RdBu',
    matplotlib=True,
    show=False,
    figsize=(10, 8),
)
plt.savefig("plot_force.jpeg", dpi=200)
plt.show()
plt.close()
<Figure size 1000x800 with 0 Axes>

Code
# Build an Explanation object for the waterfall and bar plots.
# Call the explainer created here (`explainer_`), not the earlier `explainer`.
# Explain the processed test split, because the positions in `idx` are looked
# up with `xtest.iloc`.
explainer_ = shap.Explainer(model=xgb_clf)
shap_values_ = explainer_(xtest_processed)
shap_values_.feature_names = config.features
Code
# Waterfall plot for the first selected sample.
# show=False stops shap from rendering the figure immediately; otherwise
# savefig would run after the figure has been shown and write a blank image.
shap.plots.waterfall(
    shap_values_[idx[0]],
    max_display=len(config.features),
    show=False,
)
plt.savefig("plot_waterfall.jpeg", dpi=200)
plt.show()
plt.close()

Code
# Bar plot of the SHAP contributions for the first selected sample.
# show=False stops shap from rendering the figure immediately; otherwise
# savefig would run after the figure has been shown and write a blank image.
shap.plots.bar(
    shap_values_[idx[0]],
    max_display=len(config.features),
    show=False,
)
plt.savefig("plot_bar.jpeg", dpi=200)
plt.show()
plt.close()

Code