import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
Load the data and view its contents:
# Read the spreadsheet into a DataFrame; skiprows=1 drops the first sheet row
# before parsing (presumably an extra header line above the real column names
# -- confirm against the file).
df = pd.read_excel(
    io="data/default-of-credit-card clients.xls",
    skiprows=1,
)
# Preview the first ten rows (rendered as cell output when run in a notebook).
df.head(n=10)
Split the data set into 80% training and 20% testing portions:
# Every column except the last is a feature; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both portions, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

# Wrap each portion, together with its labels, in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
Train the model using a binary classification algorithm:
# Booster hyper-parameters.
# - Labels are already attached to `dtrain`, so no "label" entry belongs here
#   (XGBoost does not recognise such a parameter and warns about it).
# - "verbosity": 0 replaces the deprecated "silent": 1 flag and keeps training quiet.
param = {
    "max_depth": 2,
    "eta": 0.3,
    "verbosity": 0,
    "booster": "gbtree",
    "objective": "binary:logistic",
}
num_round = 20  # number of boosting rounds passed to xgb.train
bst = xgb.train(param, dtrain, num_round)
Serialize the trained model to a file for later use:
# Persist the trained booster to "xgb.model" in the current working directory.
bst.save_model(fname="xgb.model")
Generate model predictions on the test data and calculate the macro-averaged precision of the model:
# With the "binary:logistic" objective, predict() returns one probability of
# the positive class per row (a 1-D array), not a per-class score vector.
preds = bst.predict(dtest)
# Turn probabilities into hard 0/1 labels by thresholding at 0.5. Taking
# np.argmax of each scalar probability would always give 0, i.e. it would
# predict the negative class for every row.
best_preds = (np.asarray(preds) > 0.5).astype(int)
# Macro-averaged precision over both classes; zero_division=0 scores a class
# with no predicted samples as 0 instead of raising a warning.
print(precision_score(y_test, best_preds, average='macro', zero_division=0))
Let's generate a prediction for an account that we know has a good payment history:
# One account as a single-row (1, 24) array; the values follow the order of
# the feature columns of df (every column except the last).
input_data = np.array([[
    35, 500000, 1, 1, 1, 58,
    -2, -2, -2, -2, -2, -2,
    13709, 5006, 31130, 3180, 0, 5293,
    5006, 31178, 3180, 0, 5293, 768,
]])
# Label the columns like the training features before wrapping in a DMatrix.
input_df = pd.DataFrame(data=input_data, columns=df.columns[:-1])
dinput = xgb.DMatrix(data=input_df)
# Model output for this one row.
bst.predict(dinput)[0]
Let's generate a prediction for an account that we know defaults on their payment:
# One account as a single-row (1, 24) array; the values follow the order of
# the feature columns of df (every column except the last).
input_data = np.array([[
    1, 20000, 2, 2, 1, 24,
    2, 2, -1, -1, -2, -2,
    3913, 3102, 689, 0, 0, 0,
    0, 689, 0, 0, 0, 0,
]])
# Label the columns like the training features before wrapping in a DMatrix.
input_df = pd.DataFrame(data=input_data, columns=df.columns[:-1])
dinput = xgb.DMatrix(data=input_df)
# Model output for this one row.
bst.predict(dinput)[0]